diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d019d937e36b9f90f4ef838fc8f01c17d1746ad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b9dbd6da26402b76b495ee26389fdafeb63b201 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc24f1c4f1696ebc0908b6cc043db79a5c52222c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..483bf08614088b6cd4d06b5974e0cc7788c4a248 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile_types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8224610d66fd1c5004dfc7eadc82e27ee50ab21 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbd1ab5c9577c92db3aaafe816e32b77bec83b09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/codegen.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/codegen.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6174d5a00087c1655e8a73a1f66723bb5ea92cd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/codegen.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..bda9742a3b5977d27386d273a27da388828814d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/comptime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/comptime.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0c2043734d21e9627a895dfc5e3bbb22619d446 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/comptime.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/create_parameter_op.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/create_parameter_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9be1ba6f1270c476be449f498261329b3735b33e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/create_parameter_op.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/dce_extra_outputs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/dce_extra_outputs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c3bd69ca253be1166c547063ccc1e7c27070590 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/dce_extra_outputs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/decorators.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/decorators.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..378840fbffe2837421c1c2ec4369a2d66e85a224 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/decorators.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9111a0d62d5ea0f01e737607491357ba1bc76bbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/distributed.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/distributed.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..192d0e77aa3f30d9d109c4f05571fea9f3f48655 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/distributed.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/exc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/exc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8133a950f518dae3f2bbf10e80d143006666ca0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/exc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9809387cacd51acd44bea3c77f2144d21476a8af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_break_hints.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_break_hints.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d4adcbcfb4db85d82f9cd093430755405b8ac72 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_break_hints.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_region_tracker.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_region_tracker.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bf4dae6626ef438ba2b578c9dbc36f4a6b182b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_region_tracker.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/hooks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/hooks.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e630df369770af988af38199677d09f08e09ebe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/hooks.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/metrics_context.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/metrics_context.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f22e785e6e2d126d2e596e9ecbd0d9ce0bff6a2b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/metrics_context.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bca1967136deed7a85f9362845a92fac83e0b712 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/pgo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/pgo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25a00e7e65e45c0659fb28025ab8e06823b3a878 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/pgo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/precompile_context.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/precompile_context.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8effb2515a01e1602d83267b5c6e39a46ba2032 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/precompile_context.cpython-312.pyc 
differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/profiler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/profiler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..361ace1626099e497a41dbe61ee4df54c14f19e3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/profiler.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae40dc16ff7c3204adca1ec979b1c674448a6367 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6bf882ffa83a508d059f4af8c5b4d6e42706afb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb96db69290ba932955aad575c20279229f9442d Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/source.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/source.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d26020cb49577f2e62772e3f93c846d2da62fa8e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/source.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3307040faddcfe47c9035ac2039323681b3a65ed Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd392de1bb4e3db5fdb00f824484130d2c517a37 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d308b2b9c0e1ffe51b6dd11dcca0aad86a4b4cce Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56a22011447d2c103fb3614ef1d6e6e7ae1f973a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0b8fd1274ee33ddad10f03f118dd8088745cbfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36d491f75a164f1e7635a82ebf77f3512a5b7f36 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..f5a73c9fe94823366198b03f23fcb556cbebdb76 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5497c473c1cdf7c00db59ea77cb331eadbfc7853 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5943b219427149b89e50685aa25d6b39096743bd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e09f0b408a98b42e6e7ff3783db082542c5ee34 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/aoti.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/aoti.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e09c989ca10d3dc5cbd33381310d9819a671787 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/aoti.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5483b9d95ee87c3a349ec943b3cf6136600001c3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e71e5ef93742375e00e8b276f18e3993b269227d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80ba84a84251db6229c38b5f2c48b233fe594fbb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__init__.py @@ -0,0 +1,41 @@ +import types + +from .modules import * # noqa: F403 +from .modules.fused import _FusedModule # noqa: F403 + + +# # Subpackages +# from . import qat # noqa: F403 +# from . 
import quantized # noqa: F403 + +__all__ = [ + "ConvBn1d", + "ConvBn2d", + "ConvBn3d", + "ConvBnReLU1d", + "ConvBnReLU2d", + "ConvBnReLU3d", + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "LinearReLU", + "BNReLU2d", + "BNReLU3d", + "LinearBn1d", + "LinearLeakyReLU", + "LinearTanh", + "ConvAdd2d", + "ConvAddReLU2d", +] + + +# We are exposing all subpackages to the end-user. +# Because of possible inter-dependency, we want to avoid +# the cyclic imports, thus implementing lazy version +# as per https://peps.python.org/pep-0562/ +def __getattr__(name: str) -> types.ModuleType: + if name in __all__: + import importlib + + return importlib.import_module("." + name, __name__) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74572b52e820de65e63ae05585bfe1f41c34ded9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..132137b7357378fe29ef9a63310a554725aea86a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__init__.py @@ -0,0 +1,41 @@ +from .fused import ( # noqa: F401 + _FusedModule, + BNReLU2d, + BNReLU3d, + ConvAdd2d, + ConvAddReLU2d, + ConvBn1d, + ConvBn2d, + ConvBn3d, + ConvBnReLU1d, + ConvBnReLU2d, + ConvBnReLU3d, + ConvReLU1d, + ConvReLU2d, + ConvReLU3d, + LinearBn1d, + LinearLeakyReLU, 
+ LinearReLU, + LinearTanh, +) + + +__all__ = [ + "ConvBn1d", + "ConvBn2d", + "ConvBn3d", + "ConvBnReLU1d", + "ConvBnReLU2d", + "ConvBnReLU3d", + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "LinearReLU", + "BNReLU2d", + "BNReLU3d", + "LinearBn1d", + "LinearLeakyReLU", + "LinearTanh", + "ConvAdd2d", + "ConvAddReLU2d", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a82f8dab4a219e2ec35a172cd1a33815874becef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5acc7bfc26ef4b76adbdaa7f0aaad074f1dd5b5d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/fused.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/fused.py new file mode 100644 index 0000000000000000000000000000000000000000..d189e3d92447da930ba487034b58c623e2e7a4ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/fused.py @@ -0,0 +1,289 @@ +# mypy: allow-untyped-defs +import torch +from torch.nn import ( + BatchNorm1d, + BatchNorm2d, + BatchNorm3d, + Conv1d, + Conv2d, + Conv3d, + 
Linear, + ReLU, +) +from torch.nn.utils.parametrize import type_before_parametrizations + + +__all__ = [ + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "LinearReLU", + "ConvBn1d", + "ConvBn2d", + "ConvBnReLU1d", + "ConvBnReLU2d", + "ConvBn3d", + "ConvBnReLU3d", + "BNReLU2d", + "BNReLU3d", + "LinearBn1d", + "LinearLeakyReLU", + "LinearTanh", + "ConvAdd2d", + "ConvAddReLU2d", +] + + +# Used for identifying intrinsic modules used in quantization +class _FusedModule(torch.nn.Sequential): + pass + + +class ConvReLU1d(_FusedModule): + r"""This is a sequential container which calls the Conv1d and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, relu): + assert ( + type_before_parametrizations(conv) == Conv1d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(conv, relu) + + +class ConvReLU2d(_FusedModule): + r"""This is a sequential container which calls the Conv2d and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, relu): + assert ( + type_before_parametrizations(conv) == Conv2d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(conv, relu) + + +class ConvReLU3d(_FusedModule): + r"""This is a sequential container which calls the Conv3d and ReLU modules. 
+ During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, relu): + assert ( + type_before_parametrizations(conv) == Conv3d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(conv, relu) + + +class LinearReLU(_FusedModule): + r"""This is a sequential container which calls the Linear and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, linear, relu): + assert ( + type_before_parametrizations(linear) == Linear + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(linear)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(linear, relu) + + +class ConvBn1d(_FusedModule): + r"""This is a sequential container which calls the Conv 1d and Batch Norm 1d modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, bn): + assert ( + type_before_parametrizations(conv) == Conv1d + and type_before_parametrizations(bn) == BatchNorm1d + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(bn)}" + ) + super().__init__(conv, bn) + + +class ConvBn2d(_FusedModule): + r"""This is a sequential container which calls the Conv 2d and Batch Norm 2d modules. 
+ During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, bn): + assert ( + type_before_parametrizations(conv) == Conv2d + and type_before_parametrizations(bn) == BatchNorm2d + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(bn)}" + ) + super().__init__(conv, bn) + + +class ConvBnReLU1d(_FusedModule): + r"""This is a sequential container which calls the Conv 1d, Batch Norm 1d, and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, bn, relu): + assert ( + type_before_parametrizations(conv) == Conv1d + and type_before_parametrizations(bn) == BatchNorm1d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(bn)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(conv, bn, relu) + + +class ConvBnReLU2d(_FusedModule): + r"""This is a sequential container which calls the Conv 2d, Batch Norm 2d, and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, bn, relu): + assert ( + type_before_parametrizations(conv) == Conv2d + and type_before_parametrizations(bn) == BatchNorm2d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(bn)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(conv, bn, relu) + + +class ConvBn3d(_FusedModule): + r"""This is a sequential container which calls the Conv 3d and Batch Norm 3d modules. 
+ During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, bn): + assert ( + type_before_parametrizations(conv) == Conv3d + and type_before_parametrizations(bn) == BatchNorm3d + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(bn)}" + ) + super().__init__(conv, bn) + + +class ConvBnReLU3d(_FusedModule): + r"""This is a sequential container which calls the Conv 3d, Batch Norm 3d, and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, bn, relu): + assert ( + type_before_parametrizations(conv) == Conv3d + and type_before_parametrizations(bn) == BatchNorm3d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(conv)}" + f"{type_before_parametrizations(bn)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(conv, bn, relu) + + +class BNReLU2d(_FusedModule): + r"""This is a sequential container which calls the BatchNorm 2d and ReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, batch_norm, relu): + assert ( + type_before_parametrizations(batch_norm) == BatchNorm2d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(batch_norm)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(batch_norm, relu) + + +class BNReLU3d(_FusedModule): + r"""This is a sequential container which calls the BatchNorm 3d and ReLU modules. 
+ During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, batch_norm, relu): + assert ( + type_before_parametrizations(batch_norm) == BatchNorm3d + and type_before_parametrizations(relu) == ReLU + ), ( + f"Incorrect types for input modules{type_before_parametrizations(batch_norm)}" + f"{type_before_parametrizations(relu)}" + ) + super().__init__(batch_norm, relu) + + +class LinearBn1d(_FusedModule): + r"""This is a sequential container which calls the Linear and BatchNorm1d modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, linear, bn): + assert ( + type_before_parametrizations(linear) == Linear + and type_before_parametrizations(bn) == BatchNorm1d + ), ( + f"Incorrect types for input modules{type_before_parametrizations(linear)}" + f"{type_before_parametrizations(bn)}" + ) + super().__init__(linear, bn) + + +class LinearLeakyReLU(_FusedModule): + r"""This is a sequential container which calls the Linear and LeakyReLU modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, linear, leaky_relu): + assert type(linear) is Linear and type(leaky_relu) is torch.nn.LeakyReLU, ( + f"Incorrect types for input modules{type(linear)}{type(leaky_relu)}" + ) + super().__init__(linear, leaky_relu) + + +class LinearTanh(_FusedModule): + r"""This is a sequential container which calls the Linear and Tanh modules. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, linear, tanh): + assert type(linear) is Linear and type(tanh) is torch.nn.Tanh, ( + f"Incorrect types for input modules{type(linear)}{type(tanh)}" + ) + super().__init__(linear, tanh) + + +class ConvAdd2d(_FusedModule): + r"""This is a sequential container which calls the Conv2d modules with extra Add. 
+ During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, add): + super().__init__(conv) + self.add = add + + def forward(self, x1, x2): # type: ignore[override] + r"""Applies convolution to x1 and adds the result to x2.""" + return self.add(self[0](x1), x2) + + +class ConvAddReLU2d(_FusedModule): + r"""This is a sequential container which calls the Conv2d, add, Relu. + During quantization this will be replaced with the corresponding fused module.""" + + def __init__(self, conv, add, relu): + super().__init__(conv) + self.add = add + self.relu = relu + + def forward(self, x1, x2): # type: ignore[override] + r"""Applies convolution to x1, adds the result to x2, and applies ReLU.""" + return self.relu(self.add(self[0](x1), x2)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f49dc17b617241eae01635293e1203d6f994f61 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18534bbc588e7480ac6529c6648c5976eadaea3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py @@ -0,0 +1,32 @@ +from .conv_fused import ( + ConvBn1d, + ConvBn2d, + ConvBn3d, + ConvBnReLU1d, + ConvBnReLU2d, + ConvBnReLU3d, + ConvReLU1d, + ConvReLU2d, + ConvReLU3d, + freeze_bn_stats, + update_bn_stats, +) +from .linear_fused import LinearBn1d +from .linear_relu import LinearReLU + + +__all__ = [ + "LinearReLU", + "LinearBn1d", + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "ConvBn1d", + "ConvBn2d", + "ConvBn3d", + "ConvBnReLU1d", + "ConvBnReLU2d", + "ConvBnReLU3d", + "update_bn_stats", + "freeze_bn_stats", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d192c40cfc9cb81d5283dac9d0ecfe8cacb77c5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fedc28632165898913829990fc0448331cc7594 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2505e061a09624a7f47415610567c7069ae3ed57 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03742ef1550d71d7777d1bd1abc6dca514501008 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py new file mode 100644 index 0000000000000000000000000000000000000000..10f67764d8f05143e4bcc15ad1196f801015370a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py @@ -0,0 +1,958 @@ +# mypy: allow-untyped-defs +import math +from typing import ClassVar + +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.qat as nnqat +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init +from torch.nn.modules.utils import _pair, _single, _triple +from torch.nn.parameter import Parameter +from torch.nn.utils import fuse_conv_bn_weights + + +__all__ 
= [ + "ConvBn1d", + "ConvBnReLU1d", + "ConvReLU1d", + "ConvBn2d", + "ConvBnReLU2d", + "ConvReLU2d", + "ConvBn3d", + "ConvBnReLU3d", + "ConvReLU3d", + "update_bn_stats", + "freeze_bn_stats", +] +_BN_CLASS_MAP = { + 1: nn.BatchNorm1d, + 2: nn.BatchNorm2d, + 3: nn.BatchNorm3d, +} + + +class _ConvBnNd(nn.modules.conv._ConvNd, nni._FusedModule): + _version = 2 + _FLOAT_MODULE: ClassVar[type[nn.modules.conv._ConvNd]] + + def __init__( + self, + # ConvNd args + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + # BatchNormNd args + # num_features: out_channels + eps=1e-05, + momentum=0.1, + # affine: True + # track_running_stats: True + # Args for this module + freeze_bn=False, + qconfig=None, + dim=2, + ): + nn.modules.conv._ConvNd.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + False, + padding_mode, + ) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.freeze_bn = freeze_bn if self.training else True + self.bn = _BN_CLASS_MAP[dim](out_channels, eps, momentum, True, True) + self.weight_fake_quant = self.qconfig.weight() + if bias: + self.bias = Parameter(torch.empty(out_channels)) + else: + self.register_parameter("bias", None) + self.reset_bn_parameters() + + # this needs to be called after reset_bn_parameters, + # as they modify the same state + if self.training: + if freeze_bn: + self.freeze_bn_stats() + else: + self.update_bn_stats() + else: + self.freeze_bn_stats() + + self._enable_slow_path_for_better_numerical_stability = False + + def reset_running_stats(self): + self.bn.reset_running_stats() + + def reset_bn_parameters(self): + self.bn.reset_running_stats() + init.uniform_(self.bn.weight) + init.zeros_(self.bn.bias) + # note: below is actually for conv, not BN + if self.bias is not None: + fan_in, _ = 
init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + init.uniform_(self.bias, -bound, bound) + + def update_bn_stats(self): + self.freeze_bn = False + self.bn.training = True + return self + + def freeze_bn_stats(self): + self.freeze_bn = True + self.bn.training = False + return self + + def _forward(self, input): + if self._enable_slow_path_for_better_numerical_stability: + return self._forward_slow(input) + return self._forward_approximate(input) + + def _forward_approximate(self, input): + """Approximated method to fuse conv and bn. It requires only one forward pass. + conv_orig = conv / scale_factor where scale_factor = bn.weight / running_std + """ + assert self.bn.running_var is not None + running_std = torch.sqrt(self.bn.running_var + self.bn.eps) + scale_factor = self.bn.weight / running_std + weight_shape = [1] * len(self.weight.shape) + weight_shape[0] = -1 + bias_shape = [1] * len(self.weight.shape) + bias_shape[1] = -1 + scaled_weight = self.weight_fake_quant( + self.weight * scale_factor.reshape(weight_shape) + ) + # using zero bias here since the bias for original conv + # will be added later + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias, dtype=input.dtype) + else: + zero_bias = torch.zeros( + self.out_channels, device=scaled_weight.device, dtype=input.dtype + ) + conv = self._conv_forward(input, scaled_weight, zero_bias) + conv_orig = conv / scale_factor.reshape(bias_shape) + if self.bias is not None: + conv_orig = conv_orig + self.bias.reshape(bias_shape) + conv = self.bn(conv_orig) + return conv + + def _forward_slow(self, input): + """ + A more accurate but slow method to compute conv bn fusion, following https://arxiv.org/pdf/1806.08342.pdf + It requires two forward passes but handles the case bn.weight == 0 + + Conv: Y = WX + B_c + Conv without bias: Y0 = WX = Y - B_c, Y = Y0 + B_c + + Batch statistics: + mean_Y = Y.mean() + = Y0.mean() + B_c + var_Y = (Y - mean_Y)^2.mean() + = (Y0 - 
Y0.mean())^2.mean() + BN (r: bn.weight, beta: bn.bias): + Z = r * (Y - mean_Y) / sqrt(var_Y + eps) + beta + = r * (Y0 - Y0.mean()) / sqrt(var_Y + eps) + beta + + Fused Conv BN training (std_Y = sqrt(var_Y + eps)): + Z = (r * W / std_Y) * X + r * (B_c - mean_Y) / std_Y + beta + = (r * W / std_Y) * X - r * Y0.mean() / std_Y + beta + + Fused Conv BN inference (running_std = sqrt(running_var + eps)): + Z = (r * W / running_std) * X - r * (running_mean - B_c) / running_std + beta + + QAT with fused conv bn: + Z_train = fake_quant(r * W / running_std) * X * (running_std / std_Y) - r * Y0.mean() / std_Y + beta + = conv(X, fake_quant(r * W / running_std)) * (running_std / std_Y) - r * Y0.mean() / std_Y + beta + Z_inference = conv(X, fake_quant(r * W / running_std)) - r * (running_mean - B_c) / running_std + beta + """ + + assert self.bn.running_var is not None + assert self.bn.running_mean is not None + + # using zero bias here since the bias for original conv + # will be added later + zero_bias = torch.zeros( + self.out_channels, device=self.weight.device, dtype=input.dtype + ) + + weight_shape = [1] * len(self.weight.shape) + weight_shape[0] = -1 + bias_shape = [1] * len(self.weight.shape) + bias_shape[1] = -1 + + if self.bn.training: + # needed to compute batch mean/std + conv_out = self._conv_forward(input, self.weight, zero_bias) + # update bn statistics + with torch.no_grad(): + conv_out_bias = ( + conv_out + if self.bias is None + else conv_out + self.bias.reshape(bias_shape) + ) + self.bn(conv_out_bias) + + # fused conv + bn without bias using bn running statistics + running_std = torch.sqrt(self.bn.running_var + self.bn.eps) + scale_factor = self.bn.weight / running_std + scaled_weight = self.weight_fake_quant( + self.weight * scale_factor.reshape(weight_shape) + ) + # fused conv without bias for inference: (r * W / running_std) * X + conv_bn = self._conv_forward(input, scaled_weight, zero_bias) + + avg_dims = [0] + list(range(2, len(self.weight.shape))) + 
batch_mean = conv_out.mean(avg_dims) + batch_var = torch.square(conv_out - batch_mean.reshape(bias_shape)).mean( + avg_dims + ) + batch_std = torch.sqrt(batch_var + self.bn.eps) + + # scale to use batch std in training mode + # conv(X, r * W / std_Y) = conv(X, r * W / running_std) * (running_std / std_Y) + unscale_factor = running_std / batch_std + conv_bn *= unscale_factor.reshape(bias_shape) + + fused_mean = batch_mean + fused_std = batch_std + else: + # fused conv + bn without bias using bn running statistics + running_std = torch.sqrt(self.bn.running_var + self.bn.eps) + scale_factor = self.bn.weight / running_std + scaled_weight = self.weight_fake_quant( + self.weight * scale_factor.reshape(weight_shape) + ) + # fused conv without bias for inference: (r * W / running_std) * X + conv_bn = self._conv_forward(input, scaled_weight, zero_bias) + + fused_mean = self.bn.running_mean - ( + self.bias if self.bias is not None else 0 + ) + fused_std = running_std + + # fused bias = beta - r * mean / std + fused_bias = self.bn.bias - self.bn.weight * fused_mean / fused_std + conv_bn += fused_bias.reshape(bias_shape) + + # HACK to let conv bias participate in loss to avoid DDP error (parameters + # were not used in producing loss) + if self.bias is not None: + conv_bn += (self.bias - self.bias).reshape(bias_shape) + + return conv_bn + + def forward(self, input): + return self._forward(input) + + def train(self, mode=True): + """ + Batchnorm's training behavior is using the self.training flag. Prevent + changing it if BN is frozen. This makes sure that calling `model.train()` + on a model with a frozen BN will behave properly. 
+ """ + self.training = mode + if not self.freeze_bn: + for module in self.children(): + module.train(mode) + return self + + # ===== Serialization version history ===== + # + # Version 1/None + # self + # |--- weight : Tensor + # |--- bias : Tensor + # |--- gamma : Tensor + # |--- beta : Tensor + # |--- running_mean : Tensor + # |--- running_var : Tensor + # |--- num_batches_tracked : Tensor + # + # Version 2 + # self + # |--- weight : Tensor + # |--- bias : Tensor + # |--- bn : Module + # |--- weight : Tensor (moved from v1.self.gamma) + # |--- bias : Tensor (moved from v1.self.beta) + # |--- running_mean : Tensor (moved from v1.self.running_mean) + # |--- running_var : Tensor (moved from v1.self.running_var) + # |--- num_batches_tracked : Tensor (moved from v1.self.num_batches_tracked) + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + if version is None or version == 1: + # BN related parameters and buffers were moved into the BN module for v2 + v2_to_v1_names = { + "bn.weight": "gamma", + "bn.bias": "beta", + "bn.running_mean": "running_mean", + "bn.running_var": "running_var", + "bn.num_batches_tracked": "num_batches_tracked", + } + for v2_name, v1_name in v2_to_v1_names.items(): + if prefix + v1_name in state_dict: + state_dict[prefix + v2_name] = state_dict[prefix + v1_name] + state_dict.pop(prefix + v1_name) + elif prefix + v2_name in state_dict: + # there was a brief period where forward compatibility + # for this module was broken (between + # https://github.com/pytorch/pytorch/pull/38478 + # and https://github.com/pytorch/pytorch/pull/38820) + # and modules emitted the v2 state_dict format while + # specifying that version == 1. This patches the forward + # compatibility issue by allowing the v2 style entries to + # be used. 
+ pass + elif strict: + missing_keys.append(prefix + v2_name) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a qat module from a float module or qparams_dict + + Args: `mod` a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + # The ignore is because _FLOAT_MODULE is a TypeVar here where the bound + # has no __name__ (code is fine though) + assert type(mod) is cls._FLOAT_MODULE, ( + "qat." + + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + qconfig = mod.qconfig + conv, bn = mod[0], mod[1] # type: ignore[index] + qat_convbn = cls( + conv.in_channels, + conv.out_channels, + conv.kernel_size, + conv.stride, + conv.padding, + conv.dilation, + conv.groups, + conv.bias is not None, + conv.padding_mode, + bn.eps, + bn.momentum, + False, + qconfig, + ) + qat_convbn.weight = conv.weight + qat_convbn.bias = conv.bias + qat_convbn.bn.weight = bn.weight + qat_convbn.bn.bias = bn.bias + qat_convbn.bn.running_mean = bn.running_mean + qat_convbn.bn.running_var = bn.running_var + # mypy error: Cannot determine type of 'num_batches_tracked' + qat_convbn.bn.num_batches_tracked = bn.num_batches_tracked + return qat_convbn + + def to_float(self): + cls = type(self) + conv = cls._FLOAT_CONV_MODULE( # type: ignore[attr-defined] + self.in_channels, + self.out_channels, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.groups, + self.bias is not None, + self.padding_mode, + ) + conv.weight = torch.nn.Parameter(self.weight.detach()) + if self.bias is not None: + conv.bias = torch.nn.Parameter(self.bias.detach()) + + if cls._FLOAT_BN_MODULE: # type: 
ignore[attr-defined] + # fuse bn into conv + assert self.bn.running_var is not None and self.bn.running_mean is not None + conv.weight, conv.bias = fuse_conv_bn_weights( + conv.weight, + conv.bias, + self.bn.running_mean, + self.bn.running_var, + self.bn.eps, + self.bn.weight, + self.bn.bias, + ) + + if cls._FLOAT_RELU_MODULE: # type: ignore[attr-defined] + modules = [] + modules.append(conv) + relu = cls._FLOAT_RELU_MODULE() # type: ignore[attr-defined] + modules.append(relu) + conv_relu = cls._FUSED_FLOAT_MODULE(*modules) # type: ignore[attr-defined] + conv_relu.train(self.training) + return conv_relu + else: + conv.train(self.training) + return conv + + +class ConvBn1d(_ConvBnNd, nn.Conv1d): + r""" + A ConvBn1d module is a module fused from Conv1d and BatchNorm1d, + attached with FakeQuantize modules for weight, + used in quantization aware training. + + We combined the interface of :class:`torch.nn.Conv1d` and + :class:`torch.nn.BatchNorm1d`. + + Similar to :class:`torch.nn.Conv1d`, with FakeQuantize modules initialized + to default. 
+ + Attributes: + freeze_bn: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm1d]] = nn.BatchNorm1d + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = None + _FLOAT_MODULE: ClassVar[type[nn.Module]] = nni.ConvBn1d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + + def __init__( + self, + # Conv1d args + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=None, + padding_mode="zeros", + # BatchNorm1d args + # num_features: out_channels + eps=1e-05, + momentum=0.1, + # affine: True + # track_running_stats: True + # Args for this module + freeze_bn=False, + qconfig=None, + ): + kernel_size = _single(kernel_size) + stride = _single(stride) + padding = _single(padding) + dilation = _single(dilation) + _ConvBnNd.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _single(0), + groups, + bias, + padding_mode, + eps, + momentum, + freeze_bn, + qconfig, + dim=1, + ) + + +class ConvBnReLU1d(ConvBn1d): + r""" + A ConvBnReLU1d module is a module fused from Conv1d, BatchNorm1d and ReLU, + attached with FakeQuantize modules for weight, + used in quantization aware training. + + We combined the interface of :class:`torch.nn.Conv1d` and + :class:`torch.nn.BatchNorm1d` and :class:`torch.nn.ReLU`. + + Similar to `torch.nn.Conv1d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + + """ + + # base class defines _FLOAT_MODULE as "ConvBn1d" + _FLOAT_MODULE: ClassVar[type[nn.Module]] = nni.ConvBnReLU1d + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm1d]] = nn.BatchNorm1d + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU + # module class after fusing bn into conv + _FUSED_FLOAT_MODULE: ClassVar[type[nn.Module] | None] = nni.ConvReLU1d + + def forward(self, input): + r"""Performs forward pass through fused Conv1d, BatchNorm1d, and ReLU.""" + return F.relu(self._forward(input)) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Creates a QAT module from a floating point module.""" + return super().from_float(mod, use_precomputed_fake_quant) + + +class ConvReLU1d(nnqat.Conv1d, nni._FusedModule): + r"""A ConvReLU1d module is a fused module of Conv1d and ReLU, attached with + FakeQuantize modules for weight for + quantization aware training. + + We combined the interface of :class:`~torch.nn.Conv1d` and + :class:`~torch.nn.BatchNorm1d`. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_MODULE: ClassVar[type[nni.ConvReLU1d]] = nni.ConvReLU1d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = None + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + qconfig=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + # pyrefly: ignore [bad-argument-type] + padding_mode=padding_mode, + qconfig=qconfig, + ) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.weight_fake_quant = self.qconfig.weight() + + def forward(self, input): + r"""Performs forward pass through fused Conv1d and ReLU.""" + return F.relu( + self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a QAT module from a floating point module.""" + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class ConvBn2d(_ConvBnNd, nn.Conv2d): + r""" + A ConvBn2d module is a module fused from Conv2d and BatchNorm2d, + attached with FakeQuantize modules for weight, + used in quantization aware training. + + We combined the interface of :class:`torch.nn.Conv2d` and + :class:`torch.nn.BatchNorm2d`. + + Similar to :class:`torch.nn.Conv2d`, with FakeQuantize modules initialized + to default. 
+ + Attributes: + freeze_bn: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_MODULE: ClassVar[type[nni.ConvBn2d]] = nni.ConvBn2d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = nn.BatchNorm2d + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = None + + def __init__( + self, + # ConvNd args + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=None, + padding_mode="zeros", + # BatchNorm2d args + # num_features: out_channels + eps=1e-05, + momentum=0.1, + # affine: True + # track_running_stats: True + # Args for this module + freeze_bn=False, + qconfig=None, + ): + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + _ConvBnNd.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _pair(0), + groups, + bias, + padding_mode, + eps, + momentum, + freeze_bn, + qconfig, + dim=2, + ) + + +class ConvBnReLU2d(ConvBn2d): + r""" + A ConvBnReLU2d module is a module fused from Conv2d, BatchNorm2d and ReLU, + attached with FakeQuantize modules for weight, + used in quantization aware training. + + We combined the interface of :class:`torch.nn.Conv2d` and + :class:`torch.nn.BatchNorm2d` and :class:`torch.nn.ReLU`. + + Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + + """ + + # base class defines _FLOAT_MODULE as "ConvBn2d" + _FLOAT_MODULE: ClassVar[type[nni.ConvBnReLU2d]] = nni.ConvBnReLU2d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm2d]] = nn.BatchNorm2d + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU + # module class after fusing bn into conv + _FUSED_FLOAT_MODULE: ClassVar[type[nni.ConvReLU2d] | None] = nni.ConvReLU2d + + def forward(self, input): + r"""Performs forward pass through fused Conv2d, BatchNorm2d, and ReLU.""" + return F.relu(self._forward(input)) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Creates a QAT module from a floating point module.""" + return super().from_float(mod, use_precomputed_fake_quant) + + +class ConvReLU2d(nnqat.Conv2d, nni._FusedModule): + r"""A ConvReLU2d module is a fused module of Conv2d and ReLU, attached with + FakeQuantize modules for weight for + quantization aware training. + + We combined the interface of :class:`~torch.nn.Conv2d` and + :class:`~torch.nn.BatchNorm2d`. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Module]] = nni.ConvReLU2d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = None + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + qconfig=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + # pyrefly: ignore [bad-argument-type] + padding_mode=padding_mode, + qconfig=qconfig, + ) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.weight_fake_quant = self.qconfig.weight() + + def forward(self, input): + r"""Performs forward pass through fused Conv2d and ReLU.""" + return F.relu( + self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a QAT module from a floating point module.""" + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class ConvBn3d(_ConvBnNd, nn.Conv3d): + r""" + A ConvBn3d module is a module fused from Conv3d and BatchNorm3d, + attached with FakeQuantize modules for weight, + used in quantization aware training. + + We combined the interface of :class:`torch.nn.Conv3d` and + :class:`torch.nn.BatchNorm3d`. + + Similar to :class:`torch.nn.Conv3d`, with FakeQuantize modules initialized + to default. 
+ + Attributes: + freeze_bn: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_MODULE: ClassVar[type[nni.ConvBn3d]] = nni.ConvBn3d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = nn.BatchNorm3d + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = None + + def __init__( + self, + # ConvNd args + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=None, + padding_mode="zeros", + # BatchNorm3d args + # num_features: out_channels + eps=1e-05, + momentum=0.1, + # affine: True + # track_running_stats: True + # Args for this module + freeze_bn=False, + qconfig=None, + ): + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + _ConvBnNd.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _triple(0), + groups, + bias, + padding_mode, + eps, + momentum, + freeze_bn, + qconfig, + dim=3, + ) + + +class ConvBnReLU3d(ConvBn3d): + r""" + A ConvBnReLU3d module is a module fused from Conv3d, BatchNorm3d and ReLU, + attached with FakeQuantize modules for weight, + used in quantization aware training. + + We combined the interface of :class:`torch.nn.Conv3d` and + :class:`torch.nn.BatchNorm3d` and :class:`torch.nn.ReLU`. + + Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_MODULE: ClassVar[type[nni.ConvBnReLU3d]] = nni.ConvBnReLU3d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm3d]] = nn.BatchNorm3d + _FLOAT_RELU_MODULE: ClassVar[type[nn.ReLU] | None] = nn.ReLU + # module class after fusing bn into conv + _FUSED_FLOAT_MODULE: ClassVar[type[nni.ConvReLU3d] | None] = nni.ConvReLU3d + + def forward(self, input): + r"""Performs forward pass through fused Conv3d, BatchNorm3d, and ReLU.""" + return F.relu(ConvBn3d._forward(self, input)) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Creates a QAT module from a floating point module.""" + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class ConvReLU3d(nnqat.Conv3d, nni._FusedModule): + r"""A ConvReLU3d module is a fused module of Conv3d and ReLU, attached with + FakeQuantize modules for weight for + quantization aware training. + + We combined the interface of :class:`~torch.nn.Conv3d` and + :class:`~torch.nn.BatchNorm3d`. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + + """ + + _FLOAT_MODULE: ClassVar[type[nni.ConvReLU3d]] = nni.ConvReLU3d # type: ignore[assignment] + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = None + _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + qconfig=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + # pyrefly: ignore [bad-argument-type] + padding_mode=padding_mode, + qconfig=qconfig, + ) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.weight_fake_quant = self.qconfig.weight() + + def forward(self, input): + r"""Performs forward pass through fused Conv3d and ReLU.""" + return F.relu( + self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a QAT module from a floating point module.""" + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +def update_bn_stats(mod): + if type(mod) in { + ConvBnReLU1d, + ConvBnReLU2d, + ConvBnReLU3d, + ConvBn1d, + ConvBn2d, + ConvBn3d, + }: + mod.update_bn_stats() + + +def freeze_bn_stats(mod): + if type(mod) in { + ConvBnReLU1d, + ConvBnReLU2d, + ConvBnReLU3d, + ConvBn1d, + ConvBn2d, + ConvBn3d, + }: + mod.freeze_bn_stats() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py new file mode 100644 index 
0000000000000000000000000000000000000000..8458cef76ee3a37bce33d924d2d60d2ca971a614 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py @@ -0,0 +1,191 @@ +# mypy: allow-untyped-defs +import torch +import torch.ao.nn.intrinsic as nni +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init +from torch.nn.parameter import Parameter +from torch.nn.utils.fusion import fuse_linear_bn_weights + + +__all__ = [ + "LinearBn1d", +] + + +class LinearBn1d(nn.modules.linear.Linear, nni._FusedModule): + r""" + A LinearBn1d module is a module fused from Linear and BatchNorm1d, attached + with FakeQuantize modules for weight, used in quantization aware training. + + We combined the interface of :class:`torch.nn.Linear` and + :class:torch.nn.BatchNorm1d`. + + Similar to :class:`torch.nn.Linear`, with FakeQuantize modules initialized + to default. + + Attributes: + freeze_bn: + weight_fake_quant: fake quant module for weight + + """ + + def __init__( + self, + # Linear args + in_features, + out_features, + bias=True, + # BatchNorm1d args + # num_features: out_features + eps=1e-05, + momentum=0.1, + # affine: True + # track_running_stats: True + # Args for this module + freeze_bn=False, + qconfig=None, + ): + nn.modules.linear.Linear.__init__(self, in_features, out_features, bias) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.freeze_bn = freeze_bn if self.training else True + self.bn = nn.BatchNorm1d(out_features, eps, momentum, True, True) + self.weight_fake_quant = self.qconfig.weight() + if bias: + self.bias = Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + self.reset_bn_parameters() + + # this needs to be called after reset_bn_parameters, + # as they modify the same state + if self.training: + if freeze_bn: + self.freeze_bn_stats() + else: + self.update_bn_stats() + else: + self.freeze_bn_stats() 
+ + def reset_running_stats(self): + self.bn.reset_running_stats() + + def reset_bn_parameters(self): + self.bn.reset_running_stats() + init.uniform_(self.bn.weight) + init.zeros_(self.bn.bias) + + def update_bn_stats(self): + self.freeze_bn = False + self.bn.training = True + return self + + def freeze_bn_stats(self): + self.freeze_bn = True + self.bn.training = False + return self + + def forward(self, input): + assert self.bn.running_var is not None + + # Scale the linear weights by BN's running statistics to reduce + # weight jitter, see https://arxiv.org/pdf/1806.08342.pdf, page 18 + # for motivation. + # + # Instead of + # + # x1 = F.linear(x0, fq(w), b) + # x2 = self.bn(x1) + # + # We have + # + # # scale the weight by previous batch's running statistics + # scale_factor = bn.w / bn.running_std_from_prev_batch + # # do the linear transformation without bias + # x1_scaled = F.linear(x0, fq(w * scale_factor), 0) + # # reverse the scaling and add original bias + # x1_orig = x1_scaled / scale_factor + b + # x2 = self.bn(x1_orig) + + running_std = torch.sqrt(self.bn.running_var + self.bn.eps) + scale_factor = self.bn.weight / running_std + weight_shape = [1] * len(self.weight.shape) + weight_shape[0] = -1 + bias_shape = [1] * len(self.weight.shape) + bias_shape[1] = -1 + scaled_weight = self.weight_fake_quant( + self.weight * scale_factor.reshape(weight_shape) + ) + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias) + else: + zero_bias = torch.zeros(self.out_features, device=scaled_weight.device) + linear_out = F.linear(input, scaled_weight, zero_bias) + linear_out_orig = linear_out / scale_factor.reshape(bias_shape) + if self.bias is not None: + linear_out_orig = linear_out_orig + self.bias.reshape(bias_shape) + bn_out = self.bn(linear_out_orig) + return bn_out + + def train(self, mode=True): + """ + Batchnorm's training behavior is using the self.training flag. Prevent + changing it if BN is frozen. 
This makes sure that calling `model.train()` + on a model with a frozen BN will behave properly. + """ + self.training = mode + if not self.freeze_bn: + for module in self.children(): + module.train(mode) + return self + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a qat module from a float module or qparams_dict + + Args: + mod: A float module, either produced by torch.ao.quantization + utilities or directly from the user. + """ + assert type(mod) is nni.LinearBn1d, ( + "qat." + + cls.__name__ + + ".from_float only works for " + + nni.LinearBn1d.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid config" + qconfig = mod.qconfig + linear, bn = mod[0], mod[1] + qat_linearbn = cls( + linear.in_features, + linear.out_features, + linear.bias is not None, + bn.eps, + bn.momentum, + False, + qconfig, + ) + qat_linearbn.weight = linear.weight # type: ignore[assignment] + qat_linearbn.bias = linear.bias # type: ignore[assignment] + qat_linearbn.bn.weight = bn.weight # type: ignore[assignment] + qat_linearbn.bn.bias = bn.bias # type: ignore[assignment] + qat_linearbn.bn.running_mean = bn.running_mean # type: ignore[assignment] + qat_linearbn.bn.running_var = bn.running_var # type: ignore[assignment] + qat_linearbn.bn.num_batches_tracked = bn.num_batches_tracked # type: ignore[assignment] + return qat_linearbn + + def to_float(self): + linear = torch.nn.Linear(self.in_features, self.out_features) + assert self.bn.running_var is not None and self.bn.running_mean is not None + linear.weight, linear.bias = fuse_linear_bn_weights( + self.weight, + self.bias, + self.bn.running_mean, + self.bn.running_var, + self.bn.eps, + self.bn.weight, + self.bn.bias, + ) + return linear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..183286ebb8dad25e49cd2fcd7c2dba2436003823 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.qat as nnqat +import torch.nn.functional as F +from torch.ao.nn.intrinsic.modules.fused import _FusedModule + + +if TYPE_CHECKING: + from torch.ao.quantization.qconfig import QConfigAny + + +__all__ = ["LinearReLU"] + + +class LinearReLU(nnqat.Linear, _FusedModule): + r""" + A LinearReLU module fused from Linear and ReLU modules, attached with + FakeQuantize modules for weight, used in + quantization aware training. + + We adopt the same interface as :class:`torch.nn.Linear`. + + Similar to `torch.ao.nn.intrinsic.LinearReLU`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight: fake quant module for weight + + Examples:: + + >>> # xdoctest: +SKIP + >>> m = nn.qat.LinearReLU(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + # pyrefly: ignore [bad-override] + _FLOAT_MODULE = nni.LinearReLU + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + qconfig: QConfigAny = None, + ) -> None: + super().__init__(in_features, out_features, bias, qconfig) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias)) + + @classmethod + def from_float( + cls, + mod: torch.nn.Module, + use_precomputed_fake_quant: bool = False, + ) -> LinearReLU: + return super().from_float(mod, use_precomputed_fake_quant) # type: ignore[no-untyped-call,no-any-return] + + def to_float(self) -> nni.LinearReLU: + linear = torch.nn.Linear( + self.in_features, self.out_features, self.bias is not None + ) + linear.weight = torch.nn.Parameter(self.weight.detach()) + if self.bias is not None: + linear.bias = torch.nn.Parameter(self.bias.detach()) + relu = torch.nn.ReLU() + return torch.ao.nn.intrinsic.LinearReLU(linear, relu) # type: ignore[no-untyped-call] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6af3b4aeee893966323cc4e73a27ff41814fc251 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py @@ -0,0 +1,15 @@ +from .modules import * # noqa: F403 + + +__all__ = [ + "BNReLU2d", + "BNReLU3d", + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "LinearReLU", + "LinearLeakyReLU", + "LinearTanh", + "ConvAdd2d", + "ConvAddReLU2d", +] diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10580962ae775a5a49ac465319aaa6fd4edf2f11 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..956eef6919409cf2426f95790f2f066faeaff6cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a6c3c57c7828861b574e76b134aee2c23f0aad --- 
/dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py @@ -0,0 +1,6 @@ +from .linear_relu import LinearReLU + + +__all__ = [ + "LinearReLU", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd0e26bfd4772a75c5cb704d49b6d765155d4fb3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2debf206f3a9c51d4625a0676dc559e0c21add3d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..620d24ae43e466ecd7883acf7df627641ebfdb24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py @@ -0,0 +1,72 @@ +from typing import Any +from typing_extensions 
import Self + +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.quantized.dynamic as nnqd + + +__all__ = ["LinearReLU"] + + +class LinearReLU(nnqd.Linear): + r""" + A LinearReLU module fused from Linear and ReLU modules that can be used + for dynamic quantization. + Supports both, FP16 and INT8 quantization. + + We adopt the same interface as :class:`torch.ao.nn.quantized.dynamic.Linear`. + + Attributes: + Same as torch.ao.nn.quantized.dynamic.Linear + + Examples:: + + >>> # xdoctest: +SKIP + >>> m = nn.intrinsic.quantized.dynamic.LinearReLU(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + # pyrefly: ignore [bad-override] + _FLOAT_MODULE = nni.LinearReLU + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + dtype: torch.dtype = torch.qint8, + ) -> None: + super().__init__(in_features, out_features, bias, dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self._packed_params.dtype == torch.qint8: + # TODO check if we should set reduce_rage = True by default here + Y = torch.ops.quantized.linear_relu_dynamic( + x, self._packed_params._packed_params, reduce_range=True + ) + elif self._packed_params.dtype == torch.float16: + Y = torch.ops.quantized.linear_relu_dynamic_fp16( + x, self._packed_params._packed_params + ) + else: + raise RuntimeError("Unsupported dtype on dynamic quantized linear relu!") + return Y.to(x.dtype) + + def _get_name(self) -> str: + return "DynamicQuantizedLinearReLU" + + @classmethod + def from_float( + cls, mod: torch.nn.Module, use_precomputed_fake_quant: bool = False + ) -> Self: + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, ref_qlinear_relu: Any) -> Self: # type: ignore[override] + return super().from_reference(ref_qlinear_relu[0]) diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7fa4dcec2597e18c002489405894ea7251d5156 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py @@ -0,0 +1,18 @@ +from .bn_relu import BNReLU2d, BNReLU3d +from .conv_add import ConvAdd2d, ConvAddReLU2d +from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d +from .linear_relu import LinearLeakyReLU, LinearReLU, LinearTanh + + +__all__ = [ + "LinearReLU", + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "BNReLU2d", + "BNReLU3d", + "LinearLeakyReLU", + "LinearTanh", + "ConvAdd2d", + "ConvAddReLU2d", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57fbe9f680f564c2be42a8155f1c615a622afcf3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c9bbce6d61bde4e58f90703f5e5f3b2d5464220 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d50d2c92d0aae6038fd9a2ce8d9dbeaffbbe61f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33e90ba6621d722045b0b5491b9cfd292f779110 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f697d44edac8de37994418f7c93c0fcdf2199d1b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..f05618c0949e1164f05cbd1edbfb8eb6440063e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py @@ -0,0 +1,113 @@ +# mypy: allow-untyped-defs + +import torch +import torch.ao.nn.intrinsic +import torch.ao.nn.intrinsic.qat +import torch.ao.nn.quantized as nnq + + +__all__ = ["BNReLU2d", "BNReLU3d"] + + +class BNReLU2d(nnq.BatchNorm2d): + r""" + A BNReLU2d module is a fused module of BatchNorm2d and ReLU + + We adopt the same interface as :class:`torch.ao.nn.quantized.BatchNorm2d`. + + Attributes: + Same as torch.ao.nn.quantized.BatchNorm2d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU2d + + def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None): + super().__init__( + num_features, eps=eps, momentum=momentum, device=device, dtype=dtype + ) + + def forward(self, input): + r"""Applies fused BatchNorm2d and ReLU.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + return torch.ops.quantized.batch_norm2d_relu( + input, + self.weight, + self.bias, + self.running_mean, + self.running_var, + self.eps, + self.scale, + self.zero_point, + ) + + def _get_name(self): + return "QuantizedBNReLU2d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + # TODO: Add qat support for BNReLU2d + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, bn_relu, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + return 
super().from_reference(bn_relu[0], output_scale, output_zero_point) + + +class BNReLU3d(nnq.BatchNorm3d): + r""" + A BNReLU3d module is a fused module of BatchNorm3d and ReLU + + We adopt the same interface as :class:`torch.ao.nn.quantized.BatchNorm3d`. + + Attributes: + Same as torch.ao.nn.quantized.BatchNorm3d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU3d + + def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None): + super().__init__( + num_features, eps=eps, momentum=momentum, device=device, dtype=dtype + ) + + def forward(self, input): + r"""Applies fused BatchNorm3d and ReLU.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, D, H, W)`!") + return torch.ops.quantized.batch_norm3d_relu( + input, + self.weight, + self.bias, + self.running_mean, + self.running_var, + self.eps, + self.scale, + self.zero_point, + ) + + def _get_name(self): + return "QuantizedBNReLU3d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + # TODO: Add qat support for BNReLU3d + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, bn_relu, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + return super().from_reference(bn_relu[0], output_scale, output_zero_point) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py new file mode 100644 index 0000000000000000000000000000000000000000..82d5673e7173c56b5b56d2bd48a0b154bbfdfe9e --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py @@ -0,0 +1,153 @@ +# mypy: allow-untyped-defs +import torch +import torch.ao.nn.intrinsic +import torch.ao.nn.intrinsic.qat +import torch.ao.nn.quantized as nnq +import torch.nn.functional as F + + +_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding + + +class ConvAdd2d(nnq.Conv2d): + r""" + A ConvAdd2d module is a fused module of Conv2d and Add + + We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`. + + Attributes: + Same as torch.ao.nn.quantized.Conv2d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAdd2d # type: ignore[assignment] + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + + def forward(self, input, extra_input): # type: ignore[override] + r"""Applies fused quantized Conv2d and addition.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return torch.ops.quantized.conv2d_add( + input, extra_input, self._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedConvAdd2d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + return super().from_float( + mod, 
use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, ref_qconv, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + return super().from_reference(ref_qconv[0], output_scale, output_zero_point) + + +class ConvAddReLU2d(nnq.Conv2d): + r""" + A ConvAddReLU2d module is a fused module of Conv2d, Add and Relu + + We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`. + + Attributes: + Same as torch.ao.nn.quantized.Conv2d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAddReLU2d # type: ignore[assignment] + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + + def forward(self, input, extra_input): # type: ignore[override] + r"""Applies fused quantized Conv2d, addition, and ReLU.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return torch.ops.quantized.conv2d_add_relu( + input, extra_input, self._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedConvAddReLU2d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + 
@classmethod + def from_reference(cls, ref_qconv, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + return super().from_reference(ref_qconv[0], output_scale, output_zero_point) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..c31df28905cd7c9c17147c965f5bd2199af2920a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py @@ -0,0 +1,276 @@ +# mypy: allow-untyped-defs + +import torch +import torch.ao.nn.intrinsic +import torch.ao.nn.intrinsic.qat +import torch.ao.nn.quantized as nnq +import torch.nn.functional as F +from torch.nn.utils import fuse_conv_bn_weights + + +__all__ = [ + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", +] + +_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding + + +# TODO: factor out the common parts to ConvNd +class ConvReLU1d(nnq.Conv1d): + r""" + A ConvReLU1d module is a fused module of Conv1d and ReLU + + We adopt the same interface as :class:`torch.ao.nn.quantized.Conv1d`. 
+ + Attributes: + Same as torch.ao.nn.quantized.Conv1d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU1d # type: ignore[assignment] + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + # pyrefly: ignore [bad-argument-type] + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + + def forward(self, input): + r"""Applies fused quantized Conv1d and ReLU.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 3: + raise ValueError("Input shape must be `(N, C, L)`!") + if self.padding_mode != "zeros": + # Padding in Conv1d is stored as (p, p), need to get (p,) + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1]) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return torch.ops.quantized.conv1d_relu( + input, self._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedConvReLU1d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + if type(mod) is torch.ao.nn.intrinsic.qat.ConvBnReLU1d: + assert mod.bn.running_var is not None and mod.bn.running_mean is not None + mod.weight, mod.bias = fuse_conv_bn_weights( + mod.weight, + mod.bias, + mod.bn.running_mean, + mod.bn.running_var, + mod.bn.eps, + mod.bn.weight, + mod.bn.bias, + ) + return super().from_float(mod, use_precomputed_fake_quant) + + @classmethod + def from_reference(cls, ref_qconv, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + assert type(ref_qconv) is 
not torch.ao.nn.intrinsic.ConvBnReLU1d, ( + "BatchNorm1d should be fused into Conv1d before converting to reference module" + ) + return super().from_reference(ref_qconv[0], output_scale, output_zero_point) + + +class ConvReLU2d(nnq.Conv2d): + r""" + A ConvReLU2d module is a fused module of Conv2d and ReLU + + We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`. + + Attributes: + Same as torch.ao.nn.quantized.Conv2d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU2d # type: ignore[assignment] + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + + def forward(self, input): + r"""Applies fused quantized Conv2d and ReLU.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return torch.ops.quantized.conv2d_relu( + input, self._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedConvReLU2d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + if type(mod) is torch.ao.nn.intrinsic.qat.ConvBnReLU2d: + assert mod.bn.running_var is not None and mod.bn.running_mean is not None + mod.weight, mod.bias = fuse_conv_bn_weights( + mod.weight, + mod.bias, + mod.bn.running_mean, + mod.bn.running_var, + 
mod.bn.eps, + mod.bn.weight, + mod.bn.bias, + ) + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, ref_qconv, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + assert type(ref_qconv) is not torch.ao.nn.intrinsic.ConvBnReLU2d, ( + "BatchNorm2d should be fused into Conv2d before converting to reference module" + ) + return super().from_reference(ref_qconv[0], output_scale, output_zero_point) + + +class ConvReLU3d(nnq.Conv3d): + r""" + A ConvReLU3d module is a fused module of Conv3d and ReLU + + We adopt the same interface as :class:`torch.ao.nn.quantized.Conv3d`. + + Attributes: Same as torch.ao.nn.quantized.Conv3d + + """ + + _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU3d # type: ignore[assignment] + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + assert padding_mode != "reflect", "Conv3d does not support reflection padding" + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + + def forward(self, input): + r"""Applies fused quantized Conv3d and ReLU.""" + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, D, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return torch.ops.quantized.conv3d_relu( + input, self._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedConvReLU3d" + + @classmethod + 
def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module.""" + if type(mod) is torch.ao.nn.intrinsic.qat.ConvBnReLU3d: + assert mod.bn.running_var is not None and mod.bn.running_mean is not None + mod.weight, mod.bias = fuse_conv_bn_weights( + mod.weight, + mod.bias, + mod.bn.running_mean, + mod.bn.running_var, + mod.bn.eps, + mod.bn.weight, + mod.bn.bias, + ) + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, ref_qconv, output_scale, output_zero_point): + r"""Creates a quantized module from a reference module.""" + assert type(ref_qconv) is not torch.ao.nn.intrinsic.ConvBnReLU3d, ( + "BatchNorm3d should be fused into Conv3d before converting to reference module" + ) + return super().from_reference(ref_qconv[0], output_scale, output_zero_point) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec84101ee0da62e3923362f444368b2a429d8b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py @@ -0,0 +1,190 @@ +# mypy: allow-untyped-defs +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.quantized as nnq +from torch.ao.nn.quantized.modules.utils import _quantize_weight + + +__all__ = [ + "LinearReLU", + "LinearLeakyReLU", + "LinearTanh", +] + + +class LinearReLU(nnq.Linear): + r""" + A LinearReLU module fused from Linear and ReLU modules + + We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`. 
+ + Attributes: + Same as torch.ao.nn.quantized.Linear + + Examples:: + + >>> # xdoctest: +SKIP + >>> m = nn.intrinsic.LinearReLU(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + _FLOAT_MODULE = nni.LinearReLU # type: ignore[assignment] + + def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8): + super().__init__(in_features, out_features, bias, dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.quantized.linear_relu( + x, self._packed_params._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedLinearReLU" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return super().from_float(mod, use_precomputed_fake_quant) + + @classmethod + def from_reference(cls, ref_linear_relu, output_scale, output_zero_point): + return super().from_reference( + ref_linear_relu[0], output_scale, output_zero_point + ) + + +class LinearLeakyReLU(nnq.Linear): + r""" + For onednn backend only + A LinearLeakyReLU module fused from Linear and LeakyReLU modules + We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`. 
+ Attributes: + Same as torch.ao.nn.quantized.Linear + + negative_slope + Examples:: + >>> # xdoctest: +SKIP + >>> m = nn.intrinsic.LinearLeakyReLU(20, 30, 0.01) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + _FLOAT_MODULE = nni.LinearLeakyReLU # type: ignore[assignment] + + def __init__( + self, in_features, out_features, negative_slope, bias=True, dtype=torch.qint8 + ): + super().__init__(in_features, out_features, bias, dtype) + self.negative_slope = negative_slope + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.quantized.linear_leaky_relu( + x, + self._packed_params._packed_params, + self.scale, + self.zero_point, + self.negative_slope, + ) + + def _get_name(self): + return "QuantizedLinearLeakyReLU" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + assert type(mod) is nni.LinearLeakyReLU, ( + "Input float module should be LinearLeakyReLU" + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + activation_post_process = mod.activation_post_process + leaky_relu = mod[1] + mod = mod[0] + weight_post_process = mod.qconfig.weight() # type: ignore[union-attr, operator] + weight_post_process(mod.weight) + dtype = weight_post_process.dtype + act_scale, act_zp = activation_post_process.calculate_qparams() # type: ignore[union-attr,operator] + assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8" + qweight = _quantize_weight(mod.weight.float(), weight_post_process) + qlinear_leaky_relu = cls( + mod.in_features, mod.out_features, leaky_relu.negative_slope, dtype=dtype + ) + qlinear_leaky_relu.set_weight_bias(qweight, mod.bias) # type: ignore[arg-type] + qlinear_leaky_relu.scale = float(act_scale) + qlinear_leaky_relu.zero_point = int(act_zp) + return qlinear_leaky_relu + + @classmethod + def from_reference(cls, ref_mod, output_scale, output_zero_point): + linear = ref_mod[0] + leaky_relu = 
ref_mod[1] + qlinear_leaky_relu = cls( + linear.in_features, linear.out_features, leaky_relu.negative_slope + ) + qweight = linear.get_quantized_weight() + qlinear_leaky_relu.set_weight_bias(qweight, linear.bias) + qlinear_leaky_relu.scale = float(output_scale) + qlinear_leaky_relu.zero_point = int(output_zero_point) + return qlinear_leaky_relu + + +class LinearTanh(nnq.Linear): + r""" + A LinearTanh module fused from Linear and Tanh modules + + We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`. + + Attributes: + Same as torch.ao.nn.quantized.Linear + + Examples:: + + >>> # xdoctest: +SKIP + >>> m = nn.intrinsic.LinearTanh(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + _FLOAT_MODULE = nni.LinearTanh # type: ignore[assignment] + + def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8): + super().__init__(in_features, out_features, bias, dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.quantized.linear_tanh( + x, self._packed_params._packed_params, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedLinearTanh" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + assert type(mod) is nni.LinearTanh, "Input float module should be LinearTanh" + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + activation_post_process = mod.activation_post_process + mod = mod[0] + weight_post_process = mod.qconfig.weight() # type: ignore[union-attr,operator] + weight_post_process(mod.weight) + dtype = weight_post_process.dtype + act_scale, act_zp = activation_post_process.calculate_qparams() # type: ignore[union-attr,operator] + assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8" + qweight = _quantize_weight(mod.weight.float(), weight_post_process) + qlinear_tanh = cls(mod.in_features, mod.out_features, dtype=dtype) + 
qlinear_tanh.set_weight_bias(qweight, mod.bias) # type: ignore[arg-type] + qlinear_tanh.scale = float(act_scale) + qlinear_tanh.zero_point = int(act_zp) + return qlinear_tanh + + @classmethod + def from_reference(cls, ref_mod, output_scale, output_zero_point): + linear = ref_mod[0] + qlinear_tanh = cls(linear.in_features, linear.out_features) + qweight = linear.get_quantized_weight() + qlinear_tanh.set_weight_bias(qweight, linear.bias) + qlinear_tanh.scale = float(output_scale) + qlinear_tanh.zero_point = int(output_zero_point) + return qlinear_tanh diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/dynamic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/dynamic/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e28e0968a60d7612ebbd26d5f607b4407c2d380 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/__init__.py @@ -0,0 +1,13 @@ +from .conv import Conv1d, Conv2d, Conv3d +from 
.embedding_ops import Embedding, EmbeddingBag +from .linear import Linear + + +__all__ = [ + "Linear", + "Conv1d", + "Conv2d", + "Conv3d", + "Embedding", + "EmbeddingBag", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..9d228d56fce129860f0ebad805b042771b941804 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/conv.py @@ -0,0 +1,312 @@ +# mypy: allow-untyped-defs +from typing import ClassVar, Literal + +import torch +import torch.nn as nn +from torch.ao.nn.intrinsic import _FusedModule +from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t +from torch.nn.modules.utils import _pair, _single, _triple + + +__all__ = ["Conv1d", "Conv2d", "Conv3d"] + + +class _ConvNd(nn.modules.conv._ConvNd): + _FLOAT_MODULE: ClassVar[type[nn.modules.conv._ConvNd]] + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: tuple[int, ...], + stride: tuple[int, ...], + padding: str | tuple[int, ...], + dilation: tuple[int, ...], + transposed: bool, + output_padding: tuple[int, ...], + groups: int, + bias: bool, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"], + qconfig=None, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + nn.modules.conv._ConvNd.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs) + + def forward(self, input): + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + + @staticmethod + 
def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a qat module from a float module + + Args: + `mod`: a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + assert type(mod) is cls._FLOAT_MODULE, ( + "qat." + + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + if issubclass(type(mod), _FusedModule): + mod = mod[0] + qconfig = mod.qconfig + qat_conv = cls( + mod.in_channels, + mod.out_channels, + mod.kernel_size, + stride=mod.stride, + padding=mod.padding, + dilation=mod.dilation, + groups=mod.groups, + bias=mod.bias is not None, + padding_mode=mod.padding_mode, + qconfig=qconfig, + ) + qat_conv.weight = mod.weight + qat_conv.bias = mod.bias + return qat_conv + + def to_float(self): + """This works for both single qat conv, and the qat conv - relu modules + to convert the qat module to a floating point module + """ + cls = type(self) + conv = cls._FLOAT_CONV_MODULE( # type: ignore[attr-defined] + self.in_channels, + self.out_channels, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.groups, + self.bias is not None, + self.padding_mode, + ) + conv.weight = torch.nn.Parameter(self.weight.detach()) + if self.bias is not None: + conv.bias = torch.nn.Parameter(self.bias.detach()) + # conv relu + if issubclass(cls, _FusedModule): + modules = [conv] + assert hasattr(cls, "_FLOAT_RELU_MODULE") + relu = cls._FLOAT_RELU_MODULE() + modules.append(relu) + # pyrefly: ignore [missing-attribute] + fused = cls._FLOAT_MODULE(*modules) + fused.train(self.training) + return fused + else: + return conv + + +class Conv1d(_ConvNd, nn.Conv1d): + r""" + A Conv1d module attached with FakeQuantize modules for weight, + used for quantization aware training. 
+ + We adopt the same interface as :class:`~torch.nn.Conv1d` + + Similar to :class:`~torch.nn.Conv2d`, with FakeQuantize modules initialized to + default. + + Attributes: + weight_fake_quant: fake quant module for weight + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: str | _size_1_t = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + qconfig=None, + device=None, + dtype=None, + ) -> None: + kernel_size_ = _single(kernel_size) + stride_ = _single(stride) + padding_ = padding if isinstance(padding, str) else _single(padding) + dilation_ = _single(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride=stride_, + padding=padding_, + dilation=dilation_, + transposed=False, + output_padding=_single(0), + groups=groups, + bias=bias, + padding_mode=padding_mode, + qconfig=qconfig, + device=device, + dtype=dtype, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + return super().from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class Conv2d(_ConvNd, nn.Conv2d): + r""" + A Conv2d module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.Conv2d`, please see + https://pytorch.org/docs/stable/nn.html?highlight=conv2d#torch.nn.Conv2d + for documentation. + + Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_2_t, + stride: _size_2_t = 1, + padding: str | _size_2_t = 0, + dilation: _size_2_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + qconfig=None, + device=None, + dtype=None, + ) -> None: + kernel_size_ = _pair(kernel_size) + stride_ = _pair(stride) + padding_ = padding if isinstance(padding, str) else _pair(padding) + dilation_ = _pair(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride=stride_, + padding=padding_, + dilation=dilation_, + transposed=False, + output_padding=_pair(0), + groups=groups, + bias=bias, + padding_mode=padding_mode, + qconfig=qconfig, + device=device, + dtype=dtype, + ) + + def forward(self, input): + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + return super().from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class Conv3d(_ConvNd, nn.Conv3d): + r""" + A Conv3d module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.Conv3d`, please see + https://pytorch.org/docs/stable/nn.html?highlight=conv3d#torch.nn.Conv3d + for documentation. + + Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: str | _size_3_t = 0, + dilation: _size_3_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + qconfig=None, + device=None, + dtype=None, + ) -> None: + kernel_size_ = _triple(kernel_size) + stride_ = _triple(stride) + padding_ = padding if isinstance(padding, str) else _triple(padding) + dilation_ = _triple(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride=stride_, + padding=padding_, + dilation=dilation_, + transposed=False, + output_padding=_triple(0), + groups=groups, + bias=bias, + padding_mode=padding_mode, + qconfig=qconfig, + device=device, + dtype=dtype, + ) + + def forward(self, input): + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + return super().from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/embedding_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/embedding_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..1f69e70abcf1d43c4a96ca15dae355c31f66a627 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/embedding_ops.py @@ -0,0 +1,251 @@ +# mypy: allow-untyped-defs +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +__all__ = ["Embedding", "EmbeddingBag"] + + +class Embedding(nn.Embedding): + r""" + An 
embedding bag module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.Embedding`, please see + https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding + for documentation. + + Similar to `torch.nn.Embedding`, with FakeQuantize modules initialized to + default. + + Attributes: + weight: fake quant module for weight + """ + + _FLOAT_MODULE = nn.Embedding + + def __init__( + self, + num_embeddings, + embedding_dim, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + _weight=None, + device=None, + dtype=None, + qconfig=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_embeddings, + embedding_dim, + padding_idx, + max_norm, + norm_type, + scale_grad_by_freq, + sparse, + _weight, + # pyrefly: ignore [bad-argument-type] + **factory_kwargs, + ) + assert qconfig, "qconfig must be provided for QAT module" + assert qconfig.weight().qscheme == torch.per_channel_affine_float_qparams, ( + "Embedding weights requires a qscheme of torch.per_channel_affine_float_qparams Got " + + str(qconfig.weight().qscheme) + ) + self.qconfig = qconfig + self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs) + + def forward(self, input) -> Tensor: + return F.embedding( + input, + self.weight_fake_quant(self.weight), + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a qat module from a float module + + Args: `mod` a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + assert type(mod) is cls._FLOAT_MODULE, ( + " qat." 
+ + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + weight_qscheme = mod.qconfig.weight().qscheme # type: ignore[union-attr, operator] + assert weight_qscheme == torch.per_channel_affine_float_qparams, ( + "Embedding weights requires a qscheme of torch.per_channel_affine_float_qparams Got " + + str(weight_qscheme) + ) + + qconfig = mod.qconfig + qat_embedding_bag = cls( + mod.num_embeddings, + mod.embedding_dim, + mod.padding_idx, + mod.max_norm, + mod.norm_type, + mod.scale_grad_by_freq, + mod.sparse, + mod.weight, + qconfig=qconfig, + ) + + return qat_embedding_bag + + def to_float(self): + embedding_bag = torch.nn.Embedding( + self.num_embeddings, + self.embedding_dim, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + None, + ) + embedding_bag.weight = torch.nn.Parameter(self.weight.detach()) + embedding_bag.train(self.training) + return embedding_bag + + +class EmbeddingBag(nn.EmbeddingBag): + r""" + An embedding bag module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.EmbeddingBag`, please see + https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag + for documentation. + + Similar to `torch.nn.EmbeddingBag`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight: fake quant module for weight + """ + + _FLOAT_MODULE = nn.EmbeddingBag + + def __init__( + self, + num_embeddings, + embedding_dim, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + mode="mean", + sparse=False, + _weight=None, + include_last_offset=False, + padding_idx=None, + qconfig=None, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_embeddings, + embedding_dim, + max_norm, + norm_type, + scale_grad_by_freq, + mode, + sparse, + _weight, + include_last_offset, + padding_idx, + **factory_kwargs, + ) + assert qconfig, "qconfig must be provided for QAT module" + assert qconfig.weight().qscheme == torch.per_channel_affine_float_qparams, ( + "Embedding Bag weights requires a qscheme of torch.per_channel_affine_float_qparams Got " + + str(qconfig.weight().qscheme) + ) + self.qconfig = qconfig + self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs) + + def forward(self, input, offsets=None, per_sample_weights=None) -> Tensor: + return F.embedding_bag( + input, + self.weight_fake_quant(self.weight), + offsets, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + per_sample_weights, + self.include_last_offset, + self.padding_idx, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a qat module from a float module + + Args: `mod` a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + assert type(mod) is cls._FLOAT_MODULE, ( + " qat." 
+ + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + weight_qscheme = mod.qconfig.weight().qscheme # type: ignore[union-attr, operator] + assert weight_qscheme == torch.per_channel_affine_float_qparams, ( + "Embedding Bag weights requires a qscheme of torch.per_channel_affine_float_qparams Got " + + str(weight_qscheme) + ) + + qconfig = mod.qconfig + qat_embedding_bag = cls( + mod.num_embeddings, + mod.embedding_dim, + mod.max_norm, + mod.norm_type, + mod.scale_grad_by_freq, + mod.mode, + mod.sparse, + mod.weight, + mod.include_last_offset, + mod.padding_idx, + qconfig=qconfig, + ) + + return qat_embedding_bag + + def to_float(self): + embedding_bag = torch.nn.EmbeddingBag( + self.num_embeddings, + self.embedding_dim, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + None, + self.include_last_offset, + self.padding_idx, + ) + embedding_bag.weight = torch.nn.Parameter(self.weight.detach()) + embedding_bag.train(self.training) + return embedding_bag diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..5edf16ed3ea53d0323eda248b95703d5245b1786 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/linear.py @@ -0,0 +1,97 @@ +# mypy: allow-untyped-defs +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.ao.nn.intrinsic import LinearReLU +from torch.nn.utils.parametrize import ( + is_parametrized, + transfer_parametrizations_and_params, + type_before_parametrizations, +) + + +__all__ = ["Linear"] + + +class Linear(nn.Linear): + r""" + A linear module attached 
with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.Linear`, please see + https://pytorch.org/docs/stable/nn.html#torch.nn.Linear + for documentation. + + Similar to `torch.nn.Linear`, with FakeQuantize modules initialized to + default. + + Attributes: + weight: fake quant module for weight + """ + + _FLOAT_MODULE = nn.Linear + + def __init__( + self, + in_features, + out_features, + bias=True, + qconfig=None, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(in_features, out_features, bias, **factory_kwargs) + assert qconfig, "qconfig must be provided for QAT module" + self.qconfig = qconfig + self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs) + + def forward(self, input): + return F.linear(input, self.weight_fake_quant(self.weight), self.bias) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a qat module from a float module or qparams_dict + Args: `mod` a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + assert type_before_parametrizations(mod) == cls._FLOAT_MODULE, ( + " qat." 
+ + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + if type_before_parametrizations(mod) == LinearReLU: + mod = mod[0] + + qconfig = mod.qconfig + qat_linear = cls( + mod.in_features, + mod.out_features, + bias=mod.bias is not None, + qconfig=qconfig, + ) + + if is_parametrized(mod, "weight"): + transfer_parametrizations_and_params(mod, qat_linear, "weight") + else: + qat_linear.weight = mod.weight + + if is_parametrized(mod, "bias"): + transfer_parametrizations_and_params(mod, qat_linear, "bias") + else: + qat_linear.bias = mod.bias + + return qat_linear + + def to_float(self): + linear = torch.nn.Linear( + self.in_features, self.out_features, self.bias is not None + ) + linear.weight = torch.nn.Parameter(self.weight.detach()) + if self.bias is not None: + linear.bias = torch.nn.Parameter(self.bias.detach()) + linear.train(self.training) + return linear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c08593a717f43bc70cda9fe4596e55ddd2d7204e Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..221107660158171ada5d1823cc193666c9e152e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__init__.py @@ -0,0 +1,9 @@ +from .activation import MultiheadAttention +from .rnn import LSTM, LSTMCell + + +__all__ = [ + "LSTM", + "LSTMCell", + "MultiheadAttention", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbe78331a171b4402511b4389f9ff5b90d79775b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..217f3dff3cbe037a14783a2a4bf720d5b2528bba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..975d8bfe0b6a84e9c737d3cb572ff36b60ec0352 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/activation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..d808d50c366c68b8aa0d61a50b9f6db2d72c9ff2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/activation.py @@ -0,0 +1,579 @@ +# mypy: allow-untyped-defs +import warnings + +import torch +import torch.jit # this is needed to avoid a circular import +import torch.nn.functional as F +from torch import nn, Tensor + + +__all__ = ["MultiheadAttention"] + + +class MultiheadAttention(nn.MultiheadAttention): + _FLOAT_MODULE = nn.MultiheadAttention + + r"""Quantizable implementation of the MultiheadAttention. + + Note:: + Please, refer to :class:`~torch.nn.MultiheadAttention` for more + information + + Allows the model to jointly attend to information from different + representation subspaces. + See reference: Attention Is All You Need + + The original MHA module is not quantizable. + This reimplements it by explicitly instantiating the linear layers. + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + + Args: + embed_dim: total dimension of the model. + num_heads: parallel attention heads. + dropout: a Dropout layer on attn_output_weights. Default: 0.0. + bias: add bias as module parameter. Default: True. 
+ add_bias_kv: add bias to the key and value sequences at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + kdim: total number of features in key. Default: None. + vdim: total number of features in value. Default: None. + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + + Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set + to :attr:`embed_dim` such that query, key, and value have the same + number of features. + + Examples:: + + >>> import torch.ao.nn.quantizable as nnqa + >>> multihead_attn = nnqa.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + + Note:: + Please, follow the quantization flow to convert the quantizable MHA. + """ + __constants__ = ["batch_first"] + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + kdim: int | None = None, + vdim: int | None = None, + batch_first: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + embed_dim, + num_heads, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kdim, + vdim, + batch_first, + **factory_kwargs, + ) + self.linear_Q = nn.Linear( + self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs + ) + self.linear_K = nn.Linear( + self.kdim, self.embed_dim, bias=bias, **factory_kwargs + ) + self.linear_V = nn.Linear( + self.vdim, self.embed_dim, bias=bias, **factory_kwargs + ) + # for the type: ignore, see https://github.com/pytorch/pytorch/issues/58969 + self.out_proj = nn.Linear( + self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs + ) # type: ignore[assignment] + + # Functionals + self.q_scaling_product = torch.ao.nn.quantized.FloatFunctional() + # note: importing 
torch.ao.nn.quantized at top creates a circular import + + # Quant/Dequant + self.quant_attn_output = torch.ao.quantization.QuantStub() + self.quant_attn_output_weights = torch.ao.quantization.QuantStub() + self.dequant_q = torch.ao.quantization.DeQuantStub() + self.dequant_k = torch.ao.quantization.DeQuantStub() + self.dequant_v = torch.ao.quantization.DeQuantStub() + + def _get_name(self): + return "QuantizableMultiheadAttention" + + @classmethod + def from_float(cls, other): + assert type(other) is cls._FLOAT_MODULE + assert hasattr(other, "qconfig"), "The float module must have 'qconfig'" + # Setting the dropout to 0.0! + observed = cls( + other.embed_dim, + other.num_heads, + other.dropout, + (other.in_proj_bias is not None), + (other.bias_k is not None), + other.add_zero_attn, + other.kdim, + other.vdim, + other.batch_first, + ) + observed.bias_k = other.bias_k + observed.bias_v = other.bias_v + observed.qconfig = other.qconfig + + # Set the linear weights + # for the type: ignores, see https://github.com/pytorch/pytorch/issues/58969 + observed.out_proj.weight = other.out_proj.weight + observed.out_proj.bias = other.out_proj.bias + if other._qkv_same_embed_dim: + # Use separate params + bias = other.in_proj_bias + _start = 0 + _end = _start + other.embed_dim + weight = other.in_proj_weight[_start:_end, :] + if bias is not None: + bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad) + observed.linear_Q.weight = torch.nn.Parameter(weight, weight.requires_grad) + observed.linear_Q.bias = bias + + bias = other.in_proj_bias + _start = _end + _end = _start + other.embed_dim + weight = other.in_proj_weight[_start:_end, :] + if bias is not None: + bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad) + observed.linear_K.weight = torch.nn.Parameter(weight, weight.requires_grad) + observed.linear_K.bias = bias + + bias = other.in_proj_bias + _start = _end + weight = other.in_proj_weight[_start:, :] + if bias is not None: + bias = 
torch.nn.Parameter(bias[_start:], bias.requires_grad) + observed.linear_V.weight = torch.nn.Parameter(weight, weight.requires_grad) + observed.linear_V.bias = bias + else: + observed.linear_Q.weight = nn.Parameter(other.q_proj_weight) + observed.linear_K.weight = nn.Parameter(other.k_proj_weight) + observed.linear_V.weight = nn.Parameter(other.v_proj_weight) + if other.in_proj_bias is None: + # pyrefly: ignore [bad-assignment] + observed.linear_Q.bias = None + # pyrefly: ignore [bad-assignment] + observed.linear_K.bias = None + # pyrefly: ignore [bad-assignment] + observed.linear_V.bias = None + else: + observed.linear_Q.bias = nn.Parameter( + other.in_proj_bias[0 : other.embed_dim] + ) + observed.linear_K.bias = nn.Parameter( + other.in_proj_bias[other.embed_dim : (other.embed_dim * 2)] + ) + observed.linear_V.bias = nn.Parameter( + other.in_proj_bias[(other.embed_dim * 2) :] + ) + observed.eval() + # Explicit prepare + observed = torch.ao.quantization.prepare(observed, inplace=True) + return observed + + @torch.jit.unused + def dequantize(self): + r"""Utility to convert the quantized MHA back to float. + + The motivation for this is that it is not trivial to convert the weights + from the format that is used in the quantized version back to the + float. + """ + fp = self._FLOAT_MODULE( + self.embed_dim, + self.num_heads, + self.dropout, + (self.linear_Q._weight_bias()[1] is not None), # type: ignore[operator] + (self.bias_k is not None), + self.add_zero_attn, + self.kdim, + self.vdim, + self.batch_first, + ) + assert fp._qkv_same_embed_dim == self._qkv_same_embed_dim + if self.bias_k is not None: + fp.bias_k = nn.Parameter(self.bias_k.dequantize()) + if self.bias_v is not None: + fp.bias_v = nn.Parameter(self.bias_v.dequantize()) + + # Set the linear weights + # Note: Because the linear layers are quantized, mypy does not know how + # to deal with them -- might need to ignore the typing checks. 
+ # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969 + w, b = self.out_proj._weight_bias() # type: ignore[operator, has-type] + fp.out_proj.weight = nn.Parameter(w.dequantize()) + if b is not None: + fp.out_proj.bias = nn.Parameter(b) + + wQ, bQ = self.linear_Q._weight_bias() # type: ignore[operator] + wQ = wQ.dequantize() + wK, bK = self.linear_K._weight_bias() # type: ignore[operator] + wK = wK.dequantize() + wV, bV = self.linear_V._weight_bias() # type: ignore[operator] + wV = wV.dequantize() + if fp._qkv_same_embed_dim: + # Use separate params + _start = 0 + _end = _start + fp.embed_dim + fp.in_proj_weight[_start:_end, :] = wQ + if fp.in_proj_bias is not None: + # pyrefly: ignore [bad-argument-type] + assert all(bQ == 0) + fp.in_proj_bias[_start:_end] = bQ + + _start = _end + _end = _start + fp.embed_dim + fp.in_proj_weight[_start:_end, :] = wK + if fp.in_proj_bias is not None: + # pyrefly: ignore [bad-argument-type] + assert all(bK == 0) + fp.in_proj_bias[_start:_end] = bK + + _start = _end + fp.in_proj_weight[_start:, :] = wV + if fp.in_proj_bias is not None: + # pyrefly: ignore [bad-argument-type] + assert all(bV == 0) + fp.in_proj_bias[_start:] = bV + else: + fp.q_proj_weight = nn.Parameter(wQ) + fp.k_proj_weight = nn.Parameter(wK) + fp.v_proj_weight = nn.Parameter(wV) + if fp.in_proj_bias is None: + # pyrefly: ignore [bad-assignment] + self.linear_Q.bias = None + # pyrefly: ignore [bad-assignment] + self.linear_K.bias = None + # pyrefly: ignore [bad-assignment] + self.linear_V.bias = None + else: + fp.in_proj_bias[0 : fp.embed_dim] = bQ + fp.in_proj_bias[fp.embed_dim : (fp.embed_dim * 2)] = bK + fp.in_proj_bias[(fp.embed_dim * 2) :] = bV + + return fp + + @classmethod + def from_observed(cls, other): + # The whole flow is float -> observed -> quantized + # This class does float -> observed only + # See nn.quantized.MultiheadAttention + raise NotImplementedError( + "It looks like you are trying to prepare an " + "MHA 
module. Please, see " + "the examples on quantizable MHAs." + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Tensor | None = None, + need_weights: bool = True, + attn_mask: Tensor | None = None, + average_attn_weights: bool = True, + is_causal: bool = False, + ) -> tuple[Tensor, Tensor | None]: + r""" + Note:: + Please, refer to :func:`~torch.nn.MultiheadAttention.forward` for more + information + + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. When given a binary mask and a value is True, + the corresponding value on the attention layer will be ignored. + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + + Shape: + - Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``. + - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. 
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked + positions. If a BoolTensor is provided, positions with ``True`` + is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + - is_causal: If specified, applies a causal mask as attention mask. Mutually exclusive with providing attn_mask. + Default: ``False``. + - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across + heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an + effect when ``need_weights=True.``. Default: True (i.e. average weights across heads) + + - Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``. + - attn_output_weights: If ``average_attn_weights=True``, returns attention weights averaged + across heads of shape :math:`(N, L, S)`, where N is the batch size, L is the target sequence length, + S is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(N, num_heads, L, S)`. + """ + return self._forward_impl( + query, + key, + value, + key_padding_mask, + need_weights, + attn_mask, + average_attn_weights, + is_causal, + ) + + def _forward_impl( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Tensor | None = None, + need_weights: bool = True, + attn_mask: Tensor | None = None, + average_attn_weights: bool = True, + is_causal: bool = False, + ) -> tuple[Tensor, Tensor | None]: + # This version will not deal with the static key/value pairs. + # Keeping it here for future changes. 
+ # + # TODO: This method has some duplicate lines with the + # `torch.nn.functional.multi_head_attention`. Will need to refactor. + static_k = None + static_v = None + + if attn_mask is not None and is_causal: + raise AssertionError("Only allow causal mask or attn_mask") + + if is_causal: + raise AssertionError("causal mask not supported by AO MHA module") + + if self.batch_first: + query, key, value = (x.transpose(0, 1) for x in (query, key, value)) + + tgt_len, bsz, embed_dim_to_check = query.size() + assert self.embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = self.embed_dim // self.num_heads + assert head_dim * self.num_heads == self.embed_dim, ( + "embed_dim must be divisible by num_heads" + ) + scaling = float(head_dim) ** -0.5 + + q = self.linear_Q(query) + k = self.linear_K(key) + v = self.linear_V(value) + + q = self.q_scaling_product.mul_scalar(q, scaling) + + if attn_mask is not None: + if attn_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for `attn_mask` in `nn.MultiheadAttention` is deprecated. " + "Use bool tensor instead.", + stacklevel=3, + ) + attn_mask = attn_mask.to(torch.bool) + assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, ( + f"Only float and bool types are supported for attn_mask, not {attn_mask.dtype}" + ) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError("The size of the 2D attn_mask is not correct.") + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * self.num_heads, + query.size(0), + key.size(0), + ]: + raise RuntimeError("The size of the 3D attn_mask is not correct.") + else: + raise RuntimeError( + f"attn_mask's dimension {attn_mask.dim()} is not supported" + ) + # attn_mask's dim is 3 now. 
+ + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for `key_padding_mask` in `nn.MultiheadAttention` is deprecated. " + "Use bool tensor instead.", + stacklevel=3, + ) + key_padding_mask = key_padding_mask.to(torch.bool) + if self.bias_k is not None and self.bias_v is not None: + if static_k is None and static_v is None: + # Explicitly assert that bias_k and bias_v are not None + # in a way that TorchScript can understand. + bias_k = self.bias_k + assert bias_k is not None + bias_v = self.bias_v + assert bias_v is not None + + k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = F.pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, 1)) + else: + assert static_k is None, "bias cannot be added to static key." + assert static_v is None, "bias cannot be added to static value." 
+ else: + assert self.bias_k is None + assert self.bias_v is None + + q = q.contiguous().view(tgt_len, bsz * self.num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + + if static_k is not None: + assert static_k.size(0) == bsz * self.num_heads + assert static_k.size(2) == head_dim + k = static_k + + if static_v is not None: + assert static_v.size(0) == bsz * self.num_heads + assert static_v.size(2) == head_dim + v = static_v + + # pyrefly: ignore [missing-attribute] + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if self.add_zero_attn: + src_len += 1 + # pyrefly: ignore [missing-attribute] + k_zeros = torch.zeros((k.size(0), 1) + k.size()[2:]) + # pyrefly: ignore [missing-attribute] + if k.is_quantized: + k_zeros = torch.quantize_per_tensor( + k_zeros, + # pyrefly: ignore [missing-attribute] + k.q_scale(), + # pyrefly: ignore [missing-attribute] + k.q_zero_point(), + # pyrefly: ignore [missing-attribute] + k.dtype, + ) + # pyrefly: ignore [no-matching-overload] + k = torch.cat([k, k_zeros], dim=1) + # pyrefly: ignore [missing-attribute] + v_zeros = torch.zeros((v.size(0), 1) + k.size()[2:]) + # pyrefly: ignore [missing-attribute] + if v.is_quantized: + v_zeros = torch.quantize_per_tensor( + v_zeros, + # pyrefly: ignore [missing-attribute] + v.q_scale(), + # pyrefly: ignore [missing-attribute] + v.q_zero_point(), + # pyrefly: ignore [missing-attribute] + v.dtype, + ) + # pyrefly: ignore [no-matching-overload] + v = torch.cat([v, v_zeros], dim=1) + + if attn_mask is not None: + attn_mask = F.pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, 1)) + + # Leaving the quantized zone here + q = self.dequant_q(q) + k = self.dequant_k(k) + v 
= self.dequant_v(v) + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [ + bsz * self.num_heads, + tgt_len, + src_len, + ] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float("-inf")) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float("-inf"), + ) + attn_output_weights = attn_output_weights.view( + bsz * self.num_heads, tgt_len, src_len + ) + + attn_output_weights = F.softmax(attn_output_weights, dim=-1) + attn_output_weights = F.dropout( + attn_output_weights, p=self.dropout, training=self.training + ) + + attn_output = torch.bmm(attn_output_weights, v) + assert list(attn_output.size()) == [bsz * self.num_heads, tgt_len, head_dim] + if self.batch_first: + attn_output = attn_output.view(bsz, tgt_len, self.embed_dim) + else: + attn_output = ( + attn_output.transpose(0, 1) + .contiguous() + .view(tgt_len, bsz, self.embed_dim) + ) + + # Reentering the quantized zone + attn_output = self.quant_attn_output(attn_output) + # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969 + attn_output = self.out_proj(attn_output) # type: ignore[has-type] + attn_output_weights = self.quant_attn_output_weights(attn_output_weights) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + if average_attn_weights: + attn_output_weights = attn_output_weights.mean(dim=1) + return attn_output, attn_output_weights + else: + return attn_output, None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/rnn.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..74e4bd902d1565360f72a5c4098b6e6d1590a146 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/rnn.py @@ -0,0 +1,604 @@ +""" +We will recreate all the RNN modules as we require the modules to be decomposed +into its building blocks to be able to observe. +""" + +# mypy: allow-untyped-defs + +import numbers +import warnings + +import torch +from torch import Tensor + + +__all__ = ["LSTMCell", "LSTM"] + + +class LSTMCell(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM) cell. + + For the description and the argument types, please, refer to :class:`~torch.nn.LSTMCell` + + `split_gates`: specify True to compute the input/forget/cell/output gates separately + to avoid an intermediate tensor which is subsequently chunk'd. This optimization can + be beneficial for on-device inference latency. This flag is cascaded down from the + parent classes. + + Examples:: + + >>> import torch.ao.nn.quantizable as nnqa + >>> rnn = nnqa.LSTMCell(10, 20) + >>> input = torch.randn(6, 10) + >>> hx = torch.randn(3, 20) + >>> cx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + ... hx, cx = rnn(input[i], (hx, cx)) + ... 
output.append(hx) + """ + + _FLOAT_MODULE = torch.nn.LSTMCell + __constants__ = ["split_gates"] # for jit.script + + def __init__( + self, + input_dim: int, + hidden_dim: int, + bias: bool = True, + device=None, + dtype=None, + *, + split_gates=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.input_size = input_dim + self.hidden_size = hidden_dim + self.bias = bias + self.split_gates = split_gates + + if not split_gates: + self.igates: torch.nn.Module = torch.nn.Linear( + input_dim, 4 * hidden_dim, bias=bias, **factory_kwargs + ) + self.hgates: torch.nn.Module = torch.nn.Linear( + hidden_dim, 4 * hidden_dim, bias=bias, **factory_kwargs + ) + self.gates: torch.nn.Module = torch.ao.nn.quantized.FloatFunctional() + else: + # keep separate Linear layers for each gate + self.igates = torch.nn.ModuleDict() + self.hgates = torch.nn.ModuleDict() + self.gates = torch.nn.ModuleDict() + for g in ["input", "forget", "cell", "output"]: + # pyre-fixme[29]: `Union[torch._tensor.Tensor, torch.nn.modules.module.Module]` + self.igates[g] = torch.nn.Linear( + input_dim, hidden_dim, bias=bias, **factory_kwargs + ) + # pyre-fixme[29]: `Union[torch._tensor.Tensor, torch.nn.modules.module.Module]` + self.hgates[g] = torch.nn.Linear( + hidden_dim, hidden_dim, bias=bias, **factory_kwargs + ) + # pyre-fixme[29]: `Union[torch._tensor.Tensor, torch.nn.modules.module.Module]` + self.gates[g] = torch.ao.nn.quantized.FloatFunctional() + + self.input_gate = torch.nn.Sigmoid() + self.forget_gate = torch.nn.Sigmoid() + self.cell_gate = torch.nn.Tanh() + self.output_gate = torch.nn.Sigmoid() + + self.fgate_cx = torch.ao.nn.quantized.FloatFunctional() + self.igate_cgate = torch.ao.nn.quantized.FloatFunctional() + self.fgate_cx_igate_cgate = torch.ao.nn.quantized.FloatFunctional() + + self.ogate_cy = torch.ao.nn.quantized.FloatFunctional() + + self.initial_hidden_state_qparams: tuple[float, int] = (1.0, 0) + self.initial_cell_state_qparams: 
tuple[float, int] = (1.0, 0) + self.hidden_state_dtype: torch.dtype = torch.quint8 + self.cell_state_dtype: torch.dtype = torch.quint8 + + def forward( + self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None + ) -> tuple[Tensor, Tensor]: + if hidden is None or hidden[0] is None or hidden[1] is None: + hidden = self.initialize_hidden(x.shape[0], x.is_quantized) + hx, cx = hidden + + if not self.split_gates: + igates = self.igates(x) + hgates = self.hgates(hx) + gates = self.gates.add(igates, hgates) # type: ignore[operator] + + input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1) + + input_gate = self.input_gate(input_gate) + forget_gate = self.forget_gate(forget_gate) + cell_gate = self.cell_gate(cell_gate) + out_gate = self.output_gate(out_gate) + else: + # apply each input + hidden projection and add together + gate = {} + for (key, gates), igates, hgates in zip( + self.gates.items(), # type: ignore[operator] + self.igates.values(), # type: ignore[operator] + self.hgates.values(), # type: ignore[operator] + ): + gate[key] = gates.add(igates(x), hgates(hx)) + + input_gate = self.input_gate(gate["input"]) + forget_gate = self.forget_gate(gate["forget"]) + cell_gate = self.cell_gate(gate["cell"]) + out_gate = self.output_gate(gate["output"]) + + fgate_cx = self.fgate_cx.mul(forget_gate, cx) + igate_cgate = self.igate_cgate.mul(input_gate, cell_gate) + fgate_cx_igate_cgate = self.fgate_cx_igate_cgate.add(fgate_cx, igate_cgate) + cy = fgate_cx_igate_cgate + + # TODO: make this tanh a member of the module so its qparams can be configured + tanh_cy = torch.tanh(cy) + hy = self.ogate_cy.mul(out_gate, tanh_cy) + return hy, cy + + def initialize_hidden( + self, batch_size: int, is_quantized: bool = False + ) -> tuple[Tensor, Tensor]: + h, c = ( + torch.zeros((batch_size, self.hidden_size)), + torch.zeros((batch_size, self.hidden_size)), + ) + if is_quantized: + (h_scale, h_zp) = self.initial_hidden_state_qparams + (c_scale, c_zp) = 
self.initial_cell_state_qparams + h = torch.quantize_per_tensor( + h, scale=h_scale, zero_point=h_zp, dtype=self.hidden_state_dtype + ) + c = torch.quantize_per_tensor( + c, scale=c_scale, zero_point=c_zp, dtype=self.cell_state_dtype + ) + return h, c + + def _get_name(self): + return "QuantizableLSTMCell" + + @classmethod + def from_params(cls, wi, wh, bi=None, bh=None, split_gates=False): + """Uses the weights and biases to create a new LSTM cell. + + Args: + wi, wh: Weights for the input and hidden layers + bi, bh: Biases for the input and hidden layers + """ + assert (bi is None) == (bh is None) # Either both None or both have values + input_size = wi.shape[1] + hidden_size = wh.shape[1] + cell = cls( + input_dim=input_size, + hidden_dim=hidden_size, + bias=(bi is not None), + split_gates=split_gates, + ) + + if not split_gates: + cell.igates.weight = torch.nn.Parameter(wi) + if bi is not None: + cell.igates.bias = torch.nn.Parameter(bi) + cell.hgates.weight = torch.nn.Parameter(wh) + if bh is not None: + cell.hgates.bias = torch.nn.Parameter(bh) + else: + # split weight/bias + for w, b, gates in zip([wi, wh], [bi, bh], [cell.igates, cell.hgates]): + for w_chunk, gate in zip(w.chunk(4, dim=0), gates.values()): # type: ignore[operator] + gate.weight = torch.nn.Parameter(w_chunk) + + if b is not None: + for b_chunk, gate in zip(b.chunk(4, dim=0), gates.values()): # type: ignore[operator] + gate.bias = torch.nn.Parameter(b_chunk) + + return cell + + @classmethod + def from_float(cls, other, use_precomputed_fake_quant=False, split_gates=False): + assert type(other) is cls._FLOAT_MODULE + assert hasattr(other, "qconfig"), "The float module must have 'qconfig'" + observed = cls.from_params( + other.weight_ih, + other.weight_hh, + other.bias_ih, + other.bias_hh, + split_gates=split_gates, + ) + observed.qconfig = other.qconfig + observed.igates.qconfig = other.qconfig + observed.hgates.qconfig = other.qconfig + if split_gates: + # also apply qconfig directly to Linear 
modules + for g in observed.igates.values(): + g.qconfig = other.qconfig + for g in observed.hgates.values(): + g.qconfig = other.qconfig + return observed + + +class _LSTMSingleLayer(torch.nn.Module): + r"""A single one-directional LSTM layer. + + The difference between a layer and a cell is that the layer can process a + sequence, while the cell only expects an instantaneous value. + """ + + def __init__( + self, + input_dim: int, + hidden_dim: int, + bias: bool = True, + device=None, + dtype=None, + *, + split_gates=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.cell = LSTMCell( + input_dim, hidden_dim, bias=bias, split_gates=split_gates, **factory_kwargs + ) + + def forward(self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None): + result = [] + seq_len = x.shape[0] + for i in range(seq_len): + hidden = self.cell(x[i], hidden) + result.append(hidden[0]) # type: ignore[index] + result_tensor = torch.stack(result, 0) + return result_tensor, hidden + + @classmethod + def from_params(cls, *args, **kwargs): + cell = LSTMCell.from_params(*args, **kwargs) + layer = cls( + cell.input_size, cell.hidden_size, cell.bias, split_gates=cell.split_gates + ) + layer.cell = cell + return layer + + +class _LSTMLayer(torch.nn.Module): + r"""A single bi-directional LSTM layer.""" + + def __init__( + self, + input_dim: int, + hidden_dim: int, + bias: bool = True, + batch_first: bool = False, + bidirectional: bool = False, + device=None, + dtype=None, + *, + split_gates=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.batch_first = batch_first + self.bidirectional = bidirectional + self.layer_fw = _LSTMSingleLayer( + input_dim, hidden_dim, bias=bias, split_gates=split_gates, **factory_kwargs + ) + if self.bidirectional: + self.layer_bw = _LSTMSingleLayer( + input_dim, + hidden_dim, + bias=bias, + split_gates=split_gates, + **factory_kwargs, + ) + + def forward(self, x: 
Tensor, hidden: tuple[Tensor, Tensor] | None = None): + if self.batch_first: + x = x.transpose(0, 1) + if hidden is None: + hx_fw, cx_fw = (None, None) + else: + hx_fw, cx_fw = hidden + hidden_bw: tuple[Tensor, Tensor] | None = None + if self.bidirectional: + if hx_fw is None: + hx_bw = None + else: + hx_bw = hx_fw[1] + hx_fw = hx_fw[0] + if cx_fw is None: + cx_bw = None + else: + cx_bw = cx_fw[1] + cx_fw = cx_fw[0] + if hx_bw is not None and cx_bw is not None: + hidden_bw = hx_bw, cx_bw + if hx_fw is None and cx_fw is None: + hidden_fw = None + else: + hidden_fw = ( + torch.jit._unwrap_optional(hx_fw), + torch.jit._unwrap_optional(cx_fw), + ) + result_fw, hidden_fw = self.layer_fw(x, hidden_fw) + + if hasattr(self, "layer_bw") and self.bidirectional: + x_reversed = x.flip(0) + result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw) + result_bw = result_bw.flip(0) + + result = torch.cat([result_fw, result_bw], result_fw.dim() - 1) + if hidden_fw is None and hidden_bw is None: + h = None + c = None + elif hidden_fw is None: + (h, c) = torch.jit._unwrap_optional(hidden_bw) + elif hidden_bw is None: + (h, c) = torch.jit._unwrap_optional(hidden_fw) + else: + h = torch.stack([hidden_fw[0], hidden_bw[0]], 0) # type: ignore[list-item] + c = torch.stack([hidden_fw[1], hidden_bw[1]], 0) # type: ignore[list-item] + else: + result = result_fw + h, c = torch.jit._unwrap_optional(hidden_fw) # type: ignore[assignment] + + if self.batch_first: + result.transpose_(0, 1) + + return result, (h, c) + + @classmethod + def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs): + r""" + There is no FP equivalent of this class. This function is here just to + mimic the behavior of the `prepare` within the `torch.ao.quantization` + flow. 
+ """ + assert hasattr(other, "qconfig") or (qconfig is not None) + + input_size = kwargs.get("input_size", other.input_size) + hidden_size = kwargs.get("hidden_size", other.hidden_size) + bias = kwargs.get("bias", other.bias) + batch_first = kwargs.get("batch_first", other.batch_first) + bidirectional = kwargs.get("bidirectional", other.bidirectional) + split_gates = kwargs.get("split_gates", False) + + layer = cls( + input_size, + hidden_size, + bias, + batch_first, + bidirectional, + split_gates=split_gates, + ) + # pyrefly: ignore [bad-argument-type] + layer.qconfig = getattr(other, "qconfig", qconfig) + wi = getattr(other, f"weight_ih_l{layer_idx}") + wh = getattr(other, f"weight_hh_l{layer_idx}") + bi = getattr(other, f"bias_ih_l{layer_idx}", None) + bh = getattr(other, f"bias_hh_l{layer_idx}", None) + + layer.layer_fw = _LSTMSingleLayer.from_params( + wi, wh, bi, bh, split_gates=split_gates + ) + + if other.bidirectional: + wi = getattr(other, f"weight_ih_l{layer_idx}_reverse") + wh = getattr(other, f"weight_hh_l{layer_idx}_reverse") + bi = getattr(other, f"bias_ih_l{layer_idx}_reverse", None) + bh = getattr(other, f"bias_hh_l{layer_idx}_reverse", None) + layer.layer_bw = _LSTMSingleLayer.from_params( + wi, wh, bi, bh, split_gates=split_gates + ) + return layer + + +class LSTM(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM). + + For the description and the argument types, please, refer to :class:`~torch.nn.LSTM` + + Attributes: + layers : instances of the `_LSTMLayer` + + .. note:: + To access the weights and biases, you need to access them per layer. + See examples below. 
+ + Examples:: + + >>> import torch.ao.nn.quantizable as nnqa + >>> rnn = nnqa.LSTM(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> c0 = torch.randn(2, 3, 20) + >>> output, (hn, cn) = rnn(input, (h0, c0)) + >>> # To get the weights: + >>> # xdoctest: +SKIP + >>> print(rnn.layers[0].weight_ih) + tensor([[...]]) + >>> print(rnn.layers[0].weight_hh) + AssertionError: There is no reverse path in the non-bidirectional layer + """ + + _FLOAT_MODULE = torch.nn.LSTM + + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = False, + dropout: float = 0.0, + bidirectional: bool = False, + device=None, + dtype=None, + *, + split_gates: bool = False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.bidirectional = bidirectional + self.training = False # Default to eval mode. If we want to train, we will explicitly set to training. + + if ( + not isinstance(dropout, numbers.Number) + # pyrefly: ignore [unsupported-operation] + or not 0 <= dropout <= 1 + or isinstance(dropout, bool) + ): + raise ValueError( + "dropout should be a number in range [0, 1] " + "representing the probability of an element being " + "zeroed" + ) + # pyrefly: ignore [unsupported-operation] + if dropout > 0: + warnings.warn( + "dropout option for quantizable LSTM is ignored. 
" + "If you are training, please, use nn.LSTM version " + "followed by `prepare` step.", + stacklevel=2, + ) + if num_layers == 1: + warnings.warn( + "dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + f"num_layers greater than 1, but got dropout={dropout} " + f"and num_layers={num_layers}", + stacklevel=2, + ) + + layers = [ + _LSTMLayer( + self.input_size, + self.hidden_size, + self.bias, + batch_first=False, + bidirectional=self.bidirectional, + split_gates=split_gates, + **factory_kwargs, + ) + ] + layers.extend( + _LSTMLayer( + self.hidden_size, + self.hidden_size, + self.bias, + batch_first=False, + bidirectional=self.bidirectional, + split_gates=split_gates, + **factory_kwargs, + ) + for _ in range(1, num_layers) + ) + self.layers = torch.nn.ModuleList(layers) + + def forward(self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None): + if self.batch_first: + x = x.transpose(0, 1) + + max_batch_size = x.size(1) + num_directions = 2 if self.bidirectional else 1 + if hidden is None: + zeros = torch.zeros( + num_directions, + max_batch_size, + self.hidden_size, + dtype=torch.float, + device=x.device, + ) + zeros.squeeze_(0) + if x.is_quantized: + zeros = torch.quantize_per_tensor( + zeros, scale=1.0, zero_point=0, dtype=x.dtype + ) + hxcx = [(zeros, zeros) for _ in range(self.num_layers)] + else: + hidden_non_opt = torch.jit._unwrap_optional(hidden) + if isinstance(hidden_non_opt[0], Tensor): + hx = hidden_non_opt[0].reshape( + self.num_layers, num_directions, max_batch_size, self.hidden_size + ) + cx = hidden_non_opt[1].reshape( + self.num_layers, num_directions, max_batch_size, self.hidden_size + ) + hxcx = [ + (hx[idx].squeeze(0), cx[idx].squeeze(0)) + for idx in range(self.num_layers) + ] + else: + hxcx = hidden_non_opt + + hx_list = [] + cx_list = [] + for idx, layer in enumerate(self.layers): + x, (h, c) = layer(x, hxcx[idx]) + hx_list.append(torch.jit._unwrap_optional(h)) + 
cx_list.append(torch.jit._unwrap_optional(c)) + hx_tensor = torch.stack(hx_list) + cx_tensor = torch.stack(cx_list) + + # We are creating another dimension for bidirectional case + # need to collapse it + hx_tensor = hx_tensor.reshape(-1, hx_tensor.shape[-2], hx_tensor.shape[-1]) + cx_tensor = cx_tensor.reshape(-1, cx_tensor.shape[-2], cx_tensor.shape[-1]) + + if self.batch_first: + x = x.transpose(0, 1) + + return x, (hx_tensor, cx_tensor) + + def _get_name(self): + return "QuantizableLSTM" + + @classmethod + def from_float(cls, other, qconfig=None, split_gates=False): + assert isinstance(other, cls._FLOAT_MODULE) + assert hasattr(other, "qconfig") or qconfig + observed = cls( + other.input_size, + other.hidden_size, + other.num_layers, + other.bias, + other.batch_first, + other.dropout, + other.bidirectional, + split_gates=split_gates, + ) + # pyrefly: ignore [bad-argument-type] + observed.qconfig = getattr(other, "qconfig", qconfig) + for idx in range(other.num_layers): + observed.layers[idx] = _LSTMLayer.from_float( + other, idx, qconfig, batch_first=False, split_gates=split_gates + ) + + # Prepare the model + if other.training: + observed.train() + observed = torch.ao.quantization.prepare_qat(observed, inplace=True) + else: + observed.eval() + observed = torch.ao.quantization.prepare(observed, inplace=True) + return observed + + @classmethod + def from_observed(cls, other): + # The whole flow is float -> observed -> quantized + # This class does float -> observed only + raise NotImplementedError( + "It looks like you are trying to convert a " + "non-quantizable LSTM module. Please, see " + "the examples on quantizable LSTMs." 
+ ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..77e97d8595282f3d69963ee129fa473249e3ae29 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__init__.py @@ -0,0 +1,39 @@ +from . import functional +from .modules import * # noqa: F403 +from .modules import MaxPool2d + + +__all__ = [ + "BatchNorm2d", + "BatchNorm3d", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "DeQuantize", + "ELU", + "Embedding", + "EmbeddingBag", + "GroupNorm", + "Hardswish", + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", + "LayerNorm", + "LeakyReLU", + "Linear", + "LSTM", + "MultiheadAttention", + "Quantize", + "ReLU6", + "Sigmoid", + "Softmax", + "Dropout", + "PReLU", + # Wrapper modules + "FloatFunctional", + "FXFloatFunctional", + "QFunctional", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42623256e6253b588ec56788a646b92d194e32cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6c44be65fdad7dad42c21a07331d76c95d7f6b3 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1833c1d585aea7fa67c80b5f6eff38b370243b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..969fd6f121f5ddb72ed2e8e158e3ee7e990cfd0c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py @@ -0,0 +1,26 @@ +from .conv import ( + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, +) +from .linear import Linear +from .rnn import GRU, GRUCell, LSTM, LSTMCell, RNNCell + + +__all__ = [ + "Linear", + "LSTM", + "GRU", + "LSTMCell", + "RNNCell", + "GRUCell", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + 
"ConvTranspose2d", + "ConvTranspose3d", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b622b49c7e96bdface694f6184dbd94f5c3f98ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..958edd5594a24a3bebe7420507c148d67d8af6c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5221799d456a5b460fa0b053332b22e566f2dc85 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-312.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..bbd5adbb7df8c9be323288580935ac25a1ae1ddf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..68c3f6acd093477a44057ade1fb48107709eda89 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py @@ -0,0 +1,530 @@ +# mypy: allow-untyped-defs +r"""Dynamically quantized convolution modules.""" + +import warnings +from typing import ClassVar, Literal + +import torch +import torch.ao.nn.quantized as nnq +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch._ops import ops +from torch.ao.nn.quantized.modules.conv import _reverse_repeat_padding +from torch.nn.common_types import _size_1_t +from torch.nn.modules.utils import _pair, _single, _triple + + +__all__ = [ + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", +] + + +class Conv1d(nnq.Conv1d): + r"""A dynamically quantized conv module with floating point tensors as inputs and outputs. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.Conv1d` and :class:`~torch.ao.nn.quantized.dynamic.Conv1d` and + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + + See :class:`~torch.nn.Conv1d` for other attributes. 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> m = nn.quantized.dynamic.Conv1d(16, 33, 3, stride=2) + >>> input = torch.randn(20, 16, 100) + >>> output = m(input) + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + _NNIQAT_CONV_BN_MODULE: ClassVar[type[nn.Module] | None] = None + _NNI_CONV_RELU_MODULE: ClassVar[type[nn.Module] | None] = None + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + reduce_range=True, + ): + warnings.warn( + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, + ) + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _single(kernel_size) + stride = _single(stride) + # pyrefly: ignore [bad-assignment] + padding = padding if isinstance(padding, str) else _single(padding) + dilation = _single(dilation) + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "DynamicQuantizedConv1d" + + def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor: + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 3: + raise ValueError("Input shape must be `(N, C, L)`!") + if self.padding_mode != "zeros": + # Padding in Conv1d is stored as (p, p), need to get (p,) + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1]) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return ops.quantized.conv1d_dynamic(input, self._packed_params, reduce_range) + + +class 
Conv2d(nnq.Conv2d): + r"""A dynamically quantized conv module with floating point tensors as inputs and outputs. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.Conv2d` and :class:`~torch.ao.nn.quantized.dynamic.Conv2d` and + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + + See :class:`~torch.nn.Conv2d` for other attributes. + + Examples:: + + >>> # xdoctest: +SKIP + >>> # With square kernels and equal stride + >>> m = nn.quantized.dynamic.Conv2d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> # non-square kernels and unequal stride and with padding and dilation + >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + >>> input = torch.randn(20, 16, 50, 100) + >>> output = m(input) + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + _NNIQAT_CONV_BN_MODULE: ClassVar[type[nn.Module] | None] = None + _NNI_CONV_RELU_MODULE: ClassVar[type[nn.Module] | None] = None + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + warnings.warn( + f"The current implementation of the {self._get_name()} module " + "has poor numerical accuracy and its use is not recommended", + stacklevel=2, + ) + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return 
"DynamicQuantizedConv2d" + + def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor: + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return ops.quantized.conv2d_dynamic(input, self._packed_params, reduce_range) + + +class Conv3d(nnq.Conv3d): + r"""A dynamically quantized conv module with floating point tensors as inputs and outputs. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.Conv3d` and :class:`~torch.ao.nn.quantized.dynamic.Conv3d` and + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + + See :class:`~torch.nn.Conv3d` for other attributes. 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> # With square kernels and equal stride + >>> m = nn.quantized.dynamic.Conv3d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2)) + >>> # non-square kernels and unequal stride and with padding and dilation + >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2)) + >>> input = torch.randn(20, 16, 56, 56, 56) + >>> output = m(input) + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + _NNIQAT_CONV_BN_MODULE: ClassVar[type[nn.Module] | None] = None + _NNI_CONV_RELU_MODULE: ClassVar[type[nn.Module] | None] = None + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + warnings.warn( + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, + ) + assert padding_mode != "reflect", "Conv3d does not support reflection padding" + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + super()._init( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _triple(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "DynamicQuantizedConv3d" + + def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor: + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, D, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = 
_reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return ops.quantized.conv3d_dynamic(input, self._packed_params, reduce_range) + + +class ConvTranspose1d(nnq.ConvTranspose1d): + r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose1d`. + + For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv1d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose1d` for other attributes. + + Examples:: + + >>> # xdoctest: +SKIP + >>> # With square kernels and equal stride + >>> m = nndq.ConvTranspose1d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nndq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> output = m(input) + >>> # exact output size can be also specified as an argument + >>> downsample = nndq.Conv1d(16, 16, 3, stride=2, padding=1) + >>> upsample = nndq.ConvTranspose1d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(input) + >>> h.size() + torch.Size([1, 16, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12]) + """ + + _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose1d]] = nn.ConvTranspose1d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + ): + warnings.warn( + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, + ) + factory_kwargs = {"device": device, "dtype": 
dtype} + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + groups, + bias, + dilation, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "DynamicQuantizedConvTranspose1d" + + def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor: + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 3: + raise ValueError("Input shape must be `(N, C, L)`!") + return torch.ops.quantized.conv_transpose1d_dynamic( + input, self._packed_params, reduce_range + ) + + +class ConvTranspose2d(nnq.ConvTranspose2d): + r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose2d`. + + For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv2d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose2d` for other attributes. 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> # With square kernels and equal stride + >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> output = m(input) + >>> # exact output size can be also specified as an argument + >>> downsample = nnq.Conv2d(16, 16, 3, stride=2, padding=1) + >>> upsample = nnq.ConvTranspose2d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(input) + >>> h.size() + torch.Size([1, 16, 6, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12, 12]) + """ + + _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose2d]] = nn.ConvTranspose2d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + ): + warnings.warn( + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, + ) + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + groups, + bias, + dilation, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "DynamicQuantizedConvTranspose2d" + + def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor: + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + return ops.quantized.conv_transpose2d_dynamic( + input, self._packed_params, reduce_range + ) + + +class ConvTranspose3d(nnq.ConvTranspose3d): + r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs. 
+ + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose3d`. + + For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv3d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose3d` for other attributes. + + Examples:: + + >>> # xdoctest: +SKIP + >>> # With cubic kernels and equal stride + >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2) + >>> # non-cubic kernels and unequal stride and with padding + >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2)) + >>> output = m(input) + >>> # exact output size can be also specified as an argument + >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1) + >>> upsample = nnq.ConvTranspose3d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(input) + >>> h.size() + torch.Size([1, 16, 6, 6, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12, 12, 12]) + """ + + _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose3d]] = nn.ConvTranspose3d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + ): + warnings.warn( + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, + ) + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + groups, + bias, + dilation, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "DynamicQuantizedConvTranspose3d" + + def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor: + # Temporarily using 
len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, T, H, W)`!") + return ops.quantized.conv_transpose3d_dynamic( + input, self._packed_params, reduce_range + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..523ff78c31cf141e680e0a3374bcb5f1252cf7d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py @@ -0,0 +1,168 @@ +# mypy: allow-untyped-defs +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.quantized as nnq +from torch.ao.nn.quantized.modules.utils import _quantize_weight + + +__all__ = [ + "Linear", +] + + +class Linear(nnq.Linear): + r""" + A dynamic quantized linear module with floating point tensor as inputs and outputs. + We adopt the same interface as `torch.nn.Linear`, please see + https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation. + + Similar to :class:`torch.nn.Linear`, attributes will be randomly + initialized at module creation time and will be overwritten later + + Attributes: + weight (Tensor): the non-learnable quantized weights of the module which are of + shape :math:`(\text{out\_features}, \text{in\_features})`. + bias (Tensor): the non-learnable floating point bias of the module of shape + :math:`(\text{out\_features})`. If :attr:`bias` is ``True``, + the values are initialized to zero. 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> m = nn.quantized.dynamic.Linear(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + # version used in this class is different from the parent class nnq.Linear + _version = 4 + + def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8): + super().__init__(in_features, out_features, bias_, dtype=dtype) + # We don't muck around with buffers or attributes or anything here + # to keep the module simple. *everything* is simply a Python attribute. + # Serialization logic is explicitly handled in the below serialization and + # deserialization modules + self.version = 4 + + def forward(self, x): + # Note that we can handle self.bias == None case. + if self._packed_params.dtype == torch.qint8: + if self.version is None or self.version < 4: + Y = torch.ops.quantized.linear_dynamic( + x, self._packed_params._packed_params + ) + else: + Y = torch.ops.quantized.linear_dynamic( + x, self._packed_params._packed_params, reduce_range=True + ) + elif self._packed_params.dtype == torch.float16: + Y = torch.ops.quantized.linear_dynamic_fp16( + x, self._packed_params._packed_params + ) + else: + raise RuntimeError("Unsupported dtype on dynamic quantized linear!") + return Y.to(x.dtype) + + def _get_name(self): + return "DynamicQuantizedLinear" + + def extra_repr(self): + extra_repr_str = f"in_features={self.in_features}, out_features={self.out_features}, dtype={self._packed_params.dtype}" + if self._packed_params.dtype == torch.qint8: + extra_repr_str += f", qscheme={self.weight().qscheme()}" + return extra_repr_str + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + self.version = version + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + 
unexpected_keys, + error_msgs, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a dynamic quantized module from a float module or qparams_dict + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + """ + float_modules = [ + torch.nn.Linear, + torch.nn.modules.linear.NonDynamicallyQuantizableLinear, + torch.ao.nn.intrinsic.modules.fused.LinearReLU, + torch.ao.nn.qat.dynamic.Linear, + ] + + assert type(mod) in float_modules, ( + "nn.quantized.dynamic.Linear.from_float only works for one of" + + str([float_mod.__name__ for float_mod in float_modules]) + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + if type(mod) is nni.LinearReLU: + mod = mod[0] + # pyrefly: ignore [missing-attribute] + if mod.qconfig is not None and mod.qconfig.weight is not None: + # pyrefly: ignore [not-callable] + weight_observer = mod.qconfig.weight() + else: + # We have the circular import issues if we import the qconfig in the beginning of this file: + # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the + # import until we need it. + from torch.ao.quantization.qconfig import default_dynamic_qconfig + + weight_observer = default_dynamic_qconfig.weight() + dtype = weight_observer.dtype + assert dtype in [torch.qint8, torch.float16], ( + "The only supported dtypes for " + f"dynamic quantized linear are qint8 and float16 got: {dtype}" + ) + weight_observer(mod.weight) + if dtype == torch.qint8: + qweight = _quantize_weight(mod.weight.float(), weight_observer) + elif dtype == torch.float16: + qweight = mod.weight.float() + else: + raise RuntimeError( + "Unsupported dtype specified for dynamic quantized Linear!" 
+ ) + qlinear = cls(mod.in_features, mod.out_features, dtype=dtype) + # pyrefly: ignore [bad-argument-type] + qlinear.set_weight_bias(qweight, mod.bias) + return qlinear + + @classmethod + def from_reference(cls, ref_qlinear): # type: ignore[override] + """Create a (fbgemm/qnnpack) dynamic quantized module from a reference quantized + module + Args: + ref_qlinear (Module): a reference quantized module, either produced by + torch.ao.quantization functions or provided by the user + """ + qlinear = cls( + ref_qlinear.in_features, + ref_qlinear.out_features, + dtype=ref_qlinear.weight_dtype, + ) + qweight = ref_qlinear.get_quantized_weight() + bias = ref_qlinear.bias + qlinear.set_weight_bias(qweight, bias) + return qlinear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..1ebe4b6a15af499f38a0d70ca93870cf1d6c224f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py @@ -0,0 +1,1366 @@ +# mypy: allow-untyped-defs +import numbers +import warnings +from typing_extensions import deprecated + +import torch +import torch.nn as nn +from torch import Tensor # noqa: F401 +from torch._jit_internal import Dict, List, Optional, Tuple, Union # noqa: F401 +from torch.ao.nn.quantized.modules.utils import _quantize_weight +from torch.nn.utils.rnn import PackedSequence + + +__all__ = [ + "pack_weight_bias", + "PackedParameter", + "RNNBase", + "LSTM", + "GRU", + "RNNCellBase", + "RNNCell", + "LSTMCell", + "GRUCell", + "apply_permutation", +] + + +def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: + return tensor.index_select(dim, permutation) + + +@deprecated( + "`apply_permutation` is deprecated, please use `tensor.index_select(dim, permutation)` 
instead", + category=FutureWarning, +) +def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: + return _apply_permutation(tensor, permutation, dim) + + +def pack_weight_bias(qweight, bias, dtype): + if dtype == torch.qint8: + # for each layer, for each direction we need to quantize and pack + # weights and pack parameters in this order: + # + # w_ih, w_hh + packed_weight = torch.ops.quantized.linear_prepack(qweight, bias) + + return packed_weight + else: + # for each layer, for each direction we need to quantize and pack + # weights and pack parameters in this order: + # + # packed_ih, packed_hh, b_ih, b_hh + packed_weight = torch.ops.quantized.linear_prepack_fp16(qweight, bias) + + return packed_weight + + +class PackedParameter(torch.nn.Module): + def __init__(self, param): + super().__init__() + self.param = param + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "param"] = self.param + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + self.param = state_dict[prefix + "param"] + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + +class RNNBase(torch.nn.Module): + _FLOAT_MODULE = nn.RNNBase + + _version = 2 + + def __init__( + self, + mode, + input_size, + hidden_size, + num_layers=1, + bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False, + dtype=torch.qint8, + ): + super().__init__() + + self.mode = mode + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.bidirectional = bidirectional + self.dtype = dtype + self.version = 2 + self.training = False + num_directions = 2 if bidirectional else 1 + + # "type: 
ignore" is required since ints and Numbers are not fully comparable + # https://github.com/python/mypy/issues/8566 + if ( + not isinstance(dropout, numbers.Number) + or not 0 <= dropout <= 1 # type: ignore[operator] + or isinstance(dropout, bool) + ): + raise ValueError( + "dropout should be a number in range [0, 1] " + "representing the probability of an element being " + "zeroed" + ) + if dropout > 0 and num_layers == 1: # type: ignore[operator] + warnings.warn( + "dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + f"num_layers greater than 1, but got dropout={dropout} and " + f"num_layers={num_layers}", + stacklevel=2, + ) + + if mode == "LSTM": + gate_size = 4 * hidden_size + elif mode == "GRU": + gate_size = 3 * hidden_size + else: + raise ValueError("Unrecognized RNN mode: " + mode) + + _all_weight_values = [] + for layer in range(num_layers): + for _ in range(num_directions): + layer_input_size = ( + input_size if layer == 0 else hidden_size * num_directions + ) + + w_ih = torch.randn(gate_size, layer_input_size).to(torch.float) + w_hh = torch.randn(gate_size, hidden_size).to(torch.float) + b_ih = torch.randn(gate_size).to(torch.float) + b_hh = torch.randn(gate_size).to(torch.float) + if dtype == torch.qint8: + w_ih = torch.quantize_per_tensor( + w_ih, scale=0.1, zero_point=0, dtype=torch.qint8 + ) + w_hh = torch.quantize_per_tensor( + w_hh, scale=0.1, zero_point=0, dtype=torch.qint8 + ) + packed_ih = torch.ops.quantized.linear_prepack(w_ih, b_ih) + packed_hh = torch.ops.quantized.linear_prepack(w_hh, b_hh) + if self.version is None or self.version < 2: + cell_params = ( + torch.ops.quantized.make_quantized_cell_params_dynamic( + packed_ih, packed_hh, b_ih, b_hh + ) + ) + else: + cell_params = ( + torch.ops.quantized.make_quantized_cell_params_dynamic( + packed_ih, packed_hh, b_ih, b_hh, True + ) + ) + else: + packed_ih = torch.ops.quantized.linear_prepack_fp16(w_ih, b_ih) + packed_hh = 
torch.ops.quantized.linear_prepack_fp16(w_hh, b_hh) + cell_params = torch.ops.quantized.make_quantized_cell_params_fp16( + packed_ih, packed_hh + ) + + _all_weight_values.append(PackedParameter(cell_params)) + self._all_weight_values = torch.nn.ModuleList(_all_weight_values) + + def _get_name(self): + return "DynamicQuantizedRNN" + + def extra_repr(self): + s = "{input_size}, {hidden_size}" + if self.num_layers != 1: + s += ", num_layers={num_layers}" + if self.bias is not True: + s += ", bias={bias}" + if self.batch_first is not False: + s += ", batch_first={batch_first}" + if self.dropout != 0: + s += ", dropout={dropout}" + if self.bidirectional is not False: + s += ", bidirectional={bidirectional}" + return s.format(**self.__dict__) + + def __repr__(self): + # We don't want to show `ModuleList` children, hence custom + # `__repr__`. This is the same as nn.Module.__repr__, except the check + # for the `PackedParameter` and `nn.ModuleList`. + # You should still override `extra_repr` to add more info. 
+ extra_lines = [] + extra_repr = self.extra_repr() + # empty string will be split into list [''] + if extra_repr: + extra_lines = extra_repr.split("\n") + child_lines = [] + for key, module in self._modules.items(): + if isinstance(module, (PackedParameter, nn.ModuleList)): + continue + mod_str = repr(module) + mod_str = nn.modules.module._addindent(mod_str, 2) + child_lines.append("(" + key + "): " + mod_str) + lines = extra_lines + child_lines + + main_str = self._get_name() + "(" + if lines: + # simple one-liner info, which most builtin Modules will use + if len(extra_lines) == 1 and not child_lines: + main_str += extra_lines[0] + else: + main_str += "\n " + "\n ".join(lines) + "\n" + + main_str += ")" + return main_str + + def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None: + expected_input_dim = 2 if batch_sizes is not None else 3 + if input.dim() != expected_input_dim: + raise RuntimeError( + f"input must have {expected_input_dim} dimensions, got {input.dim()}" + ) + if self.input_size != input.size(-1): + raise RuntimeError( + f"input.size(-1) must be equal to input_size. 
Expected {self.input_size}, got {input.size(-1)}" + ) + + def get_expected_hidden_size( + self, input: Tensor, batch_sizes: Optional[Tensor] + ) -> tuple[int, int, int]: + if batch_sizes is not None: + mini_batch = int(batch_sizes[0]) + else: + mini_batch = input.size(0) if self.batch_first else input.size(1) + num_directions = 2 if self.bidirectional else 1 + expected_hidden_size = ( + self.num_layers * num_directions, + mini_batch, + self.hidden_size, + ) + return expected_hidden_size + + def check_hidden_size( + self, + hx: Tensor, + expected_hidden_size: tuple[int, int, int], + msg: str = "Expected hidden size {}, got {}", + ) -> None: + if hx.size() != expected_hidden_size: + raise RuntimeError(msg.format(expected_hidden_size, list(hx.size()))) + + def check_forward_args( + self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor] + ) -> None: + self.check_input(input, batch_sizes) + expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) + self.check_hidden_size( + hidden, expected_hidden_size, msg="Expected hidden size {}, got {}" + ) + + def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor: + if permutation is None: + return hx + return _apply_permutation(hx, permutation) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + self.version = version + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def set_weight_bias(self, weight_bias_dict): + def weight_bias_name(ihhh, layer, suffix): + weight_name = f"weight_{ihhh}_l{layer}{suffix}" + bias_name = f"bias_{ihhh}_l{layer}{suffix}" + return weight_name, bias_name + + num_directions = 2 if self.bidirectional else 1 + # TODO: dedup with __init__ of RNNBase + _all_weight_values = [] + for layer in range(self.num_layers): + for 
direction in range(num_directions): + suffix = "_reverse" if direction == 1 else "" + w_ih_name, b_ih_name = weight_bias_name("ih", layer, suffix) + w_hh_name, b_hh_name = weight_bias_name("hh", layer, suffix) + w_ih = weight_bias_dict[w_ih_name] + b_ih = weight_bias_dict[b_ih_name] + w_hh = weight_bias_dict[w_hh_name] + b_hh = weight_bias_dict[b_hh_name] + if w_ih.dtype == torch.qint8: + packed_ih = torch.ops.quantized.linear_prepack(w_ih, b_ih) + packed_hh = torch.ops.quantized.linear_prepack(w_hh, b_hh) + if self.version is None or self.version < 2: + cell_params = ( + torch.ops.quantized.make_quantized_cell_params_dynamic( + packed_ih, packed_hh, b_ih, b_hh + ) + ) + else: + cell_params = ( + torch.ops.quantized.make_quantized_cell_params_dynamic( + packed_ih, packed_hh, b_ih, b_hh, True + ) + ) + else: + packed_ih = torch.ops.quantized.linear_prepack_fp16(w_ih, b_ih) + packed_hh = torch.ops.quantized.linear_prepack_fp16(w_hh, b_hh) + cell_params = torch.ops.quantized.make_quantized_cell_params_fp16( + packed_ih, packed_hh + ) + + _all_weight_values.append(PackedParameter(cell_params)) + self._all_weight_values = torch.nn.ModuleList(_all_weight_values) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + assert type(mod) in { + torch.nn.LSTM, + torch.nn.GRU, + }, "nn.quantized.dynamic.RNNBase.from_float only works for nn.LSTM and nn.GRU" + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + + if mod.qconfig is not None and mod.qconfig.weight is not None: + weight_observer_method = mod.qconfig.weight + else: + # We have the circular import issues if we import the qconfig in the beginning of this file: + # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the + # import until we need it. 
+ from torch.ao.quantization.qconfig import default_dynamic_qconfig + + weight_observer_method = default_dynamic_qconfig.weight + + dtype = weight_observer_method().dtype + supported_scalar_types = [torch.qint8, torch.float16] + if dtype not in supported_scalar_types: + raise RuntimeError( + f"Unsupported dtype for dynamic RNN quantization: {dtype}" + ) + # RNNBase can be either LSTM or GRU + qRNNBase: Union[LSTM, GRU] + if mod.mode == "LSTM": + qRNNBase = LSTM( + mod.input_size, + mod.hidden_size, + mod.num_layers, + mod.bias, + mod.batch_first, + mod.dropout, + mod.bidirectional, + dtype, + ) + elif mod.mode == "GRU": + qRNNBase = GRU( + mod.input_size, + mod.hidden_size, + mod.num_layers, + mod.bias, + mod.batch_first, + mod.dropout, + mod.bidirectional, + dtype, + ) + else: + raise NotImplementedError( + "Only LSTM/GRU is supported for QuantizedRNN for now" + ) + + num_directions = 2 if mod.bidirectional else 1 + + assert mod.bias + + _all_weight_values = [] + for layer in range(qRNNBase.num_layers): + for direction in range(num_directions): + suffix = "_reverse" if direction == 1 else "" + + def retrieve_weight_bias(ihhh): + weight_name = f"weight_{ihhh}_l{layer}{suffix}" + bias_name = f"bias_{ihhh}_l{layer}{suffix}" + weight = getattr(mod, weight_name) + bias = getattr(mod, bias_name) + return weight, bias + + weight_ih, bias_ih = retrieve_weight_bias("ih") + weight_hh, bias_hh = retrieve_weight_bias("hh") + + if dtype == torch.qint8: + + def quantize_and_pack(w, b): + weight_observer = weight_observer_method() + weight_observer(w) + qweight = _quantize_weight(w.float(), weight_observer) + packed_weight = torch.ops.quantized.linear_prepack(qweight, b) + return packed_weight + + packed_ih = quantize_and_pack(weight_ih, bias_ih) + packed_hh = quantize_and_pack(weight_hh, bias_hh) + if qRNNBase.version is None or qRNNBase.version < 2: + cell_params = ( + torch.ops.quantized.make_quantized_cell_params_dynamic( + packed_ih, packed_hh, bias_ih, bias_hh + ) + ) + 
else: + cell_params = ( + torch.ops.quantized.make_quantized_cell_params_dynamic( + packed_ih, packed_hh, bias_ih, bias_hh, True + ) + ) + + elif dtype == torch.float16: + packed_ih = torch.ops.quantized.linear_prepack_fp16( + weight_ih.float(), bias_ih + ) + packed_hh = torch.ops.quantized.linear_prepack_fp16( + weight_hh.float(), bias_hh + ) + + cell_params = torch.ops.quantized.make_quantized_cell_params_fp16( + packed_ih, packed_hh + ) + else: + raise RuntimeError( + "Unsupported dtype specified for dynamic quantized LSTM!" + ) + + _all_weight_values.append(PackedParameter(cell_params)) + qRNNBase._all_weight_values = torch.nn.ModuleList(_all_weight_values) + + return qRNNBase + + def _weight_bias(self): + # Returns a dict of weights and biases + weight_bias_dict: Dict[str, Dict] = {"weight": {}, "bias": {}} + count = 0 + num_directions = 2 if self.bidirectional else 1 + for layer in range(self.num_layers): + for direction in range(num_directions): + suffix = "_reverse" if direction == 1 else "" + key_name1 = f"weight_ih_l{layer}{suffix}" + key_name2 = f"weight_hh_l{layer}{suffix}" + # packed weights are part of torchbind class, CellParamsSerializationType + # Within the packed weight class, the weight and bias are accessible as Tensors + packed_weight_bias = self._all_weight_values[ # type: ignore[index] + count + ].param.__getstate__()[0][4] + weight_bias_dict["weight"][key_name1] = packed_weight_bias[ + 0 + ].__getstate__()[0][0] + weight_bias_dict["weight"][key_name2] = packed_weight_bias[ + 1 + ].__getstate__()[0][0] + key_name1 = f"bias_ih_l{layer}{suffix}" + key_name2 = f"bias_hh_l{layer}{suffix}" + weight_bias_dict["bias"][key_name1] = packed_weight_bias[ + 0 + ].__getstate__()[0][1] + weight_bias_dict["bias"][key_name2] = packed_weight_bias[ + 1 + ].__getstate__()[0][1] + count = count + 1 + return weight_bias_dict + + def get_weight(self): + return self._weight_bias()["weight"] + + def get_bias(self): + return self._weight_bias()["bias"] + + +class 
LSTM(RNNBase): + r""" + A dynamic quantized LSTM module with floating point tensor as inputs and outputs. + We adopt the same interface as `torch.nn.LSTM`, please see + https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM for documentation. + + Examples:: + + >>> # xdoctest: +SKIP + >>> rnn = nn.LSTM(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> c0 = torch.randn(2, 3, 20) + >>> output, (hn, cn) = rnn(input, (h0, c0)) + """ + + # pyrefly: ignore [bad-override] + _FLOAT_MODULE = nn.LSTM + + __overloads__ = {"forward": ["forward_packed", "forward_tensor"]} + + def __init__(self, *args, **kwargs): + super().__init__("LSTM", *args, **kwargs) + + def _get_name(self): + return "DynamicQuantizedLSTM" + + def forward_impl( + self, + input: Tensor, + hx: Optional[tuple[Tensor, Tensor]], + batch_sizes: Optional[Tensor], + max_batch_size: int, + sorted_indices: Optional[Tensor], + ) -> tuple[Tensor, tuple[Tensor, Tensor]]: + if hx is None: + num_directions = 2 if self.bidirectional else 1 + zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + hx = (zeros, zeros) + else: + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + + self.check_forward_args(input, hx, batch_sizes) + + _all_params = [m.param for m in self._all_weight_values] + if batch_sizes is None: + result = torch.quantized_lstm( + input, + hx, + _all_params, + self.bias, + self.num_layers, + float(self.dropout), + self.training, + self.bidirectional, + self.batch_first, + dtype=self.dtype, + use_dynamic=True, + ) + else: + result = torch.quantized_lstm( + input, + batch_sizes, + hx, + _all_params, + self.bias, + self.num_layers, + float(self.dropout), + self.training, + self.bidirectional, + dtype=self.dtype, + use_dynamic=True, + ) + output = result[0] + hidden = result[1:] + + return output, hidden + + @torch.jit.export + def forward_tensor( + self, input: Tensor, hx: Optional[tuple[Tensor, Tensor]] = None + ) -> tuple[Tensor, tuple[Tensor, Tensor]]: + batch_sizes = None + max_batch_size = input.size(0) if self.batch_first else input.size(1) + sorted_indices = None + unsorted_indices = None + + output, hidden = self.forward_impl( + input, hx, batch_sizes, max_batch_size, sorted_indices + ) + + return output, self.permute_hidden(hidden, unsorted_indices) + + @torch.jit.export + def forward_packed( + self, input: PackedSequence, hx: Optional[tuple[Tensor, Tensor]] = None + ) -> tuple[PackedSequence, tuple[Tensor, Tensor]]: + input_, batch_sizes, sorted_indices, unsorted_indices = input + max_batch_size = int(batch_sizes[0]) + + output_, hidden = self.forward_impl( + input_, hx, batch_sizes, max_batch_size, sorted_indices + ) + + output = PackedSequence(output_, batch_sizes, sorted_indices, unsorted_indices) + return output, self.permute_hidden(hidden, unsorted_indices) + + # "type: ignore" is required due to issue #43072 + def permute_hidden( # type: ignore[override] + self, + hx: tuple[Tensor, Tensor], + permutation: Optional[Tensor], + ) -> tuple[Tensor, Tensor]: + if permutation is None: + return hx + return _apply_permutation(hx[0], permutation), _apply_permutation( + 
hx[1], permutation + ) + + # "type: ignore" is required due to issue #43072 + def check_forward_args( # type: ignore[override] + self, + input: Tensor, + hidden: tuple[Tensor, Tensor], + batch_sizes: Optional[Tensor], + ) -> None: + self.check_input(input, batch_sizes) + expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) + + self.check_hidden_size( + hidden[0], expected_hidden_size, "Expected hidden[0] size {}, got {}" + ) + self.check_hidden_size( + hidden[1], expected_hidden_size, "Expected hidden[1] size {}, got {}" + ) + + @torch.jit.ignore + def forward(self, input, hx=None): + if isinstance(input, PackedSequence): + return self.forward_packed(input, hx) + else: + return self.forward_tensor(input, hx) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, ref_mod): + assert hasattr(ref_mod, "weight_ih_l0_dtype"), "We are assuming weight_ih_l0 " + "exists in LSTM, may need to relax the assumption to support the use case" + qmod = cls( + ref_mod.input_size, + ref_mod.hidden_size, + ref_mod.num_layers, + ref_mod.bias, + ref_mod.batch_first, + ref_mod.dropout, + ref_mod.bidirectional, + # assuming there is layer 0, which should be OK + ref_mod.weight_ih_l0_dtype, + ) + qmod.set_weight_bias(ref_mod.get_quantized_weight_bias_dict()) + return qmod + + +class GRU(RNNBase): + r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. + + + For each element in the input sequence, each layer computes the following + function: + + .. 
math:: + \begin{array}{ll} + r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ + z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ + n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\ + h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)} + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input + at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer + at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`, + :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively. + :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer + (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by + dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random + variable which is :math:`0` with probability :attr:`dropout`. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two GRUs together to form a `stacked GRU`, + with the second GRU taking in outputs of the first GRU and + computing the final results. Default: 1 + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + GRU layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional GRU. 
Default: ``False`` + + Inputs: input, h_0 + - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features + of the input sequence. The input can also be a packed variable length + sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` + for details. + - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor + containing the initial hidden state for each element in the batch. + Defaults to zero if not provided. If the RNN is bidirectional, + num_directions should be 2, else it should be 1. + + Outputs: output, h_n + - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor + containing the output features h_t from the last layer of the GRU, + for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been + given as the input, the output will also be a packed sequence. + For the unpacked case, the directions can be separated + using ``output.view(seq_len, batch, num_directions, hidden_size)``, + with forward and backward being direction `0` and `1` respectively. + + Similarly, the directions can be separated in the packed case. + - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor + containing the hidden state for `t = seq_len` + + Like *output*, the layers can be separated using + ``h_n.view(num_layers, num_directions, batch, hidden_size)``. + + Shape: + - Input1: :math:`(L, N, H_{in})` tensor containing input features where + :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length. + - Input2: :math:`(S, N, H_{out})` tensor + containing the initial hidden state for each element in the batch. + :math:`H_{out}=\text{hidden\_size}` + Defaults to zero if not provided. where :math:`S=\text{num\_layers} * \text{num\_directions}` + If the RNN is bidirectional, num_directions should be 2, else it should be 1. 
+ - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}` + - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state + for each element in the batch + + Attributes: + weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer + (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`. + Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)` + weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer + (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)` + bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer + (b_ir|b_iz|b_in), of shape `(3*hidden_size)` + bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer + (b_hr|b_hz|b_hn), of shape `(3*hidden_size)` + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + .. note:: + The calculation of new gate :math:`n_t` subtly differs from the original paper and other frameworks. + In the original implementation, the Hadamard product :math:`(\odot)` between :math:`r_t` and the + previous hidden state :math:`h_{(t-1)}` is done before the multiplication with the weight matrix + `W` and addition of bias: + + .. math:: + \begin{aligned} + n_t = \tanh(W_{in} x_t + b_{in} + W_{hn} ( r_t \odot h_{(t-1)} ) + b_{hn}) + \end{aligned} + + This is in contrast to PyTorch implementation, which is done after :math:`W_{hn} h_{(t-1)}` + + .. math:: + \begin{aligned} + n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) + \end{aligned} + + This implementation differs on purpose for efficiency. + + .. 
include:: ../cudnn_persistent_rnn.rst + + Examples:: + + >>> # xdoctest: +SKIP + >>> rnn = nn.GRU(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> output, hn = rnn(input, h0) + """ + + # pyrefly: ignore [bad-override] + _FLOAT_MODULE = nn.GRU + + __overloads__ = {"forward": ["forward_packed", "forward_tensor"]} + + def __init__(self, *args, **kwargs): + super().__init__("GRU", *args, **kwargs) + + def _get_name(self): + return "DynamicQuantizedGRU" + + def check_forward_args( + self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor] + ) -> None: + self.check_input(input, batch_sizes) + expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) + + self.check_hidden_size( + hidden, expected_hidden_size, "Expected hidden size {}, got {}" + ) + + def forward_impl( + self, + input: Tensor, + hx: Optional[Tensor], + batch_sizes: Optional[Tensor], + max_batch_size: int, + sorted_indices: Optional[Tensor], + ) -> tuple[Tensor, Tensor]: + if hx is None: + num_directions = 2 if self.bidirectional else 1 + zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + hx = zeros + else: + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + + self.check_forward_args(input, hx, batch_sizes) + + _all_params = [m.param for m in self._all_weight_values] + if batch_sizes is None: + result = torch.quantized_gru( + input, + hx, + _all_params, + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + self.batch_first, + ) + else: + result = torch.quantized_gru( + input, + batch_sizes, + hx, + _all_params, + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + ) + output = result[0] + hidden = result[1] + + return output, hidden + + @torch.jit.export + def forward_tensor( + self, input: Tensor, hx: Optional[Tensor] = None + ) -> tuple[Tensor, Tensor]: + batch_sizes = None + max_batch_size = input.size(0) if self.batch_first else input.size(1) + sorted_indices = None + unsorted_indices = None + + output, hidden = self.forward_impl( + input, hx, batch_sizes, max_batch_size, sorted_indices + ) + + return output, self.permute_hidden(hidden, unsorted_indices) + + @torch.jit.export + def forward_packed( + self, input: PackedSequence, hx: Optional[Tensor] = None + ) -> tuple[PackedSequence, Tensor]: + input_, batch_sizes, sorted_indices, unsorted_indices = input + max_batch_size = int(batch_sizes[0]) + output_, hidden = self.forward_impl( + input_, hx, batch_sizes, max_batch_size, sorted_indices + ) + + output = PackedSequence(output_, batch_sizes, sorted_indices, unsorted_indices) + return output, self.permute_hidden(hidden, unsorted_indices) + + def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor: + if permutation is None: + return hx + return _apply_permutation(hx, permutation) + + @torch.jit.ignore + def forward(self, input, hx=None): + if isinstance(input, PackedSequence): + return self.forward_packed(input, hx) + else: + return self.forward_tensor(input, hx) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return super().from_float( + mod, 
use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + @classmethod + def from_reference(cls, ref_mod): + assert hasattr(ref_mod, "weight_ih_l0_dtype"), "We are assuming weight_ih_l0 " + "exists in LSTM, may need to relax the assumption to support the use case" + qmod = cls( + ref_mod.input_size, + ref_mod.hidden_size, + ref_mod.num_layers, + ref_mod.bias, + ref_mod.batch_first, + ref_mod.dropout, + ref_mod.bidirectional, + # assuming there is layer 0, which should be OK + ref_mod.weight_ih_l0_dtype, + ) + qmod.set_weight_bias(ref_mod.get_quantized_weight_bias_dict()) + return qmod + + +class RNNCellBase(torch.nn.Module): + # _FLOAT_MODULE = nn.CellRNNBase + __constants__ = ["input_size", "hidden_size", "bias"] + + def __init__( + self, input_size, hidden_size, bias=True, num_chunks=4, dtype=torch.qint8 + ): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.weight_dtype = dtype + if bias: + self.bias_ih = torch.randn(num_chunks * hidden_size).to(dtype=torch.float) + self.bias_hh = torch.randn(num_chunks * hidden_size).to(dtype=torch.float) + else: + self.register_parameter("bias_ih", None) + self.register_parameter("bias_hh", None) + + weight_ih = torch.randn(num_chunks * hidden_size, input_size).to(torch.float) + weight_hh = torch.randn(num_chunks * hidden_size, hidden_size).to(torch.float) + if dtype == torch.qint8: + weight_ih = torch.quantize_per_tensor( + weight_ih, scale=1, zero_point=0, dtype=torch.qint8 + ) + weight_hh = torch.quantize_per_tensor( + weight_hh, scale=1, zero_point=0, dtype=torch.qint8 + ) + + if dtype == torch.qint8: + # for each layer, for each direction we need to quantize and pack + # weights and pack parameters in this order: + # + # w_ih, w_hh + packed_weight_ih = torch.ops.quantized.linear_prepack( + weight_ih, self.bias_ih + ) + packed_weight_hh = torch.ops.quantized.linear_prepack( + weight_hh, self.bias_hh + ) + else: + # for each layer, for each direction we need 
to quantize and pack + # weights and pack parameters in this order: + # + # packed_ih, packed_hh, b_ih, b_hh + packed_weight_ih = torch.ops.quantized.linear_prepack_fp16( + weight_ih, self.bias_ih + ) + packed_weight_hh = torch.ops.quantized.linear_prepack_fp16( + weight_hh, self.bias_hh + ) + + self._packed_weight_ih = packed_weight_ih + self._packed_weight_hh = packed_weight_hh + + def _get_name(self): + return "DynamicQuantizedRNNBase" + + def extra_repr(self): + s = "{input_size}, {hidden_size}" + if "bias" in self.__dict__ and self.bias is not True: + s += ", bias={bias}" + if "nonlinearity" in self.__dict__ and self.nonlinearity != "tanh": + s += ", nonlinearity={nonlinearity}" + return s.format(**self.__dict__) + + def check_forward_input(self, input): + if input.size(1) != self.input_size: + raise RuntimeError( + f"input has inconsistent input_size: got {input.size(1)}, expected {self.input_size}" + ) + + def check_forward_hidden( + self, input: Tensor, hx: Tensor, hidden_label: str = "" + ) -> None: + if input.size(0) != hx.size(0): + raise RuntimeError( + f"Input batch size {input.size(0)} doesn't match hidden{hidden_label} batch size {hx.size(0)}" + ) + + if hx.size(1) != self.hidden_size: + raise RuntimeError( + f"hidden{hidden_label} has inconsistent hidden_size: got {hx.size(1)}, expected {self.hidden_size}" + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + assert type(mod) in { + torch.nn.LSTMCell, + torch.nn.GRUCell, + torch.nn.RNNCell, + }, ( + "nn.quantized.dynamic.RNNCellBase.from_float \ + only works for nn.LSTMCell, nn.GRUCell and nn.RNNCell" + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + + if mod.qconfig is not None and mod.qconfig.weight is not None: + weight_observer_method = mod.qconfig.weight + else: + # We have the circular import issues if we import the qconfig in the beginning of this file: + # https://github.com/pytorch/pytorch/pull/24231. 
The current workaround is to postpone the + # import until we need it. + from torch.ao.quantization.qconfig import default_dynamic_qconfig + + weight_observer_method = default_dynamic_qconfig.weight + + dtype = weight_observer_method().dtype + supported_scalar_types = [torch.qint8, torch.float16] + if dtype not in supported_scalar_types: + raise RuntimeError( + f"Unsupported dtype for dynamic RNN quantization: {dtype}" + ) + + qRNNCellBase: Union[LSTMCell, GRUCell, RNNCell] + + if type(mod) is torch.nn.LSTMCell: + qRNNCellBase = LSTMCell( + mod.input_size, mod.hidden_size, bias=mod.bias, dtype=dtype + ) + elif type(mod) is torch.nn.GRUCell: + qRNNCellBase = GRUCell( + mod.input_size, mod.hidden_size, bias=mod.bias, dtype=dtype + ) + elif type(mod) is torch.nn.RNNCell: + qRNNCellBase = RNNCell( + mod.input_size, + mod.hidden_size, + bias=mod.bias, + nonlinearity=mod.nonlinearity, + dtype=dtype, + ) + else: + raise NotImplementedError( + "Only LSTMCell, GRUCell and RNNCell \ + are supported for QuantizedRNN for now" + ) + + assert mod.bias + + def _observe_and_quantize_weight(weight): + if dtype == torch.qint8: + weight_observer = weight_observer_method() + weight_observer(weight) + qweight = _quantize_weight(weight.float(), weight_observer) + return qweight + else: + return weight.float() + + qRNNCellBase._packed_weight_ih = pack_weight_bias( + _observe_and_quantize_weight(mod.weight_ih), mod.bias_ih, dtype + ) + qRNNCellBase._packed_weight_hh = pack_weight_bias( + _observe_and_quantize_weight(mod.weight_hh), mod.bias_hh, dtype + ) + return qRNNCellBase + + @classmethod + def from_reference(cls, ref_mod): + assert hasattr(ref_mod, "weight_ih_dtype"), "We are assuming weight_ih " + "exists in reference module, may need to relax the assumption to support the use case" + if hasattr(ref_mod, "nonlinearity"): + qmod = cls( + ref_mod.input_size, + ref_mod.hidden_size, + ref_mod.bias, + ref_mod.nonlinearity, + dtype=ref_mod.weight_ih_dtype, + ) + else: + qmod = cls( + 
ref_mod.input_size, + ref_mod.hidden_size, + ref_mod.bias, + dtype=ref_mod.weight_ih_dtype, + ) + weight_bias_dict = { + "weight": { + "weight_ih": ref_mod.get_quantized_weight_ih(), + "weight_hh": ref_mod.get_quantized_weight_hh(), + }, + "bias": { + "bias_ih": ref_mod.bias_ih, + "bias_hh": ref_mod.bias_hh, + }, + } + qmod.set_weight_bias(weight_bias_dict) + return qmod + + def _weight_bias(self): + # Returns a dict of weights and biases + weight_bias_dict: Dict[str, Dict] = {"weight": {}, "bias": {}} + w1, b1 = self._packed_weight_ih.__getstate__()[0] + w2, b2 = self._packed_weight_hh.__getstate__()[0] + # TODO: these can be simplified to one level? e.g. using weight_ih as key + # directly + weight_bias_dict["weight"]["weight_ih"] = w1 + weight_bias_dict["weight"]["weight_hh"] = w2 + weight_bias_dict["bias"]["bias_ih"] = b1 + weight_bias_dict["bias"]["bias_hh"] = b2 + return weight_bias_dict + + def get_weight(self): + return self._weight_bias()["weight"] + + def get_bias(self): + return self._weight_bias()["bias"] + + def set_weight_bias(self, weight_bias_dict): + # TODO: these can be simplified to one level? e.g. 
using weight_ih as key + # directly + self._packed_weight_ih = pack_weight_bias( + weight_bias_dict["weight"]["weight_ih"], + weight_bias_dict["bias"]["bias_ih"], + self.weight_dtype, + ) + self._packed_weight_hh = pack_weight_bias( + weight_bias_dict["weight"]["weight_hh"], + weight_bias_dict["bias"]["bias_hh"], + self.weight_dtype, + ) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "_packed_weight_ih"] = self._packed_weight_ih + destination[prefix + "_packed_weight_hh"] = self._packed_weight_hh + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + self._packed_weight_ih = state_dict.pop(prefix + "_packed_weight_ih") + self._packed_weight_hh = state_dict.pop(prefix + "_packed_weight_hh") + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + +class RNNCell(RNNCellBase): + r"""An Elman RNN cell with tanh or ReLU non-linearity. + A dynamic quantized RNNCell module with floating point tensor as inputs and outputs. + Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.RNNCell`, + please see https://pytorch.org/docs/stable/nn.html#torch.nn.RNNCell for documentation. + + Examples:: + + >>> # xdoctest: +SKIP + >>> rnn = nn.RNNCell(10, 20) + >>> input = torch.randn(6, 3, 10) + >>> hx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + ... hx = rnn(input[i], hx) + ... 
output.append(hx) + """ + + __constants__ = ["input_size", "hidden_size", "bias", "nonlinearity"] + + def __init__( + self, input_size, hidden_size, bias=True, nonlinearity="tanh", dtype=torch.qint8 + ): + super().__init__(input_size, hidden_size, bias, num_chunks=1, dtype=dtype) + self.nonlinearity = nonlinearity + + def _get_name(self): + return "DynamicQuantizedRNNCell" + + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: + self.check_forward_input(input) + if hx is None: + hx = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + self.check_forward_hidden(input, hx, "") + if self.nonlinearity == "tanh": + ret = torch.ops.quantized.quantized_rnn_tanh_cell_dynamic( + input, + hx, + self._packed_weight_ih, + self._packed_weight_hh, + self.bias_ih, + self.bias_hh, + ) + elif self.nonlinearity == "relu": + ret = torch.ops.quantized.quantized_rnn_relu_cell_dynamic( + input, + hx, + self._packed_weight_ih, + self._packed_weight_hh, + self.bias_ih, + self.bias_hh, + ) + else: + ret = input # TODO: remove when jit supports exception flow + raise RuntimeError(f"Unknown nonlinearity: {self.nonlinearity}") + return ret + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class LSTMCell(RNNCellBase): + r"""A long short-term memory (LSTM) cell. + + A dynamic quantized LSTMCell module with floating point tensor as inputs and outputs. + Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.LSTMCell`, + please see https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell for documentation. + + Examples:: + + >>> # xdoctest: +SKIP + >>> rnn = nn.LSTMCell(10, 20) + >>> input = torch.randn(6, 3, 10) + >>> hx = torch.randn(3, 20) + >>> cx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + ... hx, cx = rnn(input[i], (hx, cx)) + ... 
output.append(hx) + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, num_chunks=4, **kwargs) # type: ignore[misc] + + def _get_name(self): + return "DynamicQuantizedLSTMCell" + + def forward( + self, input: Tensor, hx: Optional[tuple[Tensor, Tensor]] = None + ) -> tuple[Tensor, Tensor]: + self.check_forward_input(input) + if hx is None: + zeros = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + hx = (zeros, zeros) + self.check_forward_hidden(input, hx[0], "[0]") + self.check_forward_hidden(input, hx[1], "[1]") + return torch.ops.quantized.quantized_lstm_cell_dynamic( + input, + hx, + self._packed_weight_ih, + self._packed_weight_hh, + self.bias_ih, + self.bias_hh, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class GRUCell(RNNCellBase): + r"""A gated recurrent unit (GRU) cell + + A dynamic quantized GRUCell module with floating point tensor as inputs and outputs. + Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.GRUCell`, + please see https://pytorch.org/docs/stable/nn.html#torch.nn.GRUCell for documentation. + + Examples:: + + >>> # xdoctest: +SKIP + >>> rnn = nn.GRUCell(10, 20) + >>> input = torch.randn(6, 3, 10) + >>> hx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + ... hx = rnn(input[i], hx) + ... 
output.append(hx) + """ + + def __init__(self, input_size, hidden_size, bias=True, dtype=torch.qint8): + super().__init__(input_size, hidden_size, bias, num_chunks=3, dtype=dtype) + + def _get_name(self): + return "DynamicQuantizedGRUCell" + + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: + self.check_forward_input(input) + if hx is None: + hx = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + self.check_forward_hidden(input, hx, "") + return torch.ops.quantized.quantized_gru_cell_dynamic( + input, + hx, + self._packed_weight_ih, + self._packed_weight_hh, + self.bias_ih, + self.bias_hh, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return super().from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/functional.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..f84d41b58503ad1d86244c7aa358f09ad16acad2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/functional.py @@ -0,0 +1,781 @@ +# mypy: allow-untyped-defs +r"""Functional interface (quantized).""" + +import warnings + +import torch +from torch import Tensor +from torch.jit.annotations import BroadcastingList2 +from torch.nn.modules.utils import _pair, _triple + +from .modules.utils import _pair_from_first + + +# Although some of the functions and docstrings are mirrored from the torch.nn, +# we want to have them here for future changes. 
+ +__all__ = [ + "avg_pool2d", + "avg_pool3d", + "adaptive_avg_pool2d", + "adaptive_avg_pool3d", + "conv1d", + "conv2d", + "conv3d", + "interpolate", + "linear", + "max_pool1d", + "max_pool2d", + "celu", + "leaky_relu", + "hardtanh", + "hardswish", + "threshold", + "elu", + "hardsigmoid", + "clamp", + "upsample", + "upsample_bilinear", + "upsample_nearest", +] + + +def avg_pool2d( + input, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + r""" + Applies 2D average-pooling operation in :math:`kH \times kW` regions by step size + :math:`sH \times sW` steps. The number of output features is equal to the number of + input planes. + + .. note:: The input quantization parameters propagate to the output. + + See :class:`~torch.ao.nn.quantized.AvgPool2d` for details and output shape. + + Args: + input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)` + kernel_size: size of the pooling region. Can be a single number or a + tuple `(kH, kW)` + stride: stride of the pooling operation. Can be a single number or a + tuple `(sH, sW)`. Default: :attr:`kernel_size` + padding: implicit zero paddings on both sides of the input. Can be a + single number or a tuple `(padH, padW)`. Default: 0 + ceil_mode: when True, will use `ceil` instead of `floor` in the formula + to compute the output shape. Default: ``False`` + count_include_pad: when True, will include the zero-padding in the + averaging calculation. Default: ``True`` + divisor_override: if specified, it will be used as divisor, otherwise + size of the pooling region will be used. 
Default: None + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.avg_pool2d' must be quantized!") + return torch.nn.functional.avg_pool2d( + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + + +def avg_pool3d( + input, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + r""" + Applies 3D average-pooling operation in :math:`kD \ times kH \times kW` regions by step size + :math:`sD \times sH \times sW` steps. The number of output features is equal to the number of + input planes. + + .. note:: The input quantization parameters propagate to the output. + + Args: + input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)` + kernel_size: size of the pooling region. Can be a single number or a + tuple `(kD, kH, kW)` + stride: stride of the pooling operation. Can be a single number or a + tuple `(sD, sH, sW)`. Default: :attr:`kernel_size` + padding: implicit zero paddings on both sides of the input. Can be a + single number or a tuple `(padD, padH, padW)`. Default: 0 + ceil_mode: when True, will use `ceil` instead of `floor` in the formula + to compute the output shape. Default: ``False`` + count_include_pad: when True, will include the zero-padding in the + averaging calculation. Default: ``True`` + divisor_override: if specified, it will be used as divisor, otherwise + size of the pooling region will be used. Default: None + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.avg_pool3d' must be quantized!") + return torch.nn.functional.avg_pool3d( + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + + +def adaptive_avg_pool2d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor: + r""" + Applies a 2D adaptive average pooling over a quantized input signal composed + of several quantized input planes. + + .. 
note:: The input quantization parameters propagate to the output. + + See :class:`~torch.ao.nn.quantized.AdaptiveAvgPool2d` for details and output shape. + + Args: + output_size: the target output size (single integer or + double-integer tuple) + """ + if not input.is_quantized: + raise ValueError( + "Input to 'quantized.functional.adaptive_avg_pool2d' must be quantized!" + ) + return torch.nn.functional.adaptive_avg_pool2d(input, output_size) + + +def adaptive_avg_pool3d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor: + r""" + Applies a 3D adaptive average pooling over a quantized input signal composed + of several quantized input planes. + + .. note:: The input quantization parameters propagate to the output. + + See :class:`~torch.ao.nn.quantized.AdaptiveAvgPool3d` for details and output shape. + + Args: + output_size: the target output size (single integer or + double-integer tuple) + """ + if not input.is_quantized: + raise ValueError( + "Input to 'quantized.functional.adaptive_avg_pool3d' must be quantized!" + ) + return torch.nn.functional.adaptive_avg_pool3d(input, output_size) + + +def conv1d( + input, + weight, + bias, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode="zeros", + scale=1.0, + zero_point=0, + dtype=torch.quint8, +): + r""" + Applies a 1D convolution over a quantized 1D input composed of several input + planes. + + See :class:`~torch.ao.nn.quantized.Conv1d` for details and output shape. + + Args: + input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iW)` + weight: quantized filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , iW)` + bias: **non-quantized** bias tensor of shape :math:`(\text{out\_channels})`. The tensor type must be `torch.float`. + stride: the stride of the convolving kernel. Can be a single number or a + tuple `(sW,)`. Default: 1 + padding: implicit paddings on both sides of the input. 
Can be a + single number or a tuple `(padW,)`. Default: 0 + dilation: the spacing between kernel elements. Can be a single number or + a tuple `(dW,)`. Default: 1 + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the + number of groups. Default: 1 + padding_mode: the padding mode to use. Only "zeros" is supported for quantized convolution at the moment. Default: "zeros" + scale: quantization scale for the output. Default: 1.0 + zero_point: quantization zero_point for the output. Default: 0 + dtype: quantization data type to use. Default: ``torch.quint8`` + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> from torch.ao.nn.quantized import functional as qF + >>> filters = torch.randn(33, 16, 3, dtype=torch.float) + >>> inputs = torch.randn(20, 16, 50, dtype=torch.float) + >>> bias = torch.randn(33, dtype=torch.float) + >>> + >>> scale, zero_point = 1.0, 0 + >>> dtype_inputs = torch.quint8 + >>> dtype_filters = torch.qint8 + >>> + >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters) + >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs) + >>> qF.conv1d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point) + """ # noqa: E501 + if padding_mode != "zeros": + raise NotImplementedError("Only zero-padding is supported!") + if input.dtype != torch.quint8: + raise NotImplementedError( + "Only torch.quint8 is supported for activation tensor!" 
+ ) + if weight.dtype != torch.qint8: + raise NotImplementedError("Only torch.qint8 is supported for weight tensor!") + if input.ndim != 3: + raise ValueError("Input shape must be `(N, C, L)`!") + stride = _pair_from_first(stride) + padding = _pair_from_first(padding) + dilation = _pair_from_first(dilation) + + packed_params = torch.ops.quantized.conv1d_prepack( + weight, bias, stride, padding, dilation, groups + ) + return torch.ops.quantized.conv1d(input, packed_params, scale, zero_point) + + +def conv2d( + input, + weight, + bias, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode="zeros", + scale=1.0, + zero_point=0, + dtype=torch.quint8, +): + r""" + Applies a 2D convolution over a quantized 2D input composed of several input + planes. + + See :class:`~torch.ao.nn.quantized.Conv2d` for details and output shape. + + Args: + input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)` + weight: quantized filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kH , kW)` + bias: **non-quantized** bias tensor of shape :math:`(\text{out\_channels})`. The tensor type must be `torch.float`. + stride: the stride of the convolving kernel. Can be a single number or a + tuple `(sH, sW)`. Default: 1 + padding: implicit paddings on both sides of the input. Can be a + single number or a tuple `(padH, padW)`. Default: 0 + dilation: the spacing between kernel elements. Can be a single number or + a tuple `(dH, dW)`. Default: 1 + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the + number of groups. Default: 1 + padding_mode: the padding mode to use. Only "zeros" is supported for quantized convolution at the moment. Default: "zeros" + scale: quantization scale for the output. Default: 1.0 + zero_point: quantization zero_point for the output. Default: 0 + dtype: quantization data type to use. 
Default: ``torch.quint8`` + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> from torch.ao.nn.quantized import functional as qF + >>> filters = torch.randn(8, 4, 3, 3, dtype=torch.float) + >>> inputs = torch.randn(1, 4, 5, 5, dtype=torch.float) + >>> bias = torch.randn(8, dtype=torch.float) + >>> + >>> scale, zero_point = 1.0, 0 + >>> dtype_inputs = torch.quint8 + >>> dtype_filters = torch.qint8 + >>> + >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters) + >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs) + >>> qF.conv2d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point) + """ # noqa: E501 + if padding_mode != "zeros": + raise NotImplementedError("Only zero-padding is supported!") + if input.dtype != torch.quint8: + raise NotImplementedError( + "Only torch.quint8 is supported for activation tensor!" + ) + if weight.dtype != torch.qint8: + raise NotImplementedError("Only torch.qint8 is supported for weight tensor!") + if input.ndim != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + + packed_params = torch.ops.quantized.conv2d_prepack( + weight, bias, stride, padding, dilation, groups + ) + return torch.ops.quantized.conv2d(input, packed_params, scale, zero_point) + + +def conv3d( + input, + weight, + bias, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode="zeros", + scale=1.0, + zero_point=0, + dtype=torch.quint8, +): + r""" + Applies a 3D convolution over a quantized 3D input composed of several input + planes. + + See :class:`~torch.ao.nn.quantized.Conv3d` for details and output shape. 
+ + Args: + input: quantized input tensor of shape + :math:`(\text{minibatch} , \text{in\_channels} , iD , iH , iW)` + weight: quantized filters of shape + :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kD , kH , kW)` + bias: **non-quantized** bias tensor of shape + :math:`(\text{out\_channels})`. The tensor type must be `torch.float`. + stride: the stride of the convolving kernel. Can be a single number or a + tuple `(sD, sH, sW)`. Default: 1 + padding: implicit paddings on both sides of the input. Can be a + single number or a tuple `(padD, padH, padW)`. Default: 0 + dilation: the spacing between kernel elements. Can be a single number or + a tuple `(dD, dH, dW)`. Default: 1 + groups: split input into groups, :math:`\text{in\_channels}` should be + divisible by the number of groups. Default: 1 + padding_mode: the padding mode to use. Only "zeros" is supported for + quantized convolution at the moment. Default: "zeros" + scale: quantization scale for the output. Default: 1.0 + zero_point: quantization zero_point for the output. Default: 0 + dtype: quantization data type to use. 
Default: ``torch.quint8`` + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> from torch.ao.nn.quantized import functional as qF + >>> filters = torch.randn(8, 4, 3, 3, 3, dtype=torch.float) + >>> inputs = torch.randn(1, 4, 5, 5, 5, dtype=torch.float) + >>> bias = torch.randn(8, dtype=torch.float) + >>> + >>> scale, zero_point = 1.0, 0 + >>> dtype_inputs = torch.quint8 + >>> dtype_filters = torch.qint8 + >>> + >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters) + >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs) + >>> qF.conv3d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point) + """ # noqa: E501 + if padding_mode != "zeros": + raise NotImplementedError("Only zero-padding is supported!") + if input.dtype != torch.quint8: + raise NotImplementedError( + "Only torch.quint8 is supported for activation tensor!" + ) + if weight.dtype != torch.qint8: + raise NotImplementedError("Only torch.qint8 is supported for weight tensor!") + if input.ndim != 5: + raise ValueError("Input shape must be `(N, C, D, H, W)`!") + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + + packed_params = torch.ops.quantized.conv3d_prepack( + weight, bias, stride, padding, dilation, groups + ) + return torch.ops.quantized.conv3d(input, packed_params, scale, zero_point) + + +def interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): + r"""Down/up samples the input to either the given :attr:`size` or the given + :attr:`scale_factor` + + See :func:`torch.nn.functional.interpolate` for implementation details. + + The input dimensions are interpreted in the form: + `mini-batch x channels x [optional depth] x [optional height] x width`. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D/3D input is supported for quantized inputs + + .. 
note:: Only the following modes are supported for the quantized inputs: + + - `bilinear` + - `nearest` + + Args: + input (Tensor): the input tensor + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]): + output spatial size. + scale_factor (float or Tuple[float]): multiplier for spatial size. Has to match input size if it is a tuple. + mode (str): algorithm used for upsampling: + ``'nearest'`` | ``'bilinear'`` + align_corners (bool, optional): Geometrically, we consider the pixels of the + input and output as squares rather than points. + If set to ``True``, the input and output tensors are aligned by the + center points of their corner pixels, preserving the values at the corner pixels. + If set to ``False``, the input and output tensors are aligned by the corner + points of their corner pixels, and the interpolation uses edge value padding + for out-of-boundary values, making this operation *independent* of input size + when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode` + is ``'bilinear'``. + Default: ``False`` + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.interpolate' must be quantized!") + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + +def linear( + input: Tensor, + weight: Tensor, + bias: Tensor | None = None, + scale: float | None = None, + zero_point: int | None = None, +) -> Tensor: + r""" + Applies a linear transformation to the incoming quantized data: + :math:`y = xA^T + b`. + See :class:`~torch.ao.nn.quantized.Linear` + + .. note:: + + Current implementation packs weights on every call, which has penalty on performance. + If you want to avoid the overhead, use :class:`~torch.ao.nn.quantized.Linear`. + + Args: + input (Tensor): Quantized input of type `torch.quint8` + weight (Tensor): Quantized weight of type `torch.qint8` + bias (Tensor): None or fp32 bias of type `torch.float` + scale (double): output scale. 
If None, derived from the input scale + zero_point (long): output zero point. If None, derived from the input zero_point + + Shape: + - Input: :math:`(N, *, in\_features)` where `*` means any number of + additional dimensions + - Weight: :math:`(out\_features, in\_features)` + - Bias: :math:`(out\_features)` + - Output: :math:`(N, *, out\_features)` + """ + if scale is None: + scale = input.q_scale() + if zero_point is None: + zero_point = input.q_zero_point() + _packed_params = torch.ops.quantized.linear_prepack(weight, bias) + return torch.ops.quantized.linear(input, _packed_params, scale, zero_point) + + +def max_pool1d( + input, + kernel_size, + stride=None, + padding=0, + dilation=1, + ceil_mode=False, + return_indices=False, +): + r"""Applies a 1D max pooling over a quantized input signal composed of + several quantized input planes. + + .. note:: The input quantization parameters are propagated to the output. + + See :class:`~torch.ao.nn.quantized.MaxPool1d` for details. + """ + if return_indices: + raise NotImplementedError("return_indices is not yet implemented!") + if stride is None: + stride = torch.jit.annotate(list[int], []) + return torch.nn.functional.max_pool1d( + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode=ceil_mode, + return_indices=return_indices, + ) + + +def max_pool2d( + input, + kernel_size, + stride=None, + padding=0, + dilation=1, + ceil_mode=False, + return_indices=False, +): + r"""Applies a 2D max pooling over a quantized input signal composed of + several quantized input planes. + + .. note:: The input quantization parameters are propagated to the output. + + See :class:`~torch.ao.nn.quantized.MaxPool2d` for details. 
+ """ + if return_indices: + raise NotImplementedError("return_indices is not yet implemented!") + if stride is None: + stride = torch.jit.annotate(list[int], []) + return torch.nn.functional.max_pool2d( + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode=ceil_mode, + return_indices=return_indices, + ) + + +def celu(input: Tensor, scale: float, zero_point: int, alpha: float = 1.0) -> Tensor: + r"""celu(input, scale, zero_point, alpha=1.) -> Tensor + + Applies the quantized CELU function element-wise. + + .. math:: + \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x / \alpha) - 1)) + + Args: + input: quantized input + alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.celu' must be quantized!") + return torch.ops.quantized.celu(input, scale, zero_point, alpha) + + +def leaky_relu( + input: Tensor, + negative_slope: float = 0.01, + inplace: bool = False, + scale: float | None = None, + zero_point: int | None = None, +): + r""" + Quantized version of the. + leaky_relu(input, negative_slope=0.01, inplace=False, scale, zero_point) -> Tensor + + Applies element-wise, + :math:`\text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)` + + Args: + input: Quantized input + negative_slope: The slope of the negative input + inplace: Inplace modification of the input tensor + scale, zero_point: Scale and zero point of the output tensor. + + See :class:`~torch.nn.LeakyReLU` for more details. 
+ """ + if scale is not None and zero_point is not None: + assert not inplace, "Cannot rescale with `inplace`" + output = torch._empty_affine_quantized( + input.shape, scale=scale, zero_point=int(zero_point), dtype=input.dtype + ) + torch._C._nn.leaky_relu(input, negative_slope, out=output) + return output + if inplace: + result = torch._C._nn.leaky_relu_(input, negative_slope) + else: + result = torch._C._nn.leaky_relu(input, negative_slope) + return result + + +def hardtanh( + input: Tensor, min_val: float = -1.0, max_val: float = 1.0, inplace: bool = False +) -> Tensor: + r"""This is the quantized version of :func:`~torch.nn.functional.hardtanh`.""" + if not input.is_quantized: + raise ValueError("Input to 'quantized.hardtanh' must be quantized!") + if inplace: + return torch._C._nn.hardtanh_(input, min_val, max_val) + return torch._C._nn.hardtanh(input, min_val, max_val) + + +def hardswish(input: Tensor, scale: float, zero_point: int) -> Tensor: + r"""This is the quantized version of :func:`~torch.nn.functional.hardswish`. + + Args: + input: quantized input + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.hardswish' must be quantized!") + return torch._ops.ops.quantized.hardswish(input, scale, zero_point) + + +def threshold(input: Tensor, threshold: float, value: float) -> Tensor: + r"""Applies the quantized version of the threshold function element-wise: + + .. math:: + x = \begin{cases} + x & \text{if~} x > \text{threshold} \\ + \text{value} & \text{otherwise} + \end{cases} + + See :class:`~torch.nn.Threshold` for more details. 
+ """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.threshold' must be quantized!") + if threshold is None: + raise ValueError("Input to 'threshold' must be specified!") + if value is None: + raise ValueError("Input to 'value' must be specified!") + return torch._ops.ops.quantized.threshold(input, threshold, value) + + +def elu(input: Tensor, scale: float, zero_point: int, alpha: float = 1.0) -> Tensor: + r"""This is the quantized version of :func:`~torch.nn.functional.elu`. + + Args: + input: quantized input + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + alpha: the alpha constant + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.elu' must be quantized!") + return torch.ops.quantized.elu(input, scale, zero_point, alpha) + + +def hardsigmoid(input: Tensor, inplace: bool = False) -> Tensor: + r"""This is the quantized version of :func:`~torch.nn.functional.hardsigmoid`.""" + if not input.is_quantized: + raise ValueError("Input to 'quantized.hardsigmoid' must be quantized!") + if inplace: + return torch._C._nn.hardsigmoid_(input) # type: ignore[attr-defined] + return torch._C._nn.hardsigmoid(input) + + +def clamp(input: Tensor, min_: float, max_: float) -> Tensor: + r"""float(input, min\_, max\_) -> Tensor + + Applies the clamp function element-wise. + See :class:`~torch.ao.nn.quantized.clamp` for more details. + + Args: + input: quantized input + min_: minimum value for clamping + max_: maximum value for clamping + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.clamp' must be quantized!") + return torch.clamp(input, min_, max_) + + +def upsample(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + r"""Upsamples the input to either the given :attr:`size` or the given + :attr:`scale_factor` + + .. warning:: + This function is deprecated in favor of + :func:`torch.ao.nn.quantized.functional.interpolate`. 
+ This is equivalent with ``nn.quantized.functional.interpolate(...)``. + + See :func:`torch.nn.functional.interpolate` for implementation details. + + The input dimensions are interpreted in the form: + `mini-batch x channels x [optional depth] x [optional height] x width`. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D input is supported for quantized inputs + + .. note:: Only the following modes are supported for the quantized inputs: + + - `bilinear` + - `nearest` + + Args: + input (Tensor): quantized input tensor + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]): + output spatial size. + scale_factor (float or Tuple[float]): multiplier for spatial size. Has to be an integer. + mode (str): algorithm used for upsampling: + ``'nearest'`` | ``'bilinear'`` + align_corners (bool, optional): Geometrically, we consider the pixels of the + input and output as squares rather than points. + If set to ``True``, the input and output tensors are aligned by the + center points of their corner pixels, preserving the values at the corner pixels. + If set to ``False``, the input and output tensors are aligned by the corner + points of their corner pixels, and the interpolation uses edge value padding + for out-of-boundary values, making this operation *independent* of input size + when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode` + is ``'bilinear'``. + Default: ``False`` + + .. warning:: + With ``align_corners = True``, the linearly interpolating modes + (`bilinear`) don't proportionally align the + output and input pixels, and thus the output values can depend on the + input size. This was the default behavior for these modes up to version + 0.3.1. Since then, the default behavior is ``align_corners = False``. + See :class:`~torch.nn.Upsample` for concrete examples on how this + affects the outputs. + """ + warnings.warn( + "nn.quantized.functional.upsample is deprecated. 
Use nn.quantized.functional.interpolate instead.", + stacklevel=2, + ) + return interpolate(input, size, scale_factor, mode, align_corners) + + +def upsample_bilinear(input, size=None, scale_factor=None): + r"""Upsamples the input, using bilinear upsampling. + + .. warning:: + This function is deprecated in favor of + :func:`torch.ao.nn.quantized.functional.interpolate`. + This is equivalent with + ``nn.quantized.functional.interpolate(..., mode='bilinear', align_corners=True)``. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D inputs are supported + + Args: + input (Tensor): quantized input + size (int or Tuple[int, int]): output spatial size. + scale_factor (int or Tuple[int, int]): multiplier for spatial size + """ + # DeprecationWarning is ignored by default + warnings.warn( + "nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.", + stacklevel=2, + ) + return interpolate(input, size, scale_factor, mode="bilinear", align_corners=True) + + +def upsample_nearest(input, size=None, scale_factor=None): + r"""Upsamples the input, using nearest neighbours' pixel values. + + .. warning:: + This function is deprecated in favor of + :func:`torch.ao.nn.quantized.functional.interpolate`. + This is equivalent with ``nn.quantized.functional.interpolate(..., mode='nearest')``. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D inputs are supported + + Args: + input (Tensor): quantized input + size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial + size. + scale_factor (int): multiplier for spatial size. Has to be an integer. + """ + # DeprecationWarning is ignored by default + warnings.warn( + "nn.quantized.functional.upsample_nearest is deprecated. 
Use nn.quantized.functional.interpolate instead.", + stacklevel=2, + ) + return interpolate(input, size, scale_factor, mode="nearest") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a3bad8c49350f56e5e58235570799a8d0968296d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__init__.py @@ -0,0 +1,162 @@ +# mypy: allow-untyped-defs +import torch + +# The quantized modules use `torch.nn` and `torch.ao.nn.quantizable` +# packages. However, the `quantizable` package uses "lazy imports" +# to avoid circular dependency. +# Hence we need to include it here to make sure it is resolved before +# they are used in the modules. +import torch.ao.nn.quantizable +from torch.nn.modules.pooling import MaxPool2d + +from .activation import ( + ELU, + Hardswish, + LeakyReLU, + MultiheadAttention, + PReLU, + ReLU6, + Sigmoid, + Softmax, +) +from .batchnorm import BatchNorm2d, BatchNorm3d +from .conv import ( + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, +) +from .dropout import Dropout +from .embedding_ops import Embedding, EmbeddingBag +from .functional_modules import FloatFunctional, FXFloatFunctional, QFunctional +from .linear import Linear +from .normalization import ( + GroupNorm, + InstanceNorm1d, + InstanceNorm2d, + InstanceNorm3d, + LayerNorm, +) +from .rnn import LSTM + + +__all__ = [ + "BatchNorm2d", + "BatchNorm3d", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "DeQuantize", + "ELU", + "Embedding", + "EmbeddingBag", + "GroupNorm", + "Hardswish", + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", + "LayerNorm", + "LeakyReLU", + "Linear", + "LSTM", + "MultiheadAttention", + 
"Quantize", + "ReLU6", + "Sigmoid", + "Softmax", + "Dropout", + "PReLU", + # Wrapper modules + "FloatFunctional", + "FXFloatFunctional", + "QFunctional", +] + + +class Quantize(torch.nn.Module): + r"""Quantizes an incoming tensor + + Args: + `scale`: scale of the output Quantized Tensor + `zero_point`: zero_point of output Quantized Tensor + `dtype`: data type of output Quantized Tensor + `factory_kwargs`: Dictionary of kwargs used for configuring initialization + of internal buffers. Currently, `device` and `dtype` are supported. + Example: `factory_kwargs={'device': 'cuda', 'dtype': torch.float64}` + will initialize internal buffers as type `torch.float64` on the current CUDA device. + Note that `dtype` only applies to floating-point buffers. + + Examples:: + >>> t = torch.tensor([[1., -1.], [1., -1.]]) + >>> scale, zero_point, dtype = 1.0, 2, torch.qint8 + >>> qm = Quantize(scale, zero_point, dtype) + >>> # xdoctest: +SKIP + >>> qt = qm(t) + >>> print(qt) + tensor([[ 1., -1.], + [ 1., -1.]], size=(2, 2), dtype=torch.qint8, scale=1.0, zero_point=2) + """ + + scale: torch.Tensor + zero_point: torch.Tensor + + def __init__(self, scale, zero_point, dtype, factory_kwargs=None): + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + super().__init__() + self.register_buffer("scale", torch.tensor([scale], **factory_kwargs)) + self.register_buffer( + "zero_point", + torch.tensor( + [zero_point], + dtype=torch.long, + **{k: v for k, v in factory_kwargs.items() if k != "dtype"}, + ), + ) + self.dtype = dtype + + def forward(self, X): + return torch.quantize_per_tensor( + X, float(self.scale), int(self.zero_point), self.dtype + ) + + @staticmethod + def from_float(mod, use_precomputed_fake_quant=False): + assert hasattr(mod, "activation_post_process") + scale, zero_point = mod.activation_post_process.calculate_qparams() + return Quantize( + scale.float().item(), + zero_point.long().item(), + mod.activation_post_process.dtype, + ) + + def extra_repr(self): + return 
f"scale={self.scale}, zero_point={self.zero_point}, dtype={self.dtype}" + + +class DeQuantize(torch.nn.Module): + r"""Dequantizes an incoming tensor + + Examples:: + >>> input = torch.tensor([[1., -1.], [1., -1.]]) + >>> scale, zero_point, dtype = 1.0, 2, torch.qint8 + >>> qm = Quantize(scale, zero_point, dtype) + >>> # xdoctest: +SKIP + >>> quantized_input = qm(input) + >>> dqm = DeQuantize() + >>> dequantized = dqm(quantized_input) + >>> print(dequantized) + tensor([[ 1., -1.], + [ 1., -1.]], dtype=torch.float32) + """ + + def forward(self, Xq): + return Xq.dequantize() + + @staticmethod + def from_float(mod, use_precomputed_fake_quant=False): + return DeQuantize() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6267c296257ac64d0540d25117ef3dc11b78c39d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb774117527e192d03049ea2a01ce0d044d64f9f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21f24a6b22fa14f5d48eaae306d23ce30de56010 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fe8d3d70361bced0c2150fdb2145f2741941bbc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f1a4ccc6b5e2c83389f25e439d721e66ae46180 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cb1a4ea9e0dd1d940740d754c1ac4421bc0bfa9 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2b47e95416e269cbdfab48594d4beb5a1a3c72d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f835eaaeca3a8fc7fcd688e1366b41440e568f8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..376c4b0891ef56e2fb28d5d3c6e51c7683e826e0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52b634c99c8e2a42ac4152dd199ddefdf4febeb2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6281545f7d827e3ae4ab96af82b19f8d658dbd8d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/activation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..3ecf1d5c9a1e2c198d89f284e109dd9410994b60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/activation.py @@ -0,0 +1,351 @@ +# mypy: allow-untyped-defs +from warnings import warn + +import torch + + +__all__ = [ + "ReLU6", + "Hardswish", + "ELU", + "LeakyReLU", + "Sigmoid", + "Softmax", + "MultiheadAttention", + "PReLU", +] + + +class ReLU6(torch.nn.ReLU): + r"""Applies the element-wise function: + + :math:`\text{ReLU6}(x) = \min(\max(x_0, x), q(6))`, where :math:`x_0` is the + zero_point, and :math:`q(6)` is the quantized representation of number 6. + + Args: + inplace: can optionally do the operation in-place. 
Default: ``False`` + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: ../scripts/activation_images/ReLU6.png + + Examples:: + + >>> m = nn.quantized.ReLU6() + >>> input = torch.randn(2) + >>> # xdoctest: +SKIP + >>> input = torch.quantize_per_tensor(input, 1.0, 0, dtype=torch.qint32) + >>> output = m(input) + """ + + def __init__(self, inplace=False): + super().__init__(inplace) + self.inplace = inplace + + def forward(self, input): + return torch.ops.quantized.relu6(input, self.inplace) + + def _get_name(self): + return "QuantizedReLU6" + + @staticmethod + def from_float(mod, use_precomputed_fake_quant=False): + return ReLU6(mod.inplace) + + +class Hardswish(torch.nn.Hardswish): + r"""This is the quantized version of :class:`~torch.nn.Hardswish`. + + Args: + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + """ + + def __init__(self, scale, zero_point, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.hardswish(input, self.scale, self.zero_point) + + def _get_name(self): + return "QuantizedHardswish" + + @staticmethod + def from_float(mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + return Hardswish(float(scale), int(zero_point)) + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls(float(scale), int(zero_point)) + + +class ELU(torch.nn.ELU): + r"""This is the quantized equivalent of :class:`~torch.nn.ELU`. 
+ + Args: + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + alpha: the alpha constant + """ + + def __init__(self, scale, zero_point, alpha=1.0): + super().__init__(alpha) + self.scale = scale + self.zero_point = zero_point + + def forward(self, input): + return torch.ao.nn.quantized.functional.elu( + input, self.scale, self.zero_point, self.alpha + ) + + def _get_name(self): + return "QuantizedELU" + + @staticmethod + def from_float(mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + return ELU(float(scale), int(zero_point), mod.alpha) + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls(float(scale), int(zero_point), mod.alpha) + + +class LeakyReLU(torch.nn.LeakyReLU): + r"""This is the quantized equivalent of :class:`~torch.nn.LeakyReLU`. + + Args: + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + negative_slope: Controls the angle of the negative slope. 
Default: 1e-2 + """ + + def __init__( + self, + scale: float, + zero_point: int, + negative_slope: float = 1e-2, + inplace: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(negative_slope, inplace) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.leaky_relu( + input, self.negative_slope, self.inplace, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedLeakyReLU" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + return cls(float(scale), int(zero_point), mod.negative_slope, mod.inplace) + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls(float(scale), int(zero_point), mod.negative_slope, mod.inplace) + + +class Sigmoid(torch.nn.Sigmoid): + r"""This is the quantized equivalent of :class:`~torch.nn.Sigmoid`. + + Args: + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + """ + + def __init__(self, output_scale: float, output_zero_point: int): + super().__init__() + self.output_scale = output_scale + self.output_zero_point = output_zero_point + + def forward(self, input): + return torch.ops.quantized.sigmoid( + input, self.output_scale, self.output_zero_point + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + ( + output_scale, + output_zero_point, + ) = mod.activation_post_process.calculate_qparams() + return cls(float(output_scale), int(output_zero_point)) + + +class Softmax(torch.nn.Softmax): + r"""This is the quantized version of :class:`~torch.nn.Softmax`. 
+ + Args: + dim: A dimension along which Softmax will be computed (so every slice along dim will sum to 1). + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + """ + + def __init__(self, dim=None, scale=1.0, zero_point=0): + super().__init__() + self.dim = dim + self.scale = scale + self.zero_point = zero_point + + def forward(self, input): + dim = self.dim + if dim is None: + stacklevel = 3 + # Note: adding the mypy ignore on _get_softmax_dim seems less bad + # than making `_get_softmax_dim` an official API. + dim = torch.nn.functional._get_softmax_dim( # type: ignore[attr-defined] + "softmax", input.dim(), stacklevel + ) + return torch.ops.quantized.softmax(input, dim, self.scale, self.zero_point) + + def _get_name(self): + return "QuantizedSoftmax" + + @staticmethod + def from_float(mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + return Softmax(mod.dim, float(scale), int(zero_point)) + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls(mod.dim, float(scale), int(zero_point)) + + +class MultiheadAttention(torch.ao.nn.quantizable.MultiheadAttention): + # pyrefly: ignore [bad-override] + _FLOAT_MODULE = torch.ao.nn.quantizable.MultiheadAttention + + def _get_name(self): + return "QuantizedMultiheadAttention" + + @classmethod + def from_float(cls, other): + # The whole flow is float -> observed -> quantized + # This class does observed -> quantized only + raise NotImplementedError( + "It looks like you are trying to convert a " + "non-observed MHA module. Please, see " + "the examples on quantizable MHAs." 
+ ) + + @classmethod + def from_observed(cls, other): + converted = torch.ao.quantization.convert( + other, + mapping=None, + inplace=False, + remove_qconfig=True, + convert_custom_config_dict=None, + ) + converted.__class__ = cls + # Remove the parameters for the bias_k and bias_v to quantize them + # TODO: This is a potential source of accuracy drop. + # quantized cat takes the scale and zp of the first + # element, which might lose the precision in the bias_k + # and the bias_v (which are cat'ed with k/v being first). + if converted.bias_k is not None: + bias_k = converted._parameters.pop("bias_k") + sc, zp = torch._choose_qparams_per_tensor(bias_k, reduce_range=False) + bias_k = torch.quantize_per_tensor(bias_k, sc, zp, torch.quint8) + setattr(converted, "bias_k", bias_k) # noqa: B010 + + if converted.bias_v is not None: + bias_v = converted._parameters.pop("bias_v") + sc, zp = torch._choose_qparams_per_tensor( + bias_k, # type: ignore[possibly-undefined] + reduce_range=False, + ) + bias_v = torch.quantize_per_tensor(bias_v, sc, zp, torch.quint8) + setattr(converted, "bias_v", bias_v) # noqa: B010 + + del converted.in_proj_weight + del converted.in_proj_bias + + return converted + + +class PReLU(torch.nn.Module): + r"""This is the quantized equivalent of :class:`~torch.nn.PReLU`. + + Args: + scale: quantization scale of the output tensor + zero_point: quantization zero point of the output tensor + num_parameters: number of parameters: 1, or the number of channels at input. 
Default: 1 + """ + + def __init__( + self, output_scale: float, output_zero_point: int, num_parameters: int = 1 + ) -> None: + super().__init__() + self.num_parameters = num_parameters + self.scale = output_scale + self.zero_point = output_zero_point + w = torch.randn(num_parameters, dtype=torch.float) + qw = torch.quantize_per_tensor(w, scale=1.0, zero_point=0, dtype=torch.quint8) + self.set_weight(qw) + + def set_weight(self, w: torch.Tensor) -> None: + self.weight = w + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.ops.quantized.prelu( + input, self.weight, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedPReLU" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + qprelu = cls(float(scale), int(zero_point), mod.num_parameters) + float_wt = mod.weight.float() + observer = mod.qconfig.weight() + observer(float_wt) + if observer.dtype != torch.quint8: + warn( + f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}", + stacklevel=2, + ) + wt_scale, wt_zp = observer.calculate_qparams() + qweight = torch.quantize_per_tensor( + float_wt, float(wt_scale), int(wt_zp), torch.quint8 + ) + qprelu.set_weight(qweight) + return qprelu + + @classmethod + def from_reference(cls, mod, scale, zero_point): + qprelu = cls(float(scale), int(zero_point), mod.num_parameters) + float_wt = mod.weight.float() + observer = mod.qconfig.weight() + observer(float_wt) + if observer.dtype != torch.quint8: + warn( + f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}", + stacklevel=2, + ) + wt_scale, wt_zp = observer.calculate_qparams() + qweight = torch.quantize_per_tensor( + float_wt, float(wt_scale), int(wt_zp), torch.quint8 + ) + qprelu.set_weight(qweight) + return qprelu diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/batchnorm.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/batchnorm.py new file mode 100644 index 0000000000000000000000000000000000000000..f1e6779c08b1f6af61c2377335b984c7f75a29a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/batchnorm.py @@ -0,0 +1,130 @@ +# mypy: allow-untyped-defs +import torch +import torch.ao.nn.intrinsic as nni + + +__all__ = ["BatchNorm2d", "BatchNorm3d"] + + +class _BatchNorm(torch.nn.modules.batchnorm._BatchNorm): + def __init__( + self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(num_features, eps, momentum, True, True, **factory_kwargs) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(1.0, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(0, **factory_kwargs)) + + @staticmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + activation_post_process = mod.activation_post_process + if type(mod) is cls._NNI_BN_RELU_MODULE: + mod = mod[0] + scale, zero_point = activation_post_process.calculate_qparams() + new_mod = cls(mod.num_features, mod.eps) + new_mod.weight = mod.weight + new_mod.bias = mod.bias + new_mod.running_mean = mod.running_mean + new_mod.running_var = mod.running_var + new_mod.scale = scale + new_mod.zero_point = zero_point + return new_mod + + @classmethod + def from_reference(cls, bn, output_scale, output_zero_point): + qbn = cls( + bn.num_features, + bn.eps, + bn.momentum, + device=bn.weight.device, + dtype=bn.weight.dtype, + ) + qbn.weight = bn.weight + qbn.bias = bn.bias + qbn.running_mean = bn.running_mean + qbn.running_var = bn.running_var + qbn.scale = output_scale + qbn.zero_point = output_zero_point + return qbn + + +class BatchNorm2d(_BatchNorm): + r"""This is the quantized version of 
:class:`~torch.nn.BatchNorm2d`.""" + + _NNI_BN_RELU_MODULE = nni.BNReLU2d + + def __init__( + self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(num_features, eps, momentum, **factory_kwargs) + + def _get_name(self): + return "QuantizedBatchNorm2d" + + def _check_input_dim(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # disabling this since this is not symbolically traceable + # self._check_input_dim(input) + return torch.ops.quantized.batch_norm2d( + input, + self.weight, + self.bias, + self.running_mean, + self.running_var, + self.eps, + self.scale, + self.zero_point, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + return _BatchNorm.from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class BatchNorm3d(_BatchNorm): + r"""This is the quantized version of :class:`~torch.nn.BatchNorm3d`.""" + + _NNI_BN_RELU_MODULE = nni.BNReLU3d + + def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(num_features, eps, momentum, **factory_kwargs) + + def _get_name(self): + return "QuantizedBatchNorm3d" + + def _check_input_dim(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, H, W)`!") + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # disabling this since this is not symbolically traceable + # self._check_input_dim(input) + return torch.ops.quantized.batch_norm3d( + input, + self.weight, 
+ self.bias, + self.running_mean, + self.running_var, + self.eps, + self.scale, + self.zero_point, + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + return _BatchNorm.from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..a292d616a86c31d22550faa7d38d256350e4e91a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/conv.py @@ -0,0 +1,1244 @@ +# mypy: allow-untyped-defs +r"""Quantized convolution modules.""" + +from typing import ClassVar, Literal, Optional + +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.intrinsic.qat as nniqat +import torch.nn as nn +import torch.nn.functional as F +from torch._ops import ops +from torch.nn.common_types import _size_1_t +from torch.nn.modules.utils import _pair, _single, _triple +from torch.nn.utils import fuse_conv_bn_weights + +from .utils import _quantize_weight, WeightedQuantizedModule + + +__all__ = [ + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", +] + +_SUPPORTED_PADDING = {"zeros", "reflect"} + + +def _reverse_repeat_padding(padding: list[int]) -> list[int]: + _reversed_padding_repeated_twice: list[int] = [] + N = len(padding) + for idx in range(N): + _reversed_padding_repeated_twice.extend(padding[N - idx - 1] for _ in range(2)) + return _reversed_padding_repeated_twice + + +class _ConvNd(WeightedQuantizedModule): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + # All subclasses have this 
signature - See PR #49702s + raise NotImplementedError + + def _init( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode="zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + if in_channels % groups != 0: + raise ValueError("in_channels must be divisible by groups") + if out_channels % groups != 0: + raise ValueError("out_channels must be divisible by groups") + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.transposed = transposed + self.output_padding = output_padding + self.groups = groups + if padding_mode not in _SUPPORTED_PADDING: + raise ValueError( + f"'padding_mode' {padding_mode} is not supported by quantized convolution" + ) + self.padding_mode = padding_mode + # Initialize as NCHW. set_weight will internally transpose to NHWC. 
+ if self.transposed: + weight_shape = [in_channels, out_channels // self.groups] + else: + weight_shape = [out_channels, in_channels // self.groups] + qweight = torch._empty_affine_quantized( + weight_shape + list(kernel_size), + scale=1, + zero_point=0, + dtype=torch.qint8, + **{k: v for k, v in factory_kwargs.items() if k != "dtype"}, + ) + bias_float = ( + torch.zeros( + out_channels, + dtype=torch.float, + **{k: v for k, v in factory_kwargs.items() if k != "dtype"}, + ) + if bias + else None + ) + + self.set_weight_bias(qweight, bias_float) + self.scale = 1.0 + self.zero_point = 0 + + def set_weight_bias(self, qweight, bias_float): + raise NotImplementedError + + def bias(self): + raise NotImplementedError + + def _weight_bias(self): + raise NotImplementedError + + def extra_repr(self): + s = ( + "{in_channels}, {out_channels}, kernel_size={kernel_size}" + ", stride={stride}, scale={scale}, zero_point={zero_point}" + ) + if self.padding != (0,) * len(self.padding): + s += ", padding={padding}" + if self.dilation != (1,) * len(self.dilation): + s += ", dilation={dilation}" + if self.output_padding != (0,) * len(self.output_padding): + s += ", output_padding={output_padding}" + if self.groups != 1: + s += ", groups={groups}" + if self.bias() is None: + s += ", bias=False" + return s.format(**self.__dict__) + + # ===== Serialization methods ===== + # The special consideration here is that we have to unpack the weights into + # their regular QTensor form for serialization. Packed weights should not + # live outside the process in which they were created, rather they should be + # derived from the QTensor weight. 
+ # self + # |--- weight : Tensor + # |--- bias : Tensor + # + # TODO: maybe change to this when https://github.com/pytorch/pytorch/pull/32958 is landed + # self + # |--- _packed_params : Conv2dPackedParamsBase or Conv3dPackedParamsBase + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + (w, b) = self._weight_bias() + destination[prefix + "weight"] = w + destination[prefix + "bias"] = b + destination[prefix + "scale"] = torch.tensor(self.scale) + destination[prefix + "zero_point"] = torch.tensor(self.zero_point) + + @torch.jit.export + def __getstate__(self): + (w, b) = self._weight_bias() + return ( + self.in_channels, + self.out_channels, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.transposed, + self.output_padding, + self.groups, + self.padding_mode, + w, + b, + self.scale, + self.zero_point, + self.training, + ) + + # ===== Deserialization methods ===== + # Counterpart to the serialization methods, we must pack the serialized + # QTensor weight into its packed format for use by the FBGEMM ops. 
+ def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + self.set_weight_bias(state_dict[prefix + "weight"], state_dict[prefix + "bias"]) + state_dict.pop(prefix + "weight") + state_dict.pop(prefix + "bias") + self.scale = float(state_dict[prefix + "scale"]) + state_dict.pop(prefix + "scale") + self.zero_point = int(state_dict[prefix + "zero_point"]) + state_dict.pop(prefix + "zero_point") + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + @torch.jit.export + def __setstate__(self, state): + self.in_channels = state[0] + self.out_channels = state[1] + self.kernel_size = state[2] + self.stride = state[3] + self.padding = state[4] + self.dilation = state[5] + self.transposed = state[6] + self.output_padding = state[7] + self.groups = state[8] + self.padding_mode = state[9] + self.set_weight_bias(state[10], state[11]) + self.scale = state[12] + self.zero_point = state[13] + self.training = state[14] + + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + torch.nn.Module.__init__(new_instance) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def __copy__(self): + return self.__deepcopy__({}) + + @classmethod + def get_qconv(cls, mod, activation_post_process, weight_post_process=None): + r"""Creates a qconv object and returns it.""" + if weight_post_process is None: + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + assert weight_post_process.dtype == torch.qint8, ( + "Weight observer must have a dtype of qint8" + ) + qweight = _quantize_weight(mod.weight.float(), weight_post_process) + # the __init__ call used is the one from derived classes and not the one from _ConvNd + qconv = cls( + mod.in_channels, + mod.out_channels, + mod.kernel_size, + mod.stride, + mod.padding, + mod.dilation, + 
mod.groups, + mod.bias is not None, + mod.padding_mode, + ) + qconv.set_weight_bias(qweight, mod.bias) + if ( + activation_post_process is None + or activation_post_process.dtype == torch.float + ): + return qconv # dynamic quantization doesn't need scale/zero_point + else: + act_scale, act_zp = activation_post_process.calculate_qparams() + qconv.scale = float(act_scale) + qconv.zero_point = int(act_zp) + return qconv + + @staticmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + if hasattr(mod, "weight_fake_quant"): + # assert type(mod) is cls.__QAT_MODULE, " nnq." + cls.__name__ + \ + # ".from_float only works for " + cls.__QAT_MODULE.__name__ + if type(mod) is cls._NNIQAT_CONV_BN_MODULE: + mod.weight, mod.bias = fuse_conv_bn_weights( + mod.weight, + mod.bias, + mod.bn.running_mean, + mod.bn.running_var, + mod.bn.eps, + mod.bn.weight, + mod.bn.bias, + ) + assert hasattr(mod, "activation_post_process"), ( + "Input QAT module must have observer attached" + ) + weight_post_process = mod.weight_fake_quant + activation_post_process = mod.activation_post_process + else: + assert type(mod) is cls._FLOAT_MODULE, ( + " nnq." + + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + + " but got:" + + str(type(mod)) + ) + assert hasattr(mod, "qconfig"), ( + "Input float module must have qconfig defined." 
+ ) + activation_post_process = ( + None + if not hasattr(mod, "activation_post_process") + else mod.activation_post_process + ) + if type(mod) in [ + cls._NNI_CONV_RELU_MODULE, + cls._NNI_CONV_ADD_MODULE, + cls._NNI_CONV_ADD_RELU_MODULE, + ]: + mod = mod[0] + weight_post_process = mod.qconfig.weight() + return cls.get_qconv(mod, activation_post_process, weight_post_process) + + @classmethod + def from_reference(cls, ref_qconv, output_scale, output_zero_point): + r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module + Args: + ref_qconv (Module): a reference quantized module, either produced by torch.ao.quantization + utilities or provided by the user + output_scale (float): scale for output Tensor + output_zero_point (int): zero point for output Tensor + """ + qconv = cls( + ref_qconv.in_channels, + ref_qconv.out_channels, + ref_qconv.kernel_size, # type: ignore[arg-type] + ref_qconv.stride, # type: ignore[arg-type] + ref_qconv.padding, # type: ignore[arg-type] + ref_qconv.dilation, # type: ignore[arg-type] + ref_qconv.groups, + ref_qconv.bias is not None, # type: ignore[arg-type] + ref_qconv.padding_mode, + device=ref_qconv.weight.device, + dtype=ref_qconv.weight.dtype, + ) + qweight = ref_qconv.get_quantized_weight() + qconv.set_weight_bias(qweight, ref_qconv.bias) + qconv.scale = float(output_scale) + qconv.zero_point = int(output_zero_point) + return qconv + + +class Conv1d(_ConvNd): + r"""Applies a 1D convolution over a quantized input signal composed of + several quantized input planes. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.Conv1d`. + + .. note:: + Only `zeros` is supported for the :attr:`padding_mode` argument. + + .. note:: + Only `torch.quint8` is supported for the input data type. + + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. 
+ scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + + See :class:`~torch.nn.Conv1d` for other attributes. + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> m = nn.quantized.Conv1d(16, 33, 3, stride=2) + >>> input = torch.randn(20, 16, 100) + >>> # quantize input to quint8 + >>> # xdoctest: +SKIP + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, + ... dtype=torch.quint8) + >>> output = m(q_input) + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d + _NNIQAT_CONV_BN_MODULE: ClassVar[Optional[type[nn.Module]]] = nniqat.ConvBn1d + _NNI_CONV_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = nni.ConvReLU1d + _NNI_CONV_ADD_MODULE: ClassVar[Optional[type[nn.Module]]] = None + _NNI_CONV_ADD_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = None + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _single(kernel_size) + stride = _single(stride) + # pyrefly: ignore [bad-assignment] + padding = padding if isinstance(padding, str) else _single(padding) + dilation = _single(dilation) + + # Subclasses of _ConvNd needs to call _init rather than __init__. 
See + # discussion on PR #49702 + super()._init( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _single(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "QuantizedConv1d" + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + if self.padding_mode == "zeros": + self._packed_params = torch.ops.quantized.conv1d_prepack( + w, b, self.stride, self.padding, self.dilation, self.groups + ) + else: + self._packed_params = torch.ops.quantized.conv1d_prepack( + w, b, self.stride, _pair(0), self.dilation, self.groups + ) + + def _weight_bias(self): + w, b = torch.ops.quantized.conv1d_unpack(self._packed_params) + return w, b + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 3: + raise ValueError("Input shape must be `(N, C, L)`!") + if self.padding_mode != "zeros": + # Padding in Conv1d is stored as (p, p), need to get (p,) + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1]) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return ops.quantized.conv1d( + input, self._packed_params, self.scale, self.zero_point + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module or qparams_dict. + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + """ + return _ConvNd.from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class Conv2d(_ConvNd): + r"""Applies a 2D convolution over a quantized input signal composed of + several quantized input planes. 
+ + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.Conv2d`. + + .. note:: + Only `zeros` is supported for the :attr:`padding_mode` argument. + + .. note:: + Only `torch.quint8` is supported for the input data type. + + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + + See :class:`~torch.nn.Conv2d` for other attributes. + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> # With square kernels and equal stride + >>> m = nn.quantized.Conv2d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.quantized.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> # non-square kernels and unequal stride and with padding and dilation + >>> m = nn.quantized.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + >>> input = torch.randn(20, 16, 50, 100) + >>> # quantize input to quint8 + >>> # xdoctest: +SKIP + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> output = m(q_input) + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d + _NNIQAT_CONV_BN_MODULE: ClassVar[Optional[type[nn.Module]]] = nniqat.ConvBn2d + _NNI_CONV_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = nni.ConvReLU2d + _NNI_CONV_ADD_MODULE: ClassVar[type[nni.ConvAdd2d]] = nni.ConvAdd2d + _NNI_CONV_ADD_RELU_MODULE: ClassVar[type[nni.ConvAddReLU2d]] = nni.ConvAddReLU2d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + # Subclasses of _ConvNd need to call _init 
rather than __init__. See + # discussion on PR #49702 + super()._init( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _pair(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "QuantizedConv2d" + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + if self.padding_mode == "zeros": + self._packed_params = torch.ops.quantized.conv2d_prepack( + w, b, self.stride, self.padding, self.dilation, self.groups + ) + else: + self._packed_params = torch.ops.quantized.conv2d_prepack( + w, b, self.stride, _pair(0), self.dilation, self.groups + ) + + def _weight_bias(self): + return self._packed_params.unpack() + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return ops.quantized.conv2d( + input, self._packed_params, self.scale, self.zero_point + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module or qparams_dict. + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + """ + return _ConvNd.from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +class Conv3d(_ConvNd): + r"""Applies a 3D convolution over a quantized input signal composed of + several quantized input planes. + + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.Conv3d`. + + .. 
note:: + Only `zeros` is supported for the :attr:`padding_mode` argument. + + .. note:: + Only `torch.quint8` is supported for the input data type. + + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + + See :class:`~torch.nn.Conv3d` for other attributes. + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> # With square kernels and equal stride + >>> m = nn.quantized.Conv3d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.quantized.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2)) + >>> # non-square kernels and unequal stride and with padding and dilation + >>> m = nn.quantized.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2)) + >>> input = torch.randn(20, 16, 56, 56, 56) + >>> # quantize input to quint8 + >>> # xdoctest: +SKIP + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> output = m(q_input) + + """ + + _FLOAT_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d + _NNIQAT_CONV_BN_MODULE: ClassVar[Optional[type[nn.Module]]] = nniqat.ConvBn3d + _NNI_CONV_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = nni.ConvReLU3d + _NNI_CONV_ADD_MODULE: ClassVar[Optional[type[nn.Module]]] = None + _NNI_CONV_ADD_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = None + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + ): + assert padding_mode != "reflect", "Conv3d does not support reflection padding" + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + # Subclasses of _ConvNd need to call _init rather than 
__init__. See + # discussion on PR #49702 + super()._init( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + False, + _triple(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "QuantizedConv3d" + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + if self.padding_mode == "zeros": + self._packed_params = torch.ops.quantized.conv3d_prepack( + w, b, self.stride, self.padding, self.dilation, self.groups + ) + else: + self._packed_params = torch.ops.quantized.conv3d_prepack( + w, b, self.stride, _triple(0), self.dilation, self.groups + ) + + def _weight_bias(self): + return self._packed_params.unpack() + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, D, H, W)`!") + if self.padding_mode != "zeros": + _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding) + input = F.pad( + input, _reversed_padding_repeated_twice, mode=self.padding_mode + ) + return ops.quantized.conv3d( + input, self._packed_params, self.scale, self.zero_point + ) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module or qparams_dict. 
+ + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + """ + return _ConvNd.from_float( + cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + + +# === Transposed Convolutions === + + +class _ConvTransposeNd(_ConvNd): + _FLOAT_MODULE: ClassVar[type[nn.modules.conv._ConvNd]] + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + device=None, + dtype=None, + ): + if padding_mode != "zeros": + raise ValueError( + f'Only "zeros" padding mode is supported for {self.__class__.__name__}' + ) + factory_kwargs = {"device": device, "dtype": dtype} + # Subclasses of _ConvNd need to call _init rather than __init__. See + # discussion on PR #49702 + super()._init( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _input_padding( + self, kernel_size: list[int], dilation: list[int], padding: list[int] + ) -> list[int]: + res = torch.jit.annotate(list[int], []) + for kdx in range(len(kernel_size)): + pad = dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx] + res.append(pad) + return res + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): # type: ignore[override] + r"""Creates a quantized module from a float module or qparams_dict. + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + """ + # derived classes override cls._FLOAT_MODULE attribute + msg = ( + " nnq." + + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ # type: ignore[attr-defined] + ) + assert type(mod) is cls._FLOAT_MODULE, msg + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined." 
+ weight_post_process = mod.qconfig.weight() # type: ignore[operator, union-attr] + weight_post_process(mod.weight) + assert weight_post_process.dtype == torch.qint8, ( + "Weight observer must have a dtype of qint8" + ) + qweight = _quantize_weight(mod.weight.float(), weight_post_process) + # the __init__ call used is the one from derived classes and not the one from _ConvTransposeNd + qconv = cls( + mod.in_channels, + mod.out_channels, + mod.kernel_size, # type: ignore[call-arg] + mod.stride, + mod.padding, + mod.output_padding, + mod.groups, + mod.bias is not None, + mod.dilation, + mod.padding_mode, + ) + qconv.set_weight_bias(qweight, mod.bias) + if ( + not hasattr(mod, "activation_post_process") + or mod.activation_post_process.dtype == torch.float + ): + return qconv # dynamic quantization doesn't need scale/zero_point + else: + act_scale, act_zp = mod.activation_post_process.calculate_qparams() # type: ignore[operator, union-attr] + qconv.scale = float(act_scale) + qconv.zero_point = int(act_zp) + return qconv + + @staticmethod + def from_reference(cls, ref_qconvt, output_scale, output_zero_point): # type: ignore[override] + r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module + Args: + ref_qconvt (Module): a reference quantized module, either produced by torch.ao.quantization + utilities or provided by the user + output_scale (float): scale for output Tensor + output_zero_point (int): zero point for output Tensor + """ + qconv = cls( + ref_qconvt.in_channels, + ref_qconvt.out_channels, + ref_qconvt.kernel_size, # type: ignore[arg-type] + ref_qconvt.stride, # type: ignore[arg-type] + ref_qconvt.padding, # type: ignore[arg-type] + ref_qconvt.output_padding, # type: ignore[arg-type] + ref_qconvt.groups, + ref_qconvt.bias is not None, # type: ignore[arg-type] + ref_qconvt.dilation, # type: ignore[arg-type] + ref_qconvt.padding_mode, + device=ref_qconvt.weight.device, + dtype=ref_qconvt.weight.dtype, + ) + qweight = 
ref_qconvt.get_quantized_weight() + qconv.set_weight_bias(qweight, ref_qconvt.bias) + qconv.scale = float(output_scale) + qconv.zero_point = int(output_zero_point) + return qconv + + +class ConvTranspose1d(_ConvTransposeNd): + r"""Applies a 1D transposed convolution operator over an input image + composed of several input planes. + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose1d`. + + .. note:: Currently only the QNNPACK engine is implemented. + Please, set the `torch.backends.quantized.engine = 'qnnpack'` + + For special notes, please, see :class:`~torch.ao.nn.quantized.Conv1d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose2d` for other attributes. + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> torch.backends.quantized.engine = 'qnnpack' + >>> from torch.ao.nn import quantized as nnq + >>> # With square kernels and equal stride + >>> m = nnq.ConvTranspose1d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nnq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> input = torch.randn(20, 16, 50) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> output = m(q_input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> downsample = nnq.Conv1d(16, 16, 3, stride=2, padding=1) + >>> upsample = nnq.ConvTranspose1d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(q_input) + >>> h.size() + torch.Size([1, 16, 6]) + >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + 
torch.Size([1, 16, 12]) + """ + + _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose1d]] = nn.ConvTranspose1d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _single(kernel_size) + stride = _single(stride) + padding = _single(padding) + dilation = _single(dilation) + output_padding = _single(output_padding) + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "QuantizedConvTranspose1d" + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack( + w, + b, + self.stride, + self.padding, + self.output_padding, + self.dilation, + self.groups, + ) + + def _weight_bias(self): + w, b = torch.ops.quantized.conv_transpose1d_unpack(self._packed_params) + return w, b + + def weight(self): + (w, _) = self._weight_bias() + return w + + def bias(self): + (_, b) = self._weight_bias() + return b + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 3: + raise ValueError("Input shape must be `(N, C, L)`!") + return torch.ops.quantized.conv_transpose1d( + input, self._packed_params, self.scale, self.zero_point + ) + + @classmethod + def from_reference(cls, ref_qconvt, output_scale, output_zero_point): # type: ignore[override] + return _ConvTransposeNd.from_reference( + cls, ref_qconvt, output_scale, output_zero_point + ) + + +class ConvTranspose2d(_ConvTransposeNd): + r"""Applies a 2D transposed convolution operator over an input image + composed of several input planes. 
+ For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose2d`. + + For special notes, please, see :class:`~torch.ao.nn.quantized.Conv2d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. + scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose2d` for other attributes. + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> # QNNPACK or FBGEMM as backend + >>> torch.backends.quantized.engine = 'qnnpack' + >>> # With square kernels and equal stride + >>> import torch.ao.nn.quantized as nnq + >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> input = torch.randn(20, 16, 50, 100) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> output = m(q_input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12, 12) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> downsample = nnq.Conv2d(16, 16, 3, stride=2, padding=1) + >>> upsample = nnq.ConvTranspose2d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(q_input) + >>> h.size() + torch.Size([1, 16, 6, 6]) + >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12, 12]) + """ + + _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose2d]] = nn.ConvTranspose2d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _pair(kernel_size) + 
stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + output_padding = _pair(output_padding) + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "QuantizedConvTranspose2d" + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack( + w, + b, + self.stride, + self.padding, + self.output_padding, + self.dilation, + self.groups, + ) + + def _weight_bias(self): + w, b = torch.ops.quantized.conv2d_unpack(self._packed_params) + return w, b + + def weight(self): + (w, _) = self._weight_bias() + return w + + def bias(self): + (_, b) = self._weight_bias() + return b + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + return ops.quantized.conv_transpose2d( + input, self._packed_params, self.scale, self.zero_point + ) + + @classmethod + def from_reference(cls, ref_qconvt, output_scale, output_zero_point): # type: ignore[override] + return _ConvTransposeNd.from_reference( + cls, ref_qconvt, output_scale, output_zero_point + ) + + +class ConvTranspose3d(_ConvTransposeNd): + r"""Applies a 3D transposed convolution operator over an input image + composed of several input planes. + For details on input arguments, parameters, and implementation see + :class:`~torch.nn.ConvTranspose3d`. + + .. note:: Currently only the FBGEMM engine is implemented. + Please, set the `torch.backends.quantized.engine = 'fbgemm'` + + For special notes, please, see :class:`~torch.ao.nn.quantized.Conv3d` + + Attributes: + weight (Tensor): packed tensor derived from the learnable weight + parameter. 
+ scale (Tensor): scalar for the output scale + zero_point (Tensor): scalar for the output zero point + See :class:`~torch.nn.ConvTranspose3d` for other attributes. + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> torch.backends.quantized.engine = 'fbgemm' + >>> from torch.ao.nn import quantized as nnq + >>> # With cubic kernels and equal stride + >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2) + >>> # non-cubic kernels and unequal stride and with padding + >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2)) + >>> input = torch.randn(20, 16, 50, 100, 100) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> output = m(q_input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12, 12, 12) + >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8) + >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1) + >>> upsample = nnq.ConvTranspose3d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(q_input) + >>> h.size() + torch.Size([1, 16, 6, 6, 6]) + >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12, 12, 12]) + """ + + _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose3d]] = nn.ConvTranspose3d + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + output_padding = _triple(output_padding) + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + 
padding_mode, + **factory_kwargs, + ) + + def _get_name(self): + return "QuantizedConvTranspose3d" + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack( + w, + b, + self.stride, + self.padding, + self.output_padding, + self.dilation, + self.groups, + ) + + def _weight_bias(self): + w, b = torch.ops.quantized.conv3d_unpack(self._packed_params) + return w, b + + def weight(self): + (w, _) = self._weight_bias() + return w + + def bias(self): + (_, b) = self._weight_bias() + return b + + def forward(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, T, H, W)`!") + return ops.quantized.conv_transpose3d( + input, self._packed_params, self.scale, self.zero_point + ) + + @classmethod + def from_reference(cls, ref_qconvt, output_scale, output_zero_point): # type: ignore[override] + return _ConvTransposeNd.from_reference( + cls, ref_qconvt, output_scale, output_zero_point + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/dropout.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/dropout.py new file mode 100644 index 0000000000000000000000000000000000000000..3744ca30d5a49ba92cbb86690f2683af02d594fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/dropout.py @@ -0,0 +1,30 @@ +# mypy: allow-untyped-defs +import torch + + +__all__ = ["Dropout"] + + +class Dropout(torch.nn.Dropout): + r"""This is the quantized equivalent of :class:`~torch.nn.Dropout`. + And this is a placeholder to enable models where fp32 tensors + had dropout to work with quantized tensors in train and eval mode. 
+ + Args: + p: probability of an element to be zeroed + inplace: can optionally do the operation in-place. Default: ``False`` + """ + + def forward(self, input): + return input + + def _get_name(self): + return "QuantizedDropout" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + return cls(mod.p, mod.inplace) + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls(mod.p, mod.inplace) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7e843653ed27a49fa62d0f7e3408a7ac04f48fdf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py @@ -0,0 +1,413 @@ +# mypy: allow-untyped-defs +import torch +import torch.nn as nn +from torch import Tensor # noqa: F401 +from torch._jit_internal import List, Optional # noqa: F401 + +from .utils import _hide_packed_params_repr, _quantize_weight + + +__all__ = ["EmbeddingPackedParams", "Embedding", "EmbeddingBag"] + + +class EmbeddingPackedParams(torch.nn.Module): + _version = 1 + + def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8): + super().__init__() + self.dtype = dtype + if self.dtype in [torch.quint8, torch.quint4x2]: + scales = torch.ones(num_embeddings, dtype=torch.float) + zero_points = torch.zeros(num_embeddings, dtype=torch.float) + wq = torch._empty_per_channel_affine_quantized( + [num_embeddings, embedding_dim], + scales=scales, + zero_points=zero_points, + axis=0, + dtype=self.dtype, + ) + self.set_weight(wq) + else: + raise NotImplementedError( + f"Unsupported dtype on quantized embedding! Supports quint8 and quint4x2. 
Got dtype: {dtype}" + ) + + @torch.jit.export + def set_weight(self, weight: torch.Tensor) -> None: + if self.dtype in [torch.quint8, torch.quint4x2]: + self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight) + else: + raise NotImplementedError( + "Unsupported dtype for quantized embedding prepack! Supports quint8 and quint4x2." + ) + + @torch.jit.export + def _weight(self): + if self.dtype in [torch.quint8, torch.quint4x2]: + return torch.ops.quantized.embedding_bag_unpack(self._packed_weight) + else: + raise NotImplementedError( + "Unsupported dtype for quantized embedding unpack! Supports quint8 and quint4x2." + ) + + def forward(self, x): + return x + + # Version 1 + # self + # |--- _packed_weight : Tensor representing weight of EmbeddingPackedParamsBase + # |--- dtype : torch.dtype + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "dtype"] = self.dtype + destination[prefix + "_packed_weight"] = self._weight() + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + self.dtype = state_dict[prefix + "dtype"] + state_dict.pop(prefix + "dtype") + + weight = state_dict[prefix + "_packed_weight"] + state_dict.pop(prefix + "_packed_weight") + self.set_weight(weight) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def __repr__(self): + return self._weight().__repr__() + + +class Embedding(torch.nn.Module): + r""" + A quantized Embedding module with quantized packed weights as inputs. + We adopt the same interface as `torch.nn.Embedding`, please see + https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html for documentation. 
+ + Similar to :class:`~torch.nn.Embedding`, attributes will be randomly + initialized at module creation time and will be overwritten later + + Attributes: + weight (Tensor): the non-learnable quantized weights of the module of + shape :math:`(\text{num\_embeddings}, \text{embedding\_dim})`. + + Examples:: + >>> m = nn.quantized.Embedding(num_embeddings=10, embedding_dim=12) + >>> indices = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8]) + >>> output = m(indices) + >>> print(output.size()) + torch.Size([9, 12]) + + """ + + _version = 1 + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Optional[Tensor] = None, + dtype=torch.quint8, + ) -> None: + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.dtype = dtype + + if _weight is None: + scales = torch.ones(num_embeddings, dtype=torch.float) + zero_points = torch.zeros(num_embeddings, dtype=torch.float) + qweight = torch._empty_per_channel_affine_quantized( + [num_embeddings, embedding_dim], + scales=scales, + zero_points=zero_points, + axis=0, + dtype=torch.quint8, + ) + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "Shape of weight does not match num_embeddings and embedding_dim" + qweight = _weight + + self._packed_params = EmbeddingPackedParams( + num_embeddings, embedding_dim, dtype + ) + self._packed_params.set_weight(qweight) + + def forward(self, indices: Tensor) -> Tensor: + if self.dtype == torch.quint4x2: + return torch.ops.quantized.embedding_4bit( + self._packed_params._packed_weight, indices + ) + else: + return torch.ops.quantized.embedding_byte( + self._packed_params._packed_weight, indices + ) + + def _get_name(self): + return "QuantizedEmbedding" + + def __repr__(self): + return _hide_packed_params_repr(self, EmbeddingPackedParams) + 
+ def extra_repr(self): + extra_repr_str = ( + f"num_embeddings={self.num_embeddings}, embedding_dim={self.embedding_dim}, " + f"dtype={self._packed_params.dtype}, qscheme={self.weight().qscheme()}" + ) + + return extra_repr_str + + def set_weight(self, w: torch.Tensor) -> None: + self._packed_params.set_weight(w) + + def weight(self): + return self._packed_params._weight() + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a quantized embedding module from a float module + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by user + """ + if hasattr(mod, "weight_fake_quant"): + assert type(mod) is torch.ao.nn.qat.Embedding, ( + "nnq." + + cls.__name__ + + ".from_float " + + "with fake quant only works for " + + torch.ao.nn.qat.Embedding.__name__ + ) + weight_observer = mod.weight_fake_quant + else: + assert type(mod) is nn.Embedding, ( + "nnq." + + cls.__name__ + + ".from_float only works for " + + nn.Embedding.__name__ + ) + assert hasattr(mod, "qconfig"), ( + "Embedding input float module must have qconfig defined" + ) + from torch.ao.quantization import float_qparams_weight_only_qconfig + + if mod.qconfig is not None and mod.qconfig.weight is not None: # type: ignore[union-attr] + weight_observer = mod.qconfig.weight() # type: ignore[union-attr, operator] + else: + weight_observer = float_qparams_weight_only_qconfig.weight() + + dtype = weight_observer.dtype + is_float_qparams_qconfig = ( + weight_observer.qscheme == torch.per_channel_affine_float_qparams + ) + assert is_float_qparams_qconfig, ( + "Embedding quantization is only supported with float_qparams_weight_only_qconfig." + ) + + assert dtype == torch.quint8 or dtype == torch.quint4x2, ( + f"The only supported dtype for nnq.Embedding is torch.quint8 and torch.quint4x2, got {dtype}" + ) + + # Run the observer to calculate qparams. 
+ weight_observer(mod.weight) + qweight = _quantize_weight(mod.weight.float(), weight_observer) + + # Create quantized Embedding module and pass in the quantized weight + qembedding = Embedding(mod.num_embeddings, mod.embedding_dim) + qembedding.set_weight(qweight) + return qembedding + + @classmethod + def from_reference(cls, ref_embedding): + qembedding = cls( + ref_embedding.num_embeddings, + ref_embedding.embedding_dim, + ref_embedding.padding_idx, + ref_embedding.max_norm, + ref_embedding.norm_type, + ref_embedding.scale_grad_by_freq, + ref_embedding.sparse, + ref_embedding.get_quantized_weight(), + ref_embedding.weight_dtype, + ) + return qembedding + + +class EmbeddingBag(Embedding): + r""" + A quantized EmbeddingBag module with quantized packed weights as inputs. + We adopt the same interface as `torch.nn.EmbeddingBag`, please see + https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html for documentation. + + Similar to :class:`~torch.nn.EmbeddingBag`, attributes will be randomly + initialized at module creation time and will be overwritten later + + Attributes: + weight (Tensor): the non-learnable quantized weights of the module of + shape :math:`(\text{num\_embeddings}, \text{embedding\_dim})`. 
+ + Examples:: + >>> m = nn.quantized.EmbeddingBag(num_embeddings=10, embedding_dim=12, include_last_offset=True, mode='sum') + >>> indices = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8, 6, 6, 9, 1, 6, 8, 8, 3, 2, 3, 6, 3, 6, 5, 7, 0, 8, 4, 6, 5, 8, 2, 3]) + >>> offsets = torch.tensor([0, 19, 20, 28, 28, 32]) + >>> output = m(indices, offsets) + >>> print(output.size()) + torch.Size([5, 12]) + + """ + + _version = 1 + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "sum", + sparse: bool = False, + _weight: Optional[Tensor] = None, + include_last_offset: bool = False, + dtype=torch.quint8, + ) -> None: + super().__init__(num_embeddings, embedding_dim, _weight=_weight, dtype=dtype) + + self.mode = mode + self.pruned_weights = False + self.include_last_offset = include_last_offset + self.dtype = dtype + + def forward( + self, + indices: Tensor, + offsets: Optional[Tensor] = None, + per_sample_weights: Optional[Tensor] = None, + compressed_indices_mapping: Optional[Tensor] = None, + ) -> Tensor: + if self.dtype == torch.quint4x2: + return torch.ops.quantized.embedding_bag_4bit( + self._packed_params._packed_weight, + indices, + offsets, + False, + 0, + self.pruned_weights, + per_sample_weights, + compressed_indices_mapping, + self.include_last_offset, + ) + else: + return torch.ops.quantized.embedding_bag_byte( + self._packed_params._packed_weight, + indices, + offsets, + False, + 0, + self.pruned_weights, + per_sample_weights, + compressed_indices_mapping, + self.include_last_offset, + ) + + def _get_name(self): + return "QuantizedEmbeddingBag" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a quantized embedding_bag module from a float module + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by user + """ + if hasattr(mod, 
"weight_fake_quant"): + weight_observer = mod.weight_fake_quant + else: + assert type(mod) is nn.EmbeddingBag, ( + "nnq." + + cls.__name__ + + ".from_float only works for " + + nn.EmbeddingBag.__name__ + ) + assert hasattr(mod, "qconfig"), ( + "EmbeddingBag input float module must have qconfig defined" + ) + from torch.ao.quantization.qconfig import float_qparams_weight_only_qconfig + + if mod.qconfig is not None and mod.qconfig.weight is not None: # type: ignore[union-attr] + weight_observer = mod.qconfig.weight() # type: ignore[union-attr, operator] + else: + weight_observer = float_qparams_weight_only_qconfig.weight() + + dtype = weight_observer.dtype + is_float_qparams_qconfig = ( + weight_observer.qscheme == torch.per_channel_affine_float_qparams + ) + assert is_float_qparams_qconfig, ( + "EmbeddingBag quantization is only supported with float_qparams_weight_only_qconfig." + ) + + assert dtype == torch.quint8 or dtype == torch.quint4x2, ( + f"The only supported dtype for nnq.EmbeddingBag is torch.quint8 and torch.quint4x2, got {dtype}" + ) + + # Run the observer to calculate qparams. 
+ weight_observer(mod.weight) + qweight = _quantize_weight(mod.weight.float(), weight_observer) + + # Create quantized EmbeddingBag module and pass in the quantized weight + qembedding_bag = EmbeddingBag( + mod.num_embeddings, + mod.embedding_dim, + max_norm=mod.max_norm, + norm_type=mod.norm_type, + scale_grad_by_freq=mod.scale_grad_by_freq, + mode=mod.mode, + sparse=mod.sparse, + include_last_offset=mod.include_last_offset, + dtype=dtype, + ) + qembedding_bag.set_weight(qweight) + return qembedding_bag + + @classmethod + def from_reference(cls, ref_embedding_bag): + qembedding_bag = cls( + ref_embedding_bag.num_embeddings, + ref_embedding_bag.embedding_dim, + ref_embedding_bag.max_norm, + ref_embedding_bag.norm_type, + ref_embedding_bag.scale_grad_by_freq, + ref_embedding_bag.mode, + ref_embedding_bag.sparse, + ref_embedding_bag.get_quantized_weight(), + ref_embedding_bag.include_last_offset, + ref_embedding_bag.weight_dtype, + ) + return qembedding_bag diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/functional_modules.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/functional_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..acb578d0cc7989ecedd92fcb30664d50b4c18f87 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/functional_modules.py @@ -0,0 +1,298 @@ +# mypy: allow-untyped-defs + +import torch +from torch import Tensor +from torch._ops import ops + + +__all__ = ["FloatFunctional", "FXFloatFunctional", "QFunctional"] + + +class FloatFunctional(torch.nn.Module): + r"""State collector class for float operations. + + The instance of this class can be used instead of the ``torch.`` prefix for + some operations. See example usage below. + + .. note:: + + This class does not provide a ``forward`` hook. Instead, you must use + one of the underlying functions (e.g. ``add``). 
+ + Examples:: + + >>> f_add = FloatFunctional() + >>> a = torch.tensor(3.0) + >>> b = torch.tensor(4.0) + >>> f_add.add(a, b) # Equivalent to ``torch.add(a, b)`` + + Valid operation names: + - add + - cat + - mul + - add_relu + - add_scalar + - mul_scalar + """ + + def __init__(self) -> None: + super().__init__() + self.activation_post_process = torch.nn.Identity() + + def forward(self, x): + raise RuntimeError( + "FloatFunctional is not intended to use the " + + "'forward'. Please use the underlying operation" + ) + + r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" + + def add(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.add(x, y) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.add(Tensor, float)``""" + + def add_scalar(self, x: Tensor, y: float) -> Tensor: + r = torch.add(x, y) + # Note: this operation is not observed because the observation is not + # needed for the quantized op. + return r + + r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" + + def mul(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.mul(x, y) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.mul(Tensor, float)``""" + + def mul_scalar(self, x: Tensor, y: float) -> Tensor: + r = torch.mul(x, y) + # Note: this operation is not observed because the observation is not + # needed for the quantized op. 
+ return r + + r"""Operation equivalent to ``torch.cat``""" + + def cat(self, x: list[Tensor], dim: int = 0) -> Tensor: + r = torch.cat(x, dim=dim) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``relu(torch.add(x,y))``""" + + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.add(x, y) + r = torch.nn.functional.relu(r) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.matmul(Tensor, Tensor)``""" + + def matmul(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.matmul(x, y) + r = self.activation_post_process(r) + return r + + +class FXFloatFunctional(torch.nn.Module): + r"""module to replace FloatFunctional module before FX graph mode quantization, + since activation_post_process will be inserted in top level module directly + + Valid operation names: + - add + - cat + - mul + - add_relu + - add_scalar + - mul_scalar + """ + + def forward(self, x): + raise RuntimeError( + "FloatFunctional is not intended to use the " + + "'forward'. 
Please use the underlying operation" + ) + + r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" + + def add(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.add(x, y) + return r + + r"""Operation equivalent to ``torch.add(Tensor, float)``""" + + def add_scalar(self, x: Tensor, y: float) -> Tensor: + r = torch.add(x, y) + return r + + r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" + + def mul(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.mul(x, y) + return r + + r"""Operation equivalent to ``torch.mul(Tensor, float)``""" + + def mul_scalar(self, x: Tensor, y: float) -> Tensor: + r = torch.mul(x, y) + return r + + r"""Operation equivalent to ``torch.cat``""" + + def cat(self, x: list[Tensor], dim: int = 0) -> Tensor: + r = torch.cat(x, dim=dim) + return r + + r"""Operation equivalent to ``relu(torch.add(x,y))``""" + + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.add(x, y) + r = torch.nn.functional.relu(r) + return r + + r"""Operation equivalent to ``torch.matmul(Tensor, Tensor)``""" + + def matmul(self, x: Tensor, y: Tensor) -> Tensor: + r = torch.matmul(x, y) + return r + + +class QFunctional(torch.nn.Module): + r"""Wrapper class for quantized operations. + + The instance of this class can be used instead of the + ``torch.ops.quantized`` prefix. See example usage below. + + .. note:: + + This class does not provide a ``forward`` hook. Instead, you must use + one of the underlying functions (e.g. ``add``). 
+ + Examples:: + + >>> q_add = QFunctional() + >>> # xdoctest: +SKIP + >>> a = torch.quantize_per_tensor(torch.tensor(3.0), 1.0, 0, torch.qint32) + >>> b = torch.quantize_per_tensor(torch.tensor(4.0), 1.0, 0, torch.qint32) + >>> q_add.add(a, b) # Equivalent to ``torch.ops.quantized.add(a, b, 1.0, 0)`` + + Valid operation names: + - add + - cat + - mul + - add_relu + - add_scalar + - mul_scalar + """ + + def __init__(self) -> None: + super().__init__() + self.scale = 1.0 + self.zero_point = 0 + self.activation_post_process = torch.nn.Identity() + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "scale"] = torch.tensor(self.scale) + destination[prefix + "zero_point"] = torch.tensor(self.zero_point) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + self.scale = float(state_dict.pop(prefix + "scale")) + self.zero_point = int(state_dict.pop(prefix + "zero_point")) + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def _get_name(self): + return "QFunctional" + + def extra_repr(self): + return f"scale={self.scale}, zero_point={self.zero_point}" + + def forward(self, x): + raise RuntimeError( + "Functional is not intended to use the " + + "'forward'. 
Please use the underlying operation" + ) + + r"""Operation equivalent to ``torch.ops.quantized.add``""" + + def add(self, x: Tensor, y: Tensor) -> Tensor: + r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``""" + + def add_scalar(self, x: Tensor, y: float) -> Tensor: + r = ops.quantized.add_scalar(x, y) + # Note: this operation is not observed because the observation is not + # needed for the quantized op. + return r + + r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``""" + + def mul(self, x: Tensor, y: Tensor) -> Tensor: + r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``""" + + def mul_scalar(self, x: Tensor, y: float) -> Tensor: + r = ops.quantized.mul_scalar(x, y) + # Note: this operation is not observed because the observation is not + # needed for the quantized op. + return r + + r"""Operation equivalent to ``torch.ops.quantized.cat``""" + + def cat(self, x: list[Tensor], dim: int = 0) -> Tensor: + r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.ops.quantized.add_relu``""" + + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: + r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point) + r = self.activation_post_process(r) + return r + + r"""Operation equivalent to ``torch.ops.quantized.matmul(Tensor, Tensor)``""" + + def matmul(self, x: Tensor, y: Tensor) -> Tensor: + r = ops.quantized.matmul(x, y, scale=self.scale, zero_point=self.zero_point) + # Note: this operation is not observed because the observation is not + # needed for the quantized op. 
+ return r + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + assert type(mod) is FloatFunctional, ( + "QFunctional.from_float expects an instance of FloatFunctional" + ) + scale, zero_point = mod.activation_post_process.calculate_qparams() # type: ignore[operator] + new_mod = QFunctional() + new_mod.scale = float(scale) + new_mod.zero_point = int(zero_point) + return new_mod diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..84fa07b4a02207a34c16747d52d7283ad2ecfc8f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/linear.py @@ -0,0 +1,361 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +from collections.abc import Iterable + +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.intrinsic.qat as nniqat +import torch.nn as nn +from torch.nn.utils.fusion import fuse_linear_bn_weights +from torch.nn.utils.parametrize import type_before_parametrizations + +from .utils import _hide_packed_params_repr, _quantize_weight, WeightedQuantizedModule + + +__all__ = ["LinearPackedParams", "Linear"] + + +class LinearPackedParams(torch.nn.Module): + _version = 3 + + def __init__(self, dtype=torch.qint8): + super().__init__() + self.dtype = dtype + if self.dtype == torch.qint8: + wq = torch._empty_affine_quantized( + [1, 1], scale=1.0, zero_point=0, dtype=torch.qint8 + ) + elif self.dtype == torch.float16: + wq = torch.zeros([1, 1], dtype=torch.float) + self.set_weight_bias(wq, None) # type: ignore[possibly-undefined] + + @torch.jit.export + def set_weight_bias(self, weight: torch.Tensor, bias: torch.Tensor | None) -> None: + if self.dtype == torch.qint8: + self._packed_params = torch.ops.quantized.linear_prepack(weight, bias) + elif 
self.dtype == torch.float16: + self._packed_params = torch.ops.quantized.linear_prepack_fp16(weight, bias) + else: + raise RuntimeError("Unsupported dtype on dynamic quantized linear!") + + @torch.jit.export + def _weight_bias(self): + if self.dtype == torch.qint8: + return torch.ops.quantized.linear_unpack(self._packed_params) + elif self.dtype == torch.float16: + return torch.ops.quantized.linear_unpack_fp16(self._packed_params) + else: + raise RuntimeError("Unsupported dtype on dynamic quantized linear!") + + def forward(self, x): + return x + + # Version 1 + # self + # |--- weight : Tensor + # |--- bias : Tensor + # + # Version 2 + # self + # |--- weight : Tensor + # |--- bias : Tensor + # |--- dtype : torch.dtype + # + # Version 3 + # self + # |--- _packed_params : (Tensor, Tensor) representing (weight, bias) + # of LinearPackedParams + # |--- dtype : torch.dtype + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "dtype"] = self.dtype + destination[prefix + "_packed_params"] = self._weight_bias() + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + self.dtype = torch.qint8 + else: + self.dtype = state_dict[prefix + "dtype"] + state_dict.pop(prefix + "dtype") + + if version is None or version < 3: + self.set_weight_bias( + state_dict[prefix + "weight"], state_dict[prefix + "bias"] + ) + state_dict.pop(prefix + "weight") + state_dict.pop(prefix + "bias") + + if version == 3: + weight, bias = state_dict[prefix + "_packed_params"] + state_dict.pop(prefix + "_packed_params") + self.set_weight_bias(weight, bias) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def __repr__(self): + return 
self._weight_bias().__repr__() + + +class Linear(WeightedQuantizedModule): + r""" + A quantized linear module with quantized tensor as inputs and outputs. + We adopt the same interface as `torch.nn.Linear`, please see + https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation. + + Similar to :class:`~torch.nn.Linear`, attributes will be randomly + initialized at module creation time and will be overwritten later + + Attributes: + weight (Tensor): the non-learnable quantized weights of the module of + shape :math:`(\text{out\_features}, \text{in\_features})`. + bias (Tensor): the non-learnable bias of the module of shape :math:`(\text{out\_features})`. + If :attr:`bias` is ``True``, the values are initialized to zero. + scale: `scale` parameter of output Quantized Tensor, type: double + zero_point: `zero_point` parameter for output Quantized Tensor, type: long + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> m = nn.quantized.Linear(20, 30) + >>> input = torch.randn(128, 20) + >>> # xdoctest: +SKIP + >>> input = torch.quantize_per_tensor(input, 1.0, 0, torch.quint8) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + _version = 3 + _FLOAT_MODULE = (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear) + + def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8): + super().__init__() + # We don't muck around with buffers or attributes or anything here + # to keep the module simple. *everything* is simply a Python attribute. 
+ # Serialization logic is explicitly handled in the below serialization and + # deserialization modules + self.in_features = in_features + self.out_features = out_features + bias = None + if bias_: + bias = torch.zeros(out_features, dtype=torch.float) + + if dtype == torch.qint8: + qweight = torch._empty_affine_quantized( + [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8 + ) + elif dtype == torch.float16: + qweight = torch.zeros([out_features, in_features], dtype=torch.float) + else: + raise RuntimeError("Unsupported dtype specified for quantized Linear!") + + self._packed_params = LinearPackedParams(dtype) + self._packed_params.set_weight_bias(qweight, bias) + self.scale = 1.0 + self.zero_point = 0 + + def _get_name(self): + return "QuantizedLinear" + + def extra_repr(self): + return ( + f"in_features={self.in_features}, out_features={self.out_features}, scale={self.scale}, " + f"zero_point={self.zero_point}, qscheme={self.weight().qscheme()}" + ) + + def __repr__(self): + return _hide_packed_params_repr(self, LinearPackedParams) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.quantized.linear( + x, self._packed_params._packed_params, self.scale, self.zero_point + ) + + # ===== Serialization methods ===== + # The special consideration here is that we have to unpack the weights into their + # regular QTensor form for serialization. Packed weights should not live + # outside the process in which they were created, rather they should be derived + # from the QTensor weight. 
+ # + # Version 1 + # self + # |--- scale : float + # |--- zero_point : int + # |--- weight : Tensor + # |--- bias : Tensor + # + # Version 2 + # self + # |--- scale : float + # |--- zero_point : int + # |--- _packed_params : Module + # |--- weight : Tensor + # |--- bias : Tensor + # + # Version 3 + # self + # |--- scale : float + # |--- zero_point : int + # |--- _packed_params : Module + # |--- _packed_params : (Tensor, Tensor) representing weight, bias + # of LinearPackedParams C++ struct + # + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "scale"] = torch.tensor(self.scale) + destination[prefix + "zero_point"] = torch.tensor(self.zero_point) + + # ===== Deserialization methods ===== + # Counterpart to the serialization methods, we must pack the serialized QTensor + # weight into its packed format for use by the FBGEMM ops. + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + self.scale = float(state_dict[prefix + "scale"]) + state_dict.pop(prefix + "scale") + + self.zero_point = int(state_dict[prefix + "zero_point"]) + state_dict.pop(prefix + "zero_point") + + version = local_metadata.get("version", None) + + if version is None or version == 1: + # We moved the parameters into a LinearPackedParameters submodule + weight = state_dict.pop(prefix + "weight") + bias = state_dict.pop(prefix + "bias") + state_dict.update( + { + prefix + "_packed_params.weight": weight, + prefix + "_packed_params.bias": bias, + } + ) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + # Function rather than property to make sure that JIT serialization doesn't + # register this as an attribute + def _weight_bias(self): + return self._packed_params._weight_bias() + + def weight(self): + return 
self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def set_weight_bias(self, w: torch.Tensor, b: torch.Tensor | None) -> None: + self._packed_params.set_weight_bias(w, b) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a quantized module from an observed float module + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + use_precomputed_fake_quant (bool): if True, the module will reuse min/max + values from the precomputed fake quant module. + """ + if hasattr(mod, "weight_fake_quant"): + if type_before_parametrizations(mod) == nniqat.LinearBn1d: + mod.weight, mod.bias = fuse_linear_bn_weights( + mod.weight, + mod.bias, + mod.bn.running_mean, + mod.bn.running_var, + mod.bn.eps, + mod.bn.weight, + mod.bn.bias, + ) + weight_post_process = mod.weight_fake_quant + activation_post_process = mod.activation_post_process + else: + # This function does not participate in JIT, so it is OK to ignore + # the type mismatch in assignment. Also, mypy has an issue with + # iterables not being implemented, so we are ignoring those too. 
+ if not isinstance(cls._FLOAT_MODULE, Iterable): + # pyrefly: ignore [bad-assignment] + cls._FLOAT_MODULE = [cls._FLOAT_MODULE] + supported_modules = ", ".join( + [float_mod.__name__ for float_mod in cls._FLOAT_MODULE] + ) + error_msg = f"nnq.{cls.__name__}.from_float only works for {supported_modules}, but got: {type(mod)}" + assert type_before_parametrizations(mod) in cls._FLOAT_MODULE, ( + error_msg.format() + ) + assert hasattr(mod, "qconfig"), ( + "Input float module must have qconfig defined" + ) + activation_post_process = mod.activation_post_process + if type_before_parametrizations(mod) == nni.LinearReLU: + mod = mod[0] + weight_post_process = ( + mod.qconfig.weight() + if not hasattr(mod, "weight_fake_quant") + else mod.weight_fake_quant + ) + + if not use_precomputed_fake_quant: + # Observer may not have been called yet + # Observer might have been called in the previous stage via PTQ algorithm e.g. AdaRound + weight_post_process(mod.weight) + dtype = weight_post_process.dtype + act_scale, act_zp = activation_post_process.calculate_qparams() + assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8" + qweight = _quantize_weight(mod.weight.float(), weight_post_process) + qlinear = cls(mod.in_features, mod.out_features, dtype=dtype) + qlinear.set_weight_bias(qweight, mod.bias) + qlinear.scale = float(act_scale) + qlinear.zero_point = int(act_zp) + return qlinear + + @classmethod + def from_reference(cls, ref_qlinear, output_scale, output_zero_point): + r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module + + Args: + ref_qlinear (Module): a reference quantized linear module, either produced by torch.ao.quantization + utilities or provided by the user + output_scale (float): scale for output Tensor + output_zero_point (int): zero point for output Tensor + """ + qlinear = cls(ref_qlinear.in_features, ref_qlinear.out_features) + qweight = ref_qlinear.get_quantized_weight() + qlinear.set_weight_bias(qweight, 
ref_qlinear.bias) + + qlinear.scale = float(output_scale) + qlinear.zero_point = int(output_zero_point) + return qlinear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/normalization.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..fa335b4699db5519e2e53f27aa18958b5afced94 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/normalization.py @@ -0,0 +1,358 @@ +# mypy: allow-untyped-defs +import torch + + +__all__ = [ + "LayerNorm", + "GroupNorm", + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", +] + + +class LayerNorm(torch.nn.LayerNorm): + r"""This is the quantized version of :class:`~torch.nn.LayerNorm`. + + Additional args: + * **scale** - quantization scale of the output, type: double. + * **zero_point** - quantization zero point of the output, type: long. 
+ + """ + + def __init__( + self, + normalized_shape, + weight, + bias, + scale, + zero_point, + eps=1e-5, + elementwise_affine=True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + normalized_shape, + eps=eps, + elementwise_affine=elementwise_affine, + # pyrefly: ignore [bad-argument-type] + **factory_kwargs, + ) + self.weight = weight + self.bias = bias + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.layer_norm( + input, + self.normalized_shape, + weight=self.weight, + bias=self.bias, + eps=self.eps, + output_scale=self.scale, + output_zero_point=self.zero_point, + ) + + def _get_name(self): + return "QuantizedLayerNorm" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + new_mod = cls( + mod.normalized_shape, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.elementwise_affine, + ) + return new_mod + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls( + mod.normalized_shape, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.elementwise_affine, + ) + + +class GroupNorm(torch.nn.GroupNorm): + r"""This is the quantized version of :class:`~torch.nn.GroupNorm`. + + Additional args: + * **scale** - quantization scale of the output, type: double. + * **zero_point** - quantization zero point of the output, type: long. 
+ + """ + + __constants__ = ["num_groups", "num_channels", "eps", "affine"] + + def __init__( + self, + num_groups, + num_channels, + weight, + bias, + scale, + zero_point, + eps=1e-5, + affine=True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(num_groups, num_channels, eps, affine, **factory_kwargs) + self.weight = weight + self.bias = bias + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.group_norm( + input, + self.num_groups, + self.weight, + self.bias, + self.eps, + self.scale, + self.zero_point, + ) + + def _get_name(self): + return "QuantizedGroupNorm" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + new_mod = cls( + mod.num_groups, + mod.num_channels, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) + return new_mod + + +class InstanceNorm1d(torch.nn.InstanceNorm1d): + r"""This is the quantized version of :class:`~torch.nn.InstanceNorm1d`. + + Additional args: + * **scale** - quantization scale of the output, type: double. + * **zero_point** - quantization zero point of the output, type: long. 
+ + """ + + def __init__( + self, + num_features, + weight, + bias, + scale, + zero_point, + eps=1e-5, + momentum=0.1, + affine=False, + track_running_stats=False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + self.weight = weight + self.bias = bias + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.instance_norm( + input, self.weight, self.bias, self.eps, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedInstanceNorm1d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + new_mod = cls( + mod.num_features, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) + return new_mod + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls( + mod.num_features, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) + + +class InstanceNorm2d(torch.nn.InstanceNorm2d): + r"""This is the quantized version of :class:`~torch.nn.InstanceNorm2d`. + + Additional args: + * **scale** - quantization scale of the output, type: double. + * **zero_point** - quantization zero point of the output, type: long. 
+ + """ + + def __init__( + self, + num_features, + weight, + bias, + scale, + zero_point, + eps=1e-5, + momentum=0.1, + affine=False, + track_running_stats=False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + self.weight = weight + self.bias = bias + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.instance_norm( + input, self.weight, self.bias, self.eps, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedInstanceNorm2d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + new_mod = cls( + mod.num_features, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) + return new_mod + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls( + mod.num_features, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) + + +class InstanceNorm3d(torch.nn.InstanceNorm3d): + r"""This is the quantized version of :class:`~torch.nn.InstanceNorm3d`. + + Additional args: + * **scale** - quantization scale of the output, type: double. + * **zero_point** - quantization zero point of the output, type: long. 
+ + """ + + def __init__( + self, + num_features, + weight, + bias, + scale, + zero_point, + eps=1e-5, + momentum=0.1, + affine=False, + track_running_stats=False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + self.weight = weight + self.bias = bias + # pyrefly: ignore [bad-argument-type] + self.register_buffer("scale", torch.tensor(scale, **factory_kwargs)) + # pyrefly: ignore [bad-argument-type] + self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs)) + + def forward(self, input): + return torch.ops.quantized.instance_norm( + input, self.weight, self.bias, self.eps, self.scale, self.zero_point + ) + + def _get_name(self): + return "QuantizedInstanceNorm3d" + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + scale, zero_point = mod.activation_post_process.calculate_qparams() + new_mod = cls( + mod.num_features, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) + return new_mod + + @classmethod + def from_reference(cls, mod, scale, zero_point): + return cls( + mod.num_features, + mod.weight, + mod.bias, + float(scale), + int(zero_point), + mod.eps, + mod.affine, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..5040b8c97d050102779c742989dd4f52cd3bffa8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/rnn.py @@ -0,0 +1,59 @@ +from typing import Any + +import torch + + +__all__ = [ + "LSTM", +] + + +class LSTM(torch.ao.nn.quantizable.LSTM): + r"""A quantized long short-term memory (LSTM). 
+ + For the description and the argument types, please, refer to :class:`~torch.nn.LSTM` + + Attributes: + layers : instances of the `_LSTMLayer` + + .. note:: + To access the weights and biases, you need to access them per layer. + See examples in :class:`~torch.ao.nn.quantizable.LSTM` + + Examples:: + >>> # xdoctest: +SKIP + >>> custom_module_config = { + ... 'float_to_observed_custom_module_class': { + ... nn.LSTM: nn.quantizable.LSTM, + ... }, + ... 'observed_to_quantized_custom_module_class': { + ... nn.quantizable.LSTM: nn.quantized.LSTM, + ... } + ... } + >>> tq.prepare(model, prepare_custom_module_class=custom_module_config) + >>> tq.convert(model, convert_custom_module_class=custom_module_config) + """ + + _FLOAT_MODULE = torch.ao.nn.quantizable.LSTM # type: ignore[assignment] + + def _get_name(self) -> str: + return "QuantizedLSTM" + + @classmethod + def from_float(cls, *args: Any, **kwargs: Any) -> None: + # The whole flow is float -> observed -> quantized + # This class does observed -> quantized only + raise NotImplementedError( + "It looks like you are trying to convert a " + "non-observed LSTM module. Please, see " + "the examples on quantizable LSTMs." 
+ ) + + @classmethod + def from_observed(cls: type["LSTM"], other: torch.ao.nn.quantizable.LSTM) -> "LSTM": + assert isinstance(other, cls._FLOAT_MODULE) # type: ignore[has-type] + converted = torch.ao.quantization.convert( + other, inplace=False, remove_qconfig=True + ) + converted.__class__ = cls + return converted diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..330070913a7521871f123a3e076264498a6ef612 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/utils.py @@ -0,0 +1,144 @@ +# mypy: allow-untyped-defs +import abc +import collections +import itertools + +import torch +from torch.nn.modules.module import _addindent + + +__all__ = [ + "WeightedQuantizedModule", +] + + +class WeightedQuantizedModule(torch.nn.Module, metaclass=abc.ABCMeta): + """Wrapper for quantized modules than can be lowered from reference modules.""" + + @classmethod + @abc.abstractmethod + def from_reference(cls, ref_module, output_scale, output_zero_point): + raise NotImplementedError + + +def _get_weight_observer(observer): + # FakeQuantize observer + if hasattr(observer, "activation_post_process"): + observer = observer.activation_post_process + # UniformQuantizationObserverBase observer + return observer + + +def _needs_weight_clamping(observer, dtype): + observer = _get_weight_observer(observer) + if dtype in [torch.qint8, torch.quint8, torch.qint32]: + info = torch.iinfo(dtype) + return observer.quant_min > info.min or observer.quant_max < info.max + return False + + +def _clamp_weights(qweight, observer, scale, zp): + if not _needs_weight_clamping(observer, qweight.dtype): + return qweight + + observer = _get_weight_observer(observer) + min_, max_ = observer.quant_min, observer.quant_max + + # Doing 
this because can't use torch.ops.quantized.clamp() with per_channel qscheme yet. + qw_int_max = torch.clone(qweight.int_repr()).fill_(max_) + qw_int_min = torch.clone(qweight.int_repr()).fill_(min_) + qw_int = torch.minimum(torch.maximum(qweight.int_repr(), qw_int_min), qw_int_max) + + if observer.qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]: + qweight = torch._make_per_tensor_quantized_tensor( + qw_int, scale.item(), zp.item() + ) + elif observer.qscheme in [ + torch.per_channel_symmetric, + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ]: + qweight = torch._make_per_channel_quantized_tensor( + qw_int, scale, zp, axis=observer.ch_axis + ) + else: + raise ValueError("Unexpected qscheme " + observer.qscheme) + return qweight + + +def _quantize_weight(float_wt, observer): + wt_scale, wt_zp = observer.calculate_qparams() + if observer.qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]: + qweight = torch.quantize_per_tensor( + float_wt, float(wt_scale), int(wt_zp), torch.qint8 + ) + qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp) + elif observer.qscheme in [torch.per_channel_symmetric, torch.per_channel_affine]: + wt_axis = observer.ch_axis + qweight = torch.quantize_per_channel( + float_wt, + wt_scale.to(torch.double), + wt_zp.to(torch.int64), + wt_axis, + torch.qint8, + ) + qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp) + elif observer.qscheme == torch.per_channel_affine_float_qparams: + qweight = torch.quantize_per_channel( + float_wt, + wt_scale.to(torch.float), + wt_zp.to(torch.float), + observer.ch_axis, + observer.dtype, + ) + qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp) + else: + raise ValueError("Unexpected qscheme " + observer.qscheme) + return qweight + + +def _ntuple_from_first(n): + """Converts the argument to a tuple of size n + with the first element repeated.""" + + def parse(x): + while isinstance(x, collections.abc.Sequence): + if len(x) == n: + break 
+ x = x[0] + return tuple(itertools.repeat(x, n)) + + return parse + + +def _hide_packed_params_repr(self, params): + # We don't want to show `PackedParams` children, hence custom + # `__repr__`. This is the same as nn.Module.__repr__, except the check + # for the `params module`. + extra_lines = [] + extra_repr = self.extra_repr() + # empty string will be split into list [''] + if extra_repr: + extra_lines = extra_repr.split("\n") + child_lines = [] + for key, module in self._modules.items(): + if isinstance(module, params): + continue + mod_str = repr(module) + mod_str = _addindent(mod_str, 2) + child_lines.append("(" + key + "): " + mod_str) + lines = extra_lines + child_lines + + main_str = self._get_name() + "(" + if lines: + # simple one-liner info, which most builtin Modules will use + if len(extra_lines) == 1 and not child_lines: + main_str += extra_lines[0] + else: + main_str += "\n " + "\n ".join(lines) + "\n" + + main_str += ")" + return main_str + + +_pair_from_first = _ntuple_from_first(2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1e15e9c1516d30f7ca9ee47b21b267533de75b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__init__.py @@ -0,0 +1,19 @@ +from .modules import * # noqa: F403 + + +__all__ = [ + "Linear", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "RNNCell", + "LSTMCell", + "GRUCell", + "LSTM", + "GRU", + "Embedding", + "EmbeddingBag", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d10eff8aa431fc79df0a413aa0d38ad8a868df60 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe97c22f5a46a5eafc1432075fc57dd44c3aa8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py @@ -0,0 +1,29 @@ +from .conv import ( + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, +) +from .linear import Linear +from .rnn import GRU, GRUCell, LSTM, LSTMCell, RNNCell +from .sparse import Embedding, EmbeddingBag + + +__all__ = [ + "Linear", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "RNNCell", + "LSTMCell", + "GRUCell", + "LSTM", + "GRU", + "Embedding", + "EmbeddingBag", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb4ea23be0524260424bb4967ca2ab09731ebc8f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e60cd9afa36c944d42abf1f27cf14669df7d367c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f44fcbf3dc5c5687f28b66ec63eaedb622b288c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d26ab1d3135fcaa5abc6a1f5a31609106e32d50 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..786d2a34375072315e16b1cb844774231cefe023 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8414ec75339cc2f25caf242f4d08df63f79c45ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..3273b89cc70ab21a87a0369e71c3ceff19615111 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/conv.py @@ -0,0 +1,518 @@ +# mypy: allow-untyped-defs +from typing import Any, Literal, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.common_types import _size_1_t + +from .utils import ReferenceQuantizedModule + + +__all__ = [ + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", +] + + +class _ConvNd(torch.nn.modules.conv._ConvNd, ReferenceQuantizedModule): + """A reference version of nn.quantized.Conv2d + we will not pack the parameters in this module, since weight packing is an + optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), + this is useful when user want to use this module in other backends 
like Glow. + """ + + __annotations__ = {"bias": Optional[torch.Tensor]} + _IS_REFERENCE = True + + @staticmethod + def from_float(cls, float_conv, weight_qparams): + qref_conv = cls( + float_conv.in_channels, + float_conv.out_channels, + float_conv.kernel_size, # type: ignore[arg-type] + float_conv.stride, # type: ignore[arg-type] + float_conv.padding, # type: ignore[arg-type] + float_conv.dilation, # type: ignore[arg-type] + float_conv.groups, + float_conv.bias is not None, # type: ignore[arg-type] + float_conv.padding_mode, + device=float_conv.weight.device, + dtype=float_conv.weight.dtype, + weight_qparams=weight_qparams, + ) + qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) + if float_conv.bias is not None: + qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) + return qref_conv + + +class Conv1d(_ConvNd, nn.Conv1d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ): + nn.Conv1d.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.conv1d --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.conv1d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv1d + """ + weight_quant_dequant = self.get_weight() + + result = F.conv1d( + x, + weight_quant_dequant, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + ) + return result 
+ + def _get_name(self): + return "QuantizedConv1d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): # type: ignore[override] + return _ConvNd.from_float(cls, float_conv, weight_qparams) + + +class Conv2d(_ConvNd, nn.Conv2d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ): + nn.Conv2d.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + # pyrefly: ignore [bad-argument-type] + padding_mode, + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.conv2d --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.conv2d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv2d + """ + weight_quant_dequant = self.get_weight() + + result = F.conv2d( + x, + weight_quant_dequant, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + ) + return result + + def _get_name(self): + return "QuantizedConv2d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): # type: ignore[override] + return _ConvNd.from_float(cls, float_conv, weight_qparams) + + +class Conv3d(_ConvNd, nn.Conv3d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ): + nn.Conv3d.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + # pyrefly: ignore [bad-argument-type] + padding_mode, + device, + 
dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.conv3d --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.conv3d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv3d + """ + weight_quant_dequant = self.get_weight() + + result = F.conv3d( + x, + weight_quant_dequant, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + ) + return result + + def _get_name(self): + return "QuantizedConv3d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): # type: ignore[override] + return _ConvNd.from_float(cls, float_conv, weight_qparams) + + +class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd): + """A reference version of nn.quantized.ConvTranspose2d + we will not pack the parameters in this module, since weight packing is an + optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), + this is useful when user want to use this module in other backends like Glow. 
+ """ + + @staticmethod + def from_float(cls, float_conv, weight_qparams): + qref_conv = cls( + float_conv.in_channels, + float_conv.out_channels, + float_conv.kernel_size, # type: ignore[arg-type] + float_conv.stride, # type: ignore[arg-type] + float_conv.padding, # type: ignore[arg-type] + float_conv.output_padding, # type: ignore[arg-type] + float_conv.groups, + float_conv.bias is not None, # type: ignore[arg-type] + float_conv.dilation, # type: ignore[arg-type] + float_conv.padding_mode, + device=float_conv.weight.device, + dtype=float_conv.weight.dtype, + weight_qparams=weight_qparams, + ) + qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) + if float_conv.bias is not None: + qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) + return qref_conv + + +class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + output_padding: _size_1_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_1_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ): + nn.ConvTranspose1d.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + groups, + bias, + dilation, + padding_mode, + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def forward( + self, x: torch.Tensor, output_size: list[int] | None = None + ) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose1d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose1d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv1d + """ + + assert isinstance(self.padding, tuple) + # One cannot replace List by 
Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + output_padding = self._output_padding( + input, # type: ignore[arg-type] + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + self.dilation, # type: ignore[arg-type] + ) + + weight_quant_dequant = self.get_weight() + result = F.conv_transpose1d( + x, + weight_quant_dequant, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + return result + + def _get_name(self): + return "QuantizedConvTranspose1d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): # type: ignore[override] + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) + + +class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ): + nn.ConvTranspose2d.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + groups, + bias, + dilation, + # pyrefly: ignore [bad-argument-type] + padding_mode, + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def forward( + self, x: torch.Tensor, output_size: list[int] | None = None + ) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose2d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose2d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv2d + """ + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does 
not support `Sequence[T]` or `Tuple[T, ...]`. + + output_padding = self._output_padding( + input, # type: ignore[arg-type] + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + self.dilation, # type: ignore[arg-type] + ) + + weight_quant_dequant = self.get_weight() + result = F.conv_transpose2d( + x, + weight_quant_dequant, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + + return result + + def _get_name(self): + return "QuantizedConvTranspose2d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): # type: ignore[override] + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) + + +class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ): + nn.ConvTranspose3d.__init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + groups, + bias, + dilation, + # pyrefly: ignore [bad-argument-type] + padding_mode, + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def forward( + self, x: torch.Tensor, output_size: list[int] | None = None + ) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose3d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose3d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv3d + """ + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
+ output_padding = self._output_padding( + input, # type: ignore[arg-type] + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + self.dilation, # type: ignore[arg-type] + ) + + weight_quant_dequant = self.get_weight() + result = F.conv_transpose3d( + x, + weight_quant_dequant, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + return result + + def _get_name(self): + return "QuantizedConvTranspose3d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): # type: ignore[override] + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..6014fab24036c30b183f5622d12aae4a345baedb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/linear.py @@ -0,0 +1,69 @@ +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import ReferenceQuantizedModule + + +__all__ = ["Linear"] + + +class Linear(nn.Linear, ReferenceQuantizedModule): + """A reference quantized linear module that fits into the FX + Graph Mode Quantization workflow + activation will be floating point Tensor, we will store floating + point weight as well in the module, but in forward we'll quantize + and dequantize the weight before running the floating point functional + linear operator. 
+ """ + + _IS_REFERENCE = True + + def __init__( + self, + in_features: int, + out_features: int, + bias_: bool = True, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + weight_qparams: dict[str, Any] | None = None, + ) -> None: + super().__init__(in_features, out_features, bias_, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def _get_name(self) -> str: + return "QuantizedLinear(Reference)" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.linear --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.linear --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized linear + """ + weight_quant_dequant = self.get_weight() + result = F.linear(x, weight_quant_dequant, self.bias) + return result + + @classmethod + def from_float( + cls, float_linear: nn.Linear, weight_qparams: dict[str, Any] + ) -> "Linear": + qref_linear = Linear( + float_linear.in_features, + float_linear.out_features, + float_linear.bias is not None, + device=float_linear.weight.device, + dtype=float_linear.weight.dtype, + weight_qparams=weight_qparams, + ) + qref_linear.weight = torch.nn.Parameter(float_linear.weight.detach()) + if float_linear.bias is not None: + qref_linear.bias = torch.nn.Parameter(float_linear.bias.detach()) + return qref_linear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..1bdbfb81430b4db9e09ea752310732b89f47bfa1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py @@ -0,0 +1,861 @@ +# mypy: allow-untyped-defs +from typing import Any + +import 
torch +import torch.nn as nn +from torch import _VF, Tensor +from torch.nn.utils.rnn import PackedSequence + +from .utils import _quantize_and_dequantize_weight, _quantize_weight + + +__all__ = [ + "RNNCellBase", + "RNNCell", + "LSTMCell", + "GRUCell", + "RNNBase", + "LSTM", + "GRU", + "get_quantized_weight", +] + + +def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: + return tensor.index_select(dim, permutation) + + +def _get_weight_and_quantization_params(module, wn): + weight = getattr(module, wn) + params = [weight] + for param_name in [ + wn + n for n in ["_qscheme", "_dtype", "_scale", "_zero_point", "_axis_int"] + ]: + if hasattr(module, param_name): + param = getattr(module, param_name) + else: + param = None + params.append(param) + return params + + +def get_quantized_weight(module, wn): + if not hasattr(module, wn): + return None + params = _get_weight_and_quantization_params(module, wn) + weight = _quantize_weight(*params) + return weight + + +def _get_quantize_and_dequantized_weight(module, wn): + if not hasattr(module, wn): + return None + params = _get_weight_and_quantization_params(module, wn) + weight = _quantize_and_dequantize_weight(*params) + return weight + + +class RNNCellBase(nn.RNNCellBase): + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool, + num_chunks: int, + device=None, + dtype=None, + weight_qparams_dict=None, + ) -> None: + super().__init__( + input_size, hidden_size, bias, num_chunks, device=device, dtype=dtype + ) + # TODO(jerryzh168): maybe make this arg a required arg + if weight_qparams_dict is None: + weight_qparams = { + "qscheme": torch.per_tensor_affine, + "dtype": torch.quint8, + "scale": 1.0, + "zero_point": 0, + } + weight_qparams_dict = { + "weight_ih": weight_qparams, + "weight_hh": weight_qparams, + "is_decomposed": False, + } + assert len(weight_qparams_dict) == 3, ( + "Expected length for weight_qparams_dict to be 3 for QuantizedRNNCellBase(Reference)" + ) + 
self._init_weight_qparams_dict(weight_qparams_dict, device) + + def _init_weight_qparams_dict(self, weight_qparams_dict, device): + assert weight_qparams_dict is not None + self.is_decomposed = weight_qparams_dict["is_decomposed"] + for key, weight_qparams in weight_qparams_dict.items(): + if key == "is_decomposed": + continue + # TODO: refactor the duplicated code to utils.py + weight_qscheme = weight_qparams["qscheme"] + weight_dtype = weight_qparams["dtype"] + setattr(self, key + "_qscheme", weight_qscheme) + setattr(self, key + "_dtype", weight_dtype) + assert weight_qscheme in [ + None, + torch.per_tensor_affine, + torch.per_channel_affine, + ], Exception( + f"qscheme: {weight_qscheme} is not support in {self._get_name()}" + ) + if weight_qscheme is not None: + scale = weight_qparams["scale"] + scale_tensor = ( + scale.detach().clone() + if isinstance(scale, torch.Tensor) + else torch.tensor(scale, dtype=torch.float, device=device) + ) + self.register_buffer(key + "_scale", scale_tensor) + zp = weight_qparams["zero_point"] + zp_tensor = ( + zp.detach().clone() + if isinstance(zp, torch.Tensor) + else torch.tensor(zp, dtype=torch.int, device=device) + ) + self.register_buffer(key + "_zero_point", zp_tensor) + if weight_qscheme == torch.per_channel_affine: + axis = weight_qparams["axis"] + axis_tensor = ( + axis.detach().clone() + if isinstance(axis, torch.Tensor) + else torch.tensor(axis, dtype=torch.int, device=device) + ) + self.register_buffer(key + "_axis", axis_tensor) + else: + # added for TorchScriptability, not used + self.register_buffer( + key + "_axis", torch.tensor(0, dtype=torch.int, device=device) + ) + setattr(self, key + "_axis_int", getattr(self, key + "_axis").item()) + + def _get_name(self): + return "QuantizedRNNCellBase(Reference)" + + def get_quantized_weight_ih(self): + return get_quantized_weight(self, "weight_ih") + + def get_quantized_weight_hh(self): + return get_quantized_weight(self, "weight_hh") + + def get_weight_ih(self): + 
return _get_quantize_and_dequantized_weight(self, "weight_ih") + + def get_weight_hh(self): + return _get_quantize_and_dequantized_weight(self, "weight_hh") + + +class RNNCell(RNNCellBase): + """ + We'll store weight_qparams for all the weights (weight_ih and weight_hh), + we need to pass in a `weight_qparams_dict` that maps from weight name, + e.g. weight_ih, to the weight_qparams for that weight + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool = True, + nonlinearity: str = "tanh", + device=None, + dtype=None, + weight_qparams_dict: dict[str, Any] | None = None, + ) -> None: + factory_kwargs = { + "device": device, + "dtype": dtype, + "weight_qparams_dict": weight_qparams_dict, + } + super().__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs) + self.nonlinearity = nonlinearity + + def _get_name(self): + return "QuantizedRNNCell(Reference)" + + # TODO: refactor nn.RNNCell to have a _forward that takes weight_ih and weight_hh as input + # and remove duplicated code, same for the other two Cell modules + def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor: + assert input.dim() in ( + 1, + 2, + ), ( + f"RNNCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor" + ) + is_batched = input.dim() == 2 + if not is_batched: + input = input.unsqueeze(0) + + if hx is None: + hx = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + else: + hx = hx.unsqueeze(0) if not is_batched else hx + + if self.nonlinearity == "tanh": + ret = _VF.rnn_tanh_cell( + input, + hx, + self.get_weight_ih(), + self.get_weight_hh(), + self.bias_ih, + self.bias_hh, + ) + elif self.nonlinearity == "relu": + ret = _VF.rnn_relu_cell( + input, + hx, + self.get_weight_ih(), + self.get_weight_hh(), + self.bias_ih, + self.bias_hh, + ) + else: + ret = input # TODO: remove when jit supports exception flow + raise RuntimeError(f"Unknown nonlinearity: {self.nonlinearity}") + + if not 
is_batched: + ret = ret.squeeze(0) + + return ret + + @classmethod + def from_float(cls, mod, weight_qparams_dict): + ref_mod = cls( + mod.input_size, + mod.hidden_size, + mod.bias, + mod.nonlinearity, + mod.weight_ih.device, + mod.weight_ih.dtype, + weight_qparams_dict, + ) + ref_mod.weight_ih = mod.weight_ih + ref_mod.weight_hh = mod.weight_hh + ref_mod.bias_ih = mod.bias_ih + ref_mod.bias_hh = mod.bias_hh + return ref_mod + + +class LSTMCell(RNNCellBase): + """ + We'll store weight_qparams for all the weights (weight_ih and weight_hh), + we need to pass in a `weight_qparams_dict` that maps from weight name, + e.g. weight_ih, to the weight_qparams for that weight + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool = True, + device=None, + dtype=None, + weight_qparams_dict: dict[str, Any] | None = None, + ) -> None: + factory_kwargs = { + "device": device, + "dtype": dtype, + "weight_qparams_dict": weight_qparams_dict, + } + super().__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs) + + def _get_name(self): + return "QuantizedLSTMCell(Reference)" + + def forward( + self, input: Tensor, hx: tuple[Tensor, Tensor] | None = None + ) -> tuple[Tensor, Tensor]: + assert input.dim() in ( + 1, + 2, + ), ( + f"LSTMCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor" + ) + is_batched = input.dim() == 2 + if not is_batched: + input = input.unsqueeze(0) + + if hx is None: + zeros = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + hx = (zeros, zeros) + else: + hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0)) if not is_batched else hx + + ret = _VF.lstm_cell( + input, + hx, + self.get_weight_ih(), + self.get_weight_hh(), + self.bias_ih, + self.bias_hh, + ) + + if not is_batched: + ret = (ret[0].squeeze(0), ret[1].squeeze(0)) + return ret + + @classmethod + def from_float(cls, mod, weight_qparams_dict, use_precomputed_fake_quant=False): + ref_mod = cls( + 
mod.input_size, + mod.hidden_size, + mod.bias, + mod.weight_ih.device, + mod.weight_ih.dtype, + weight_qparams_dict, + ) + ref_mod.weight_ih = mod.weight_ih + ref_mod.weight_hh = mod.weight_hh + ref_mod.bias_ih = mod.bias_ih + ref_mod.bias_hh = mod.bias_hh + return ref_mod + + +class GRUCell(RNNCellBase): + """ + We'll store weight_qparams for all the weights (weight_ih and weight_hh), + we need to pass in a `weight_qparams_dict` that maps from weight name, + e.g. weight_ih, to the weight_qparams for that weight + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool = True, + device=None, + dtype=None, + weight_qparams_dict: dict[str, Any] | None = None, + ) -> None: + factory_kwargs = { + "device": device, + "dtype": dtype, + "weight_qparams_dict": weight_qparams_dict, + } + super().__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs) + + def _get_name(self): + return "QuantizedGRUCell(Reference)" + + def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor: + assert input.dim() in ( + 1, + 2, + ), ( + f"GRUCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor" + ) + is_batched = input.dim() == 2 + if not is_batched: + input = input.unsqueeze(0) + + if hx is None: + hx = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + else: + hx = hx.unsqueeze(0) if not is_batched else hx + + ret = _VF.gru_cell( + input, + hx, + self.get_weight_ih(), + self.get_weight_hh(), + self.bias_ih, + self.bias_hh, + ) + + if not is_batched: + ret = ret.squeeze(0) + + return ret + + @classmethod + def from_float(cls, mod, weight_qparams_dict): + ref_mod = cls( + mod.input_size, + mod.hidden_size, + mod.bias, + mod.weight_ih.device, + mod.weight_ih.dtype, + weight_qparams_dict, + ) + ref_mod.weight_ih = mod.weight_ih + ref_mod.weight_hh = mod.weight_hh + ref_mod.bias_ih = mod.bias_ih + ref_mod.bias_hh = mod.bias_hh + return ref_mod + + +class RNNBase(nn.RNNBase): + def 
__init__( + self, + mode: str, + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = False, + dropout: float = 0.0, + bidirectional: bool = False, + proj_size: int = 0, + device=None, + dtype=None, + weight_qparams_dict: dict[str, Any] | None = None, + ) -> None: + super().__init__( + mode, + input_size, + hidden_size, + num_layers, + bias, + batch_first, + dropout, + bidirectional, + proj_size, + device, + dtype, + ) + # TODO(jerryzh168): maybe make this arg a required arg + if weight_qparams_dict is None: + weight_qparams = { + "qscheme": torch.per_tensor_affine, + "dtype": torch.quint8, + "scale": 1.0, + "zero_point": 0, + } + weight_qparams_dict = {"is_decomposed": False} # type: ignore[dict-item] + for wn in self._flat_weights_names: + if wn.startswith("weight"): + weight_qparams_dict[wn] = weight_qparams + self._init_weight_qparams_dict(weight_qparams_dict, device) + + def _init_weight_qparams_dict(self, weight_qparams_dict, device): + self.is_decomposed = weight_qparams_dict["is_decomposed"] + for key, weight_qparams in weight_qparams_dict.items(): + if key == "is_decomposed": + continue + weight_qscheme = weight_qparams["qscheme"] + weight_dtype = weight_qparams["dtype"] + setattr(self, key + "_qscheme", weight_qscheme) + setattr(self, key + "_dtype", weight_dtype) + assert weight_qscheme in [ + None, + torch.per_tensor_affine, + torch.per_channel_affine, + ], Exception( + f"qscheme: {weight_qscheme} is not support in {self._get_name()}" + ) + if weight_qscheme is not None: + self.register_buffer( + key + "_scale", + torch.tensor( + weight_qparams["scale"], dtype=torch.float, device=device + ), + ) + self.register_buffer( + key + "_zero_point", + torch.tensor( + weight_qparams["zero_point"], dtype=torch.int, device=device + ), + ) + if weight_qscheme == torch.per_channel_affine: + self.register_buffer( + key + "_axis", + torch.tensor( + weight_qparams["axis"], dtype=torch.int, device=device + ), + ) + else: 
+ # added for TorchScriptability, not used + self.register_buffer( + key + "_axis", torch.tensor(0, dtype=torch.int, device=device) + ) + setattr(self, key + "_axis_int", getattr(self, key + "_axis").item()) + + +class LSTM(RNNBase): + """Reference Quantized LSTM Module + We'll store weight_qparams for all the weights in _flat_weights, we need to pass in + a `weight_qparams_dict` that maps from weight name, e.g. weight_ih_l0, + to the weight_qparams for that weight + """ + + def __init__(self, *args, **kwargs): + super().__init__("LSTM", *args, **kwargs) + + # Same as above, see torch/nn/modules/module.py::_forward_unimplemented + def permute_hidden( # type: ignore[override] + self, + hx: tuple[Tensor, Tensor], + permutation: Tensor | None, + ) -> tuple[Tensor, Tensor]: + if permutation is None: + return hx + return _apply_permutation(hx[0], permutation), _apply_permutation( + hx[1], permutation + ) + + def get_expected_cell_size( + self, input: Tensor, batch_sizes: Tensor | None + ) -> tuple[int, int, int]: + if batch_sizes is not None: + mini_batch = int(batch_sizes[0]) + else: + mini_batch = input.size(0) if self.batch_first else input.size(1) + num_directions = 2 if self.bidirectional else 1 + expected_hidden_size = ( + self.num_layers * num_directions, + mini_batch, + self.hidden_size, + ) + return expected_hidden_size + + # In the future, we should prevent mypy from applying contravariance rules here. 
+ # See torch/nn/modules/module.py::_forward_unimplemented + def check_forward_args( # type: ignore[override] + self, + input: Tensor, + hidden: tuple[Tensor, Tensor], + batch_sizes: Tensor | None, + ): + self.check_input(input, batch_sizes) + self.check_hidden_size( + hidden[0], + self.get_expected_hidden_size(input, batch_sizes), + "Expected hidden[0] size {}, got {}", + ) + self.check_hidden_size( + hidden[1], + self.get_expected_cell_size(input, batch_sizes), + "Expected hidden[1] size {}, got {}", + ) + + def get_quantized_weight_bias_dict(self): + """dictionary from flat_weight_name to quantized weight or (unquantized) bias + e.g. + { + "weight_ih_l0": quantized_weight, + "bias_ih_l0": unquantized_bias, + ... + } + """ + quantized_weight_bias_dict = {} + for wn in self._flat_weights_names: + if hasattr(self, wn): + if wn.startswith("weight"): + weight_or_bias = get_quantized_weight(self, wn) + else: + weight_or_bias = getattr(self, wn) + else: + weight_or_bias = None + quantized_weight_bias_dict[wn] = weight_or_bias + return quantized_weight_bias_dict + + def get_flat_weights(self): + flat_weights = [] + for wn in self._flat_weights_names: + if hasattr(self, wn): + weight = getattr(self, wn) + if wn.startswith("weight"): + params = _get_weight_and_quantization_params(self, wn) + weight = _quantize_and_dequantize_weight(*params) + else: + weight = None + flat_weights.append(weight) + return flat_weights + + def forward(self, input, hx=None): # noqa: F811 + orig_input = input + # xxx: isinstance check needs to be in conditional for TorchScript to compile + batch_sizes = None + if isinstance(orig_input, PackedSequence): + input, batch_sizes, sorted_indices, unsorted_indices = input + max_batch_size = int(batch_sizes[0]) + else: + batch_sizes = None + is_batched = input.dim() == 3 + batch_dim = 0 if self.batch_first else 1 + if not is_batched: + input = input.unsqueeze(batch_dim) + max_batch_size = input.size(0) if self.batch_first else input.size(1) + 
sorted_indices = None + unsorted_indices = None + + if hx is None: + num_directions = 2 if self.bidirectional else 1 + real_hidden_size = ( + self.proj_size if self.proj_size > 0 else self.hidden_size + ) + h_zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + real_hidden_size, + dtype=input.dtype, + device=input.device, + ) + c_zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + hx = (h_zeros, c_zeros) + else: + if batch_sizes is None: # If not PackedSequence input. + if is_batched: # type: ignore[possibly-undefined] + if hx[0].dim() != 3 or hx[1].dim() != 3: + msg = ( + "For batched 3-D input, hx and cx should " + f"also be 3-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors" + ) + raise RuntimeError(msg) + else: + if hx[0].dim() != 2 or hx[1].dim() != 2: + msg = ( + "For unbatched 2-D input, hx and cx should " + f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors" + ) + raise RuntimeError(msg) + hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1)) + + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + + self.check_forward_args(input, hx, batch_sizes) + if batch_sizes is None: + result = _VF.lstm( + input, + hx, + self.get_flat_weights(), + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + self.batch_first, + ) + else: + result = _VF.lstm( + input, + batch_sizes, + hx, + self.get_flat_weights(), + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + ) + output = result[0] + hidden = result[1:] + # xxx: isinstance check needs to be in conditional for TorchScript to compile + if isinstance(orig_input, PackedSequence): + output_packed = PackedSequence( + output, + # pyrefly: ignore [bad-argument-type] + batch_sizes, + sorted_indices, + unsorted_indices, + ) + return output_packed, self.permute_hidden(hidden, unsorted_indices) + else: + if not is_batched: # type: ignore[possibly-undefined] + output = output.squeeze(batch_dim) # type: ignore[possibly-undefined] + hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1)) + return output, self.permute_hidden(hidden, unsorted_indices) + + def _get_name(self): + return "QuantizedLSTM(Reference)" + + @classmethod + def from_float(cls, mod, weight_qparams_dict): + ref_mod = cls( + mod.input_size, + mod.hidden_size, + mod.num_layers, + mod.bias, + mod.batch_first, + mod.dropout, + mod.bidirectional, + weight_qparams_dict=weight_qparams_dict, + ) + for wn in mod._flat_weights_names: + setattr(ref_mod, wn, getattr(mod, wn)) + return ref_mod + + +class GRU(RNNBase): + """Reference Quantized GRU Module + We'll store weight_qparams for all the weights in _flat_weights, we need to pass in + a `weight_qparams_dict` that maps from weight name, e.g. 
weight_ih_l0, + to the weight_qparams for that weight + """ + + def __init__(self, *args, **kwargs): + if "proj_size" in kwargs: + raise ValueError( + "proj_size argument is only supported for LSTM, not RNN or GRU" + ) + super().__init__("GRU", *args, **kwargs) + + def get_quantized_weight_bias_dict(self): + """dictionary from flat_weight_name to quantized weight or (unquantized) bias + e.g. + { + "weight_ih_l0": quantized_weight, + "bias_ih_l0": unquantized_bias, + ... + } + """ + quantized_weight_bias_dict = {} + for wn in self._flat_weights_names: + if hasattr(self, wn): + if wn.startswith("weight"): + weight_or_bias = get_quantized_weight(self, wn) + else: + weight_or_bias = getattr(self, wn) + else: + weight_or_bias = None + quantized_weight_bias_dict[wn] = weight_or_bias + return quantized_weight_bias_dict + + def get_flat_weights(self): + flat_weights = [] + for wn in self._flat_weights_names: + if hasattr(self, wn): + weight = getattr(self, wn) + if wn.startswith("weight"): + params = _get_weight_and_quantization_params(self, wn) + weight = _quantize_and_dequantize_weight(*params) + else: + weight = None + flat_weights.append(weight) + return flat_weights + + def forward(self, input, hx=None): # noqa: F811 + # Note: this is copied from the forward of GRU in https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py + # only changed self._flat_weights to self.get_flat_weights() + # TODO: maybe we can try inheriting from that class and define get_flat_weights + # as a @property? 
this might interfere with TorchScript, if we remove that + # requirement in the future we should be able to do this + orig_input = input + # xxx: isinstance check needs to be in conditional for TorchScript to compile + if isinstance(orig_input, PackedSequence): + input, batch_sizes, sorted_indices, unsorted_indices = input + max_batch_size = int(batch_sizes[0]) + else: + batch_sizes = None + assert input.dim() in ( + 2, + 3, + ), ( + f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor" + ) + is_batched = input.dim() == 3 + batch_dim = 0 if self.batch_first else 1 + if not is_batched: + input = input.unsqueeze(batch_dim) + if hx is not None: + if hx.dim() != 2: + raise RuntimeError( + f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor" + ) + hx = hx.unsqueeze(1) + else: + if hx is not None and hx.dim() != 3: + raise RuntimeError( + f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor" + ) + max_batch_size = input.size(0) if self.batch_first else input.size(1) + sorted_indices = None + unsorted_indices = None + + if hx is None: + num_directions = 2 if self.bidirectional else 1 + hx = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + else: + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + + self.check_forward_args(input, hx, batch_sizes) + if batch_sizes is None: + result = _VF.gru( + input, + hx, + self.get_flat_weights(), + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + self.batch_first, + ) + else: + result = _VF.gru( + input, + batch_sizes, + hx, + self.get_flat_weights(), + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + ) + output = result[0] + hidden = result[1] + + # xxx: isinstance check needs to be in conditional for TorchScript to compile + if isinstance(orig_input, PackedSequence): + output_packed = PackedSequence( + output, + # pyrefly: ignore [bad-argument-type] + batch_sizes, + sorted_indices, + unsorted_indices, + ) + return output_packed, self.permute_hidden(hidden, unsorted_indices) + else: + if not is_batched: # type: ignore[possibly-undefined] + output = output.squeeze(batch_dim) # type: ignore[possibly-undefined] + hidden = hidden.squeeze(1) + + return output, self.permute_hidden(hidden, unsorted_indices) + + def _get_name(self): + return "QuantizedGRU(Reference)" + + @classmethod + def from_float(cls, mod, weight_qparams_dict): + ref_mod = cls( + mod.input_size, + mod.hidden_size, + mod.num_layers, + mod.bias, + mod.batch_first, + mod.dropout, + mod.bidirectional, + weight_qparams_dict=weight_qparams_dict, + ) + for wn in mod._flat_weights_names: + setattr(ref_mod, wn, getattr(mod, wn)) + return ref_mod diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..8ff80997c1439c50a456df328b4068ae0c419a01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py @@ -0,0 +1,163 @@ +# mypy: 
allow-untyped-defs +from typing import Any + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from .utils import ReferenceQuantizedModule + + +__all__ = ["Embedding", "EmbeddingBag"] + + +class Embedding(nn.Embedding, ReferenceQuantizedModule): + """A reference quantized Embedding module that fits into the + FX Graph Mode Quantization workflow, activation will be floating point Tensor, + we will store floating point weight as well in the module, but in forward we'll + quantize and dequantize the weight before running the floating point functional + embedding operator. + """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int | None = None, + max_norm: float | None = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Tensor | None = None, + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ) -> None: + super().__init__( + num_embeddings, + embedding_dim, + padding_idx, + max_norm, + norm_type, + scale_grad_by_freq, + sparse, + _weight, + # pyrefly: ignore [bad-argument-type] + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def _get_name(self): + return "QuantizedEmbedding(Reference)" + + def forward(self, input: Tensor) -> Tensor: + weight_quant_dequant = self.get_weight() + return F.embedding( + input, + weight_quant_dequant, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + @classmethod + def from_float(cls, mod, weight_qparams): + return cls( + mod.num_embeddings, + mod.embedding_dim, + mod.padding_idx, + mod.max_norm, + mod.norm_type, + mod.scale_grad_by_freq, + mod.sparse, + mod.weight, + mod.weight.device, + mod.weight.dtype, + weight_qparams, + ) + + +class EmbeddingBag(nn.EmbeddingBag, ReferenceQuantizedModule): + """A reference quantized EmbeddingBag module that fits into the + FX Graph Mode Quantization workflow, 
activation will be floating point Tensor, + we will store floating point weight as well in the module, but in forward we'll + quantize and dequantize the weight before running the floating point functional + embedding operator. + """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + max_norm: float | None = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + _weight: Tensor | None = None, + include_last_offset: bool = False, + padding_idx: int | None = None, + device=None, + dtype=None, + weight_qparams: dict[str, Any] | None = None, + ) -> None: + super().__init__( + num_embeddings, + embedding_dim, + max_norm, + norm_type, + scale_grad_by_freq, + mode, + sparse, + _weight, + include_last_offset, + padding_idx, + device, + dtype, + ) + self._init_weight_qparams(weight_qparams, device) + + def _get_name(self): + return "QuantizedEmbedding(Reference)" + + def forward( + self, + input: Tensor, + offsets: Tensor | None = None, + per_sample_weights: Tensor | None = None, + ) -> Tensor: + weight_quant_dequant = self.get_weight() + return F.embedding_bag( + input, + weight_quant_dequant, + offsets, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + per_sample_weights, + self.include_last_offset, + self.padding_idx, + ) + + @classmethod + def from_float(cls, mod, weight_qparams, use_precomputed_fake_quant=False): + return cls( + mod.num_embeddings, + mod.embedding_dim, + mod.max_norm, + mod.norm_type, + mod.scale_grad_by_freq, + mod.mode, + mod.sparse, + mod.weight, + mod.include_last_offset, + mod.padding_idx, + mod.weight.device, + mod.weight.dtype, + weight_qparams, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/utils.py new file mode 100644 index 
class ReferenceQuantizedModule(torch.nn.Module):
    """Mixin-style base for reference quantized modules.

    It stores the weight quantization parameters (qscheme, dtype, scale,
    zero point, axis) on the module and offers helpers that quantize or
    fake-quantize ``self.weight`` with them.  ``self.weight`` itself is
    expected to be provided by the concrete subclass.
    """

    def _init_weight_qparams(self, weight_qparams, device):
        """Record ``weight_qparams`` on the module.

        Args:
            weight_qparams: dict with keys ``qscheme``, ``dtype``, ``scale``,
                ``zero_point`` and, for per-channel schemes, ``axis``.
                Optional keys are ``is_decomposed``, ``quant_min`` and
                ``quant_max``.  ``None`` selects per-tensor affine quint8
                with scale 1.0 and zero point 0.
            device: device used for tensors that are created here (tensors
                passed in by the caller are detached and cloned as they are).
        """
        if weight_qparams is None:
            weight_qparams = {
                "qscheme": torch.per_tensor_affine,
                "dtype": torch.quint8,
                "scale": 1.0,
                "zero_point": 0,
            }

        def _as_buffer_value(value, dtype):
            # Tensors are copied verbatim; plain numbers become new tensors.
            if isinstance(value, torch.Tensor):
                return value.detach().clone()
            return torch.tensor(value, dtype=dtype, device=device)

        # pyrefly: ignore [bad-assignment]
        self.weight_qscheme: torch.qscheme = weight_qparams["qscheme"]
        self.weight_dtype = weight_qparams["dtype"]
        supported_qschemes = [
            None,
            torch.per_tensor_affine,
            torch.per_channel_affine,
            torch.per_channel_affine_float_qparams,
        ]
        assert self.weight_qscheme in supported_qschemes, (
            f"qscheme: {self.weight_qscheme} is not support in reference quantized {self._get_name()}"
        )

        quantized_dtypes = [
            torch.quint8,
            torch.qint8,
            torch.quint4x2,
            torch.qint32,
        ]
        per_channel_qschemes = [
            torch.per_channel_affine,
            torch.per_channel_affine_float_qparams,
        ]
        if self.weight_dtype in quantized_dtypes:
            zero_point = weight_qparams["zero_point"]
            if isinstance(zero_point, torch.Tensor):
                zero_point_dtype = zero_point.dtype
            else:
                zero_point_dtype = torch.int
            scale = weight_qparams["scale"]
            self.register_buffer("weight_scale", _as_buffer_value(scale, torch.float))
            self.register_buffer(
                "weight_zero_point", _as_buffer_value(zero_point, zero_point_dtype)
            )
            if self.weight_qscheme in per_channel_qschemes:
                axis = weight_qparams["axis"]
                self.register_buffer("weight_axis", _as_buffer_value(axis, torch.int))
            else:
                # added for TorchScriptability, not used
                self.register_buffer(
                    "weight_axis", torch.tensor(0, dtype=torch.int, device=device)
                )
        else:
            # added for TorchScriptability, and for torch.float
            placeholders = (
                ("weight_scale", 1.0, torch.float),
                ("weight_zero_point", 0, torch.int),
                ("weight_axis", 0, torch.int),
            )
            for buffer_name, value, dtype in placeholders:
                self.register_buffer(
                    buffer_name, torch.tensor(value, dtype=dtype, device=device)
                )

        # pyrefly: ignore [bad-assignment]
        self.is_decomposed: bool = weight_qparams.get("is_decomposed", False)
        # store weight_axis as weight_axis_int due to some constraints of torchdynamo.export
        # for capturing `.item` operations
        self.weight_axis_int: int = self.weight_axis.item()  # type: ignore[operator, assignment]
        # pyrefly: ignore [bad-assignment]
        self.weight_quant_min: int | None = weight_qparams.get("quant_min")
        # pyrefly: ignore [bad-assignment]
        self.weight_quant_max: int | None = weight_qparams.get("quant_max")

    def get_weight(self):
        """
        Fake quantize (quantize and dequantize) the weight with
        the quantization parameters for weight, this is used to
        simulate the numerics for the quantized weight in a quantized
        model
        """
        # suppress mypy warning
        assert isinstance(self.weight_scale, torch.Tensor)
        assert isinstance(self.weight_zero_point, torch.Tensor)
        if not self.is_decomposed:
            return _quantize_and_dequantize_weight(
                self.weight,  # type: ignore[arg-type]
                self.weight_qscheme,
                # pyrefly: ignore [bad-argument-type]
                self.weight_dtype,
                self.weight_scale,
                self.weight_zero_point,
                self.weight_axis_int,
            )
        # Decomposed path additionally forwards the explicit quant range.
        return _quantize_and_dequantize_weight_decomposed(
            self.weight,  # type: ignore[arg-type]
            self.weight_qscheme,
            # pyrefly: ignore [bad-argument-type]
            self.weight_dtype,
            self.weight_scale,
            self.weight_zero_point,
            self.weight_axis_int,
            self.weight_quant_min,
            self.weight_quant_max,
        )

    def get_quantized_weight(self):
        """Quantize ``self.weight`` with the stored qparams (no dequantize)."""
        # suppress mypy warning
        assert isinstance(self.weight_scale, torch.Tensor)
        assert isinstance(self.weight_zero_point, torch.Tensor)
        # assert isinstance(self.weight_axis, torch.Tensor)
        if not self.is_decomposed:
            return _quantize_weight(
                self.weight,  # type: ignore[arg-type]
                self.weight_qscheme,
                # pyrefly: ignore [bad-argument-type]
                self.weight_dtype,
                self.weight_scale,
                self.weight_zero_point,
                self.weight_axis_int,
            )
        return _quantize_weight_decomposed(
            self.weight,  # type: ignore[arg-type]
            self.weight_qscheme,
            # pyrefly: ignore [bad-argument-type]
            self.weight_dtype,
            self.weight_scale,
            self.weight_zero_point,
            self.weight_axis_int,
            self.weight_quant_min,
            self.weight_quant_max,
        )

    def _save_to_state_dict(self, destination, prefix, keep_vars):
        """Save the regular module state, then the weight qparams."""
        super()._save_to_state_dict(destination, prefix, keep_vars)
        _save_weight_qparams(
            destination,
            prefix,
            self.weight_qscheme,
            self.weight_dtype,
            self.weight_scale,
            self.weight_zero_point,
            self.weight_axis,
        )

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        """Restore the weight qparams, then delegate the rest non-strictly."""
        for qparam_name in _get_weight_qparam_keys(state_dict, prefix):
            full_key = prefix + qparam_name
            setattr(self, qparam_name, state_dict[full_key])
            # Consumed here, so the parent loader must not see it again.
            state_dict.pop(full_key)

        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            False,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )
torch.jit interprets 2**31 as a float + } + + # TODO: add an util function for converting qdtype to dtype + _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE = { + torch.quint8: torch.uint8, + torch.qint8: torch.int8, + torch.qint32: torch.int32, + } + if weight_qscheme == torch.per_tensor_affine: + if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]: + weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype] + if weight_quant_min is None or weight_quant_max is None: + weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[ + weight_dtype_ + ] + weight = torch.ops.quantized_decomposed.quantize_per_tensor( + weight, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + weight_dtype_, + ) + return weight + elif weight_qscheme in [ + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ]: + # TODO: torch.quint4x2 is not supported + if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]: + weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype] + if weight_quant_min is None or weight_quant_max is None: + weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[ + weight_dtype_ + ] + weight = torch.ops.quantized_decomposed.quantize_per_channel( + weight, + weight_scale, + weight_zero_point, + weight_axis, + weight_quant_min, + weight_quant_max, + weight_dtype_, + ) # type: ignore[arg-type] + return weight + raise ValueError(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}") + + +def _dequantize_weight_decomposed( + weight: torch.Tensor, + weight_qscheme: torch.qscheme, + weight_dtype: torch.dtype, + weight_scale: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_axis: int, + weight_quant_min: int | None, + weight_quant_max: int | None, +) -> torch.Tensor: + # TODO: get the quant_min and quant_max from activation_post_process + _DTYPE_TO_QVALUE_BOUNDS: dict[torch.dtype, tuple[int, int]] = { + torch.uint8: (0, 255), + torch.int8: (-128, 127), + torch.int32: (-2147483648, 
2147483647), # torch.jit interprets 2**31 as a float + } + # TODO: add an util function for converting qdtype to dtype + _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE = { + torch.quint8: torch.uint8, + torch.qint8: torch.int8, + torch.qint32: torch.int32, + } + weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype] + if weight_quant_min is None or weight_quant_max is None: + weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[weight_dtype_] + if weight_qscheme == torch.per_tensor_affine: + if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]: + weight = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + weight_dtype_, + ) + return weight + elif weight_qscheme in [ + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ]: + # TODO: torch.quint4x2 is not supported + if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]: + weight = torch.ops.quantized_decomposed.dequantize_per_channel( + weight, + weight_scale, + weight_zero_point, + weight_axis, + weight_quant_min, + weight_quant_max, + weight_dtype_, + ) # type: ignore[arg-type] + return weight + raise ValueError(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}") + + +def _quantize_weight( + weight: torch.Tensor, + weight_qscheme: torch.qscheme, + weight_dtype: torch.dtype, + weight_scale: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_axis_int: int, +) -> torch.Tensor: + if weight_dtype == torch.float16: + weight = weight.to(weight_dtype) + return weight + + if weight_qscheme == torch.per_tensor_affine: + if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]: + weight = torch.quantize_per_tensor( + weight, weight_scale, weight_zero_point, weight_dtype + ) + return weight + elif weight_qscheme in [ + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ]: + if weight_dtype in [torch.quint8, torch.qint8, torch.quint4x2, 
def _quantize_and_dequantize_weight_decomposed(
    weight: torch.Tensor,
    weight_qscheme: torch.qscheme,
    weight_dtype: torch.dtype,
    weight_scale: torch.Tensor,
    weight_zero_point: torch.Tensor,
    weight_axis_int: int,
    weight_quant_min: int | None,
    weight_quant_max: int | None,
) -> torch.Tensor:
    """Fake-quantize ``weight`` using the decomposed quantize/dequantize ops.

    For qschemes other than per-tensor affine and the two per-channel
    schemes, the weight is returned unchanged.
    """
    quantizable_qschemes = [
        torch.per_tensor_affine,
        torch.per_channel_affine,
        torch.per_channel_affine_float_qparams,
    ]
    if weight_qscheme not in quantizable_qschemes:
        return weight

    quantized = _quantize_weight_decomposed(
        weight,
        weight_qscheme,
        weight_dtype,
        weight_scale,
        weight_zero_point,
        weight_axis_int,
        weight_quant_min,
        weight_quant_max,
    )
    return _dequantize_weight_decomposed(
        quantized,
        weight_qscheme,
        weight_dtype,
        weight_scale,
        weight_zero_point,
        weight_axis_int,
        weight_quant_min,
        weight_quant_max,
    )


def _quantize_and_dequantize_weight(
    weight: torch.Tensor,
    weight_qscheme: torch.qscheme,
    weight_dtype: torch.dtype,
    weight_scale: torch.Tensor,
    weight_zero_point: torch.Tensor,
    weight_axis_int: int,
) -> torch.Tensor:
    """Fake-quantize ``weight`` via ``_quantize_weight`` and ``dequantize()``.

    For qschemes other than per-tensor affine and the two per-channel
    schemes, the weight is returned unchanged.
    """
    quantizable_qschemes = [
        torch.per_tensor_affine,
        torch.per_channel_affine,
        torch.per_channel_affine_float_qparams,
    ]
    if weight_qscheme not in quantizable_qschemes:
        return weight

    quantized = _quantize_weight(
        weight,
        weight_qscheme,
        weight_dtype,
        weight_scale,
        weight_zero_point,
        weight_axis_int,
    )
    return quantized.dequantize()
weight_qscheme, + weight_dtype, + weight_scale, + weight_zero_point, + weight_axis, +): + destination[prefix + "weight_qscheme"] = weight_qscheme + destination[prefix + "weight_dtype"] = weight_dtype + if weight_qscheme is not None: + destination[prefix + "weight_scale"] = weight_scale + destination[prefix + "weight_zero_point"] = weight_zero_point + if weight_qscheme == torch.per_channel_affine: + destination[prefix + "weight_axis"] = weight_axis + + +def _get_weight_qparam_keys(state_dict: dict[str, typing.Any], prefix: str): + keys = ["weight_qscheme", "weight_dtype"] + weight_qscheme = state_dict[prefix + "weight_qscheme"] + if weight_qscheme is not None: + keys.append("weight_scale") + keys.append("weight_zero_point") + if weight_qscheme == torch.quantize_per_channel: + keys.append("weight_axis") + return keys diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0fda5a58f2984ee05b0d167297b458f62c37fc59 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__init__.py @@ -0,0 +1 @@ +from . 
import quantized diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3759036fb1abc7dfca136083371b20d04cfb1613 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef66c90b0e8ecdbc7cd2cfb4c1cecf0bc38e8466 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__init__.py @@ -0,0 +1,10 @@ +from torch.ao.nn.sparse.quantized import dynamic + +from .linear import Linear, LinearPackedParams + + +__all__ = [ + "dynamic", + "Linear", + "LinearPackedParams", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b04421eb3ae125677272c6afa6aab5c5b7f0b1dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..466a2ee60241cb8404c6f75bc8b07f6353f0d8a1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96949caf4a30afd66e6d05c1e5737327d362bd9e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..91ecfd8793dc08b96ed64f47f531724aa8a866d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py @@ -0,0 +1,6 @@ +from .linear import Linear + + +__all__ = [ + "Linear", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6875d89edeb8aac39104411720c8b6d8d08c3748 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c2a7f384930a32cf41c49659a5cb4d9f0ca1b26 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..d327cabd0d3681cce4ec4b7d62f0f9e734ad0730 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py @@ -0,0 +1,191 @@ +# mypy: allow-untyped-defs + +import torch +import torch.ao.nn.intrinsic as nni +from torch.ao.nn.quantized.modules.utils import ( + _hide_packed_params_repr, + _quantize_weight, +) +from torch.ao.nn.sparse.quantized import linear +from torch.ao.nn.sparse.quantized.utils import LinearBlockSparsePattern + + +__all__ = ["Linear"] + + +class Linear(torch.nn.Module): + r""" + A dynamically quantized sparse linear module with float tensor as inputs and outputs. 
+ """ + + _version = 1 + _op_type = "sparse_dynamic" + _FLOAT_MODULE = torch.nn.Linear + + def __init__( + self, + in_features, + out_features, + row_block_size, + col_block_size, + bias=True, + dtype=torch.qint8, + ): + super().__init__() + + if dtype != torch.qint8: + raise NotImplementedError( + "Only QINT8 is supported for Sparse Quantized Linear Dynamic" + ) + + self.in_features = in_features + self.out_features = out_features + + if bias: + bias = torch.zeros(self.out_features, dtype=torch.float) + else: + bias = None + + qweight = torch._empty_affine_quantized( + [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8 + ) + self._packed_params = linear.LinearPackedParams( + row_block_size=row_block_size, col_block_size=col_block_size, dtype=dtype + ) + self._packed_params.set_weight_bias( + qweight, bias, row_block_size, col_block_size + ) + + def _get_name(self): + return "SparseQuantizedDynamicLinear" + + def extra_repr(self): + return f"in_features={self.in_features}, out_features={self.out_features}, qscheme={self.weight().qscheme()}" + + def __repr__(self): + return _hide_packed_params_repr(self, linear.LinearPackedParams) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.sparse.qlinear_dynamic(x, self._packed_params._packed_params) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "op_type"] = self._op_type + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + op_type = int(state_dict[prefix + "op_type"]) + assert op_type == "sparse", ( + f"Cannot load from op_type [{op_type}], expecting [{self._op_type}]" + ) + state_dict.pop(prefix + "op_type") + + version = local_metadata.get("version", None) + assert version <= self._version + + # Is this code valid? 
In old quantization it seemed to be used to load + # older model + weight = state_dict.pop(prefix + "weight") + bias = state_dict.pop(prefix + "bias") + state_dict.update( + { + prefix + "_packed_params.weight": weight, + prefix + "_packed_params.bias": bias, + } + ) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def _weight_bias(self): + return self._packed_params._weight_bias() + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def set_weight_bias( + self, + w: torch.Tensor, + b: torch.Tensor | None, + row_block_size: int | None, + col_block_size: int | None, + ) -> None: + assert row_block_size is not None and col_block_size is not None + self.out_features = w.shape[0] + self.in_features = w.shape[1] + self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a quantized sparse dynamic module from a float module. + + We only care about the convert at this stage, no need for observers just yet. + """ + assert type(mod) is cls._FLOAT_MODULE, ( + " nnq." + + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + # TODO: Need to add options to qconfig to avoid the calibration. + # TODO: Add calibration for the sparsity + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + if type(mod) is nni.LinearReLU: + mod = mod[0] + # pyrefly: ignore [missing-attribute] + if mod.qconfig is not None and mod.qconfig.weight is not None: + # pyrefly: ignore [not-callable] + weight_observer = mod.qconfig.weight() + else: + # We have the circular import issues if we import the qconfig in the beginning of this file: + # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the + # import until we need it. 
+ from torch.ao.quantization.qconfig import default_dynamic_qconfig + + weight_observer = default_dynamic_qconfig.weight() + + # It is important to multiply by the mask BEFORE calling the `weight_observer` + # TODO (zaf): Mask might not be part of the qconfig (T83295194) + weight = mod.weight + if getattr(mod.qconfig, "mask", False): + weight = mod.qconfig.mask * mod.weight + + weight_observer(weight) + dtype = weight_observer.dtype + assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8" + _w_sc, w_zp = weight_observer.calculate_qparams() + if isinstance(w_zp, torch.Tensor): + assert not torch.any(w_zp.bool()), "All weight zero points must map to 0" + else: + assert w_zp == 0, "Weight zero point must map to 0" + qweight = _quantize_weight(weight.float(), weight_observer) + + row_block_size, col_block_size = LinearBlockSparsePattern.block_size() + qlinear = cls( + mod.in_features, + mod.out_features, + row_block_size, + col_block_size, + dtype=dtype, + ) + # pyrefly: ignore [bad-argument-type] + qlinear.set_weight_bias(qweight, mod.bias, row_block_size, col_block_size) + return qlinear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..f106a32abfbf960b989c8eba860db2dec4a7fe4c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/linear.py @@ -0,0 +1,274 @@ +# mypy: allow-untyped-defs + +import torch +from torch.ao.nn.quantized.modules.utils import ( + _hide_packed_params_repr, + _quantize_weight, +) + + +__all__ = ["LinearPackedParams", "Linear"] + + +# TODO (zaf): Inherit from `quantized.LinearPackedParams` (T83294430) +class LinearPackedParams(torch.nn.Module): + _version = 1 + + def __init__(self, row_block_size=1, col_block_size=4, dtype=torch.qint8): + 
super().__init__() + + if dtype != torch.qint8: + raise NotImplementedError("Linear prepacking only supports QINT8") + self.dtype = dtype + wq = torch._empty_affine_quantized( + [1, 1], scale=1.0, zero_point=0, dtype=torch.qint8 + ) + self.set_weight_bias(wq, None, row_block_size, col_block_size) + + def _get_name(self): + return "SparseQuantizedLinearPackedParams" + + @torch.jit.export + def set_weight_bias( + self, + weight: torch.Tensor, + bias: torch.Tensor | None, + row_block_size: int | None, + col_block_size: int | None, + ) -> None: + assert row_block_size is not None and col_block_size is not None + self._packed_params = torch.ops.sparse.qlinear_prepack( + weight, bias, row_block_size, col_block_size + ) + + @torch.jit.export + def _weight_bias(self): + (weight, bias, block_sizes) = torch.ops.sparse.qlinear_unpack( + self._packed_params + ) + return (weight, bias, block_sizes[0], block_sizes[1]) + + def forward(self, x): + return x + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "dtype"] = self.dtype + destination[prefix + "_packed_params"] = self._weight_bias() + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + assert version <= self._version + + self.dtype = state_dict.pop(prefix + "dtype") + weight, bias, row_block_size, col_block_size = state_dict.pop( + prefix + "_packed_params" + ) + self.set_weight_bias(weight, bias, row_block_size, col_block_size) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + @torch.jit.export + def __getstate__(self): + return self._packed_params, self.training, self.dtype + + @torch.jit.export + def __setstate__(self, state): + (self._packed_params, self.training, self.dtype) = state + + def 
__repr__(self): + return self._weight_bias().__repr__() + + +# TODO (zaf): Inherit from `quantized.Linear` (T83294430) +class Linear(torch.nn.Module): + r""" + A quantized sparse linear module with quantized tensor as inputs and outputs. + """ + + _version = 1 + _FLOAT_MODULE = torch.nn.Linear + + def __init__( + self, + in_features, + out_features, + row_block_size, + col_block_size, + bias=True, + dtype=torch.qint8, + ): + super().__init__() + + if dtype != torch.qint8: + raise NotImplementedError( + "Only QINT8 is supported for Sparse Quantized Linear" + ) + + self.in_features = in_features + self.out_features = out_features + + if bias: + bias = torch.zeros(self.out_features, dtype=torch.float) + else: + bias = None + + qweight = torch._empty_affine_quantized( + [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8 + ) + self._packed_params = LinearPackedParams( + row_block_size=row_block_size, col_block_size=col_block_size, dtype=dtype + ) + self._packed_params.set_weight_bias( + qweight, bias, row_block_size, col_block_size + ) + self.scale = 1.0 + self.zero_point = 0 + + @classmethod + def _get_name(cls): + return "SparseQuantizedLinear" + + def extra_repr(self): + return ( + f"in_features={self.in_features}, out_features={self.out_features}, scale={self.scale}, " + f"zero_point={self.zero_point}, qscheme={self.weight().qscheme()}" + ) + + def __repr__(self): + return _hide_packed_params_repr(self, LinearPackedParams) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.sparse.qlinear( + x, self._packed_params._packed_params, self.scale, self.zero_point + ) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "scale"] = torch.tensor(self.scale) + destination[prefix + "zero_point"] = torch.tensor(self.zero_point) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + 
unexpected_keys, + error_msgs, + ): + self.scale = float(state_dict[prefix + "scale"]) + state_dict.pop(prefix + "scale") + + self.zero_point = int(state_dict[prefix + "zero_point"]) + state_dict.pop(prefix + "zero_point") + + state_dict.pop(prefix + "op_type") + + version = local_metadata.get("version", None) + assert version <= self._version + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def _weight_bias(self): + return self._packed_params._weight_bias() + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def set_weight_bias( + self, + w: torch.Tensor, + b: torch.Tensor | None, + row_block_size: int | None, + col_block_size: int | None, + ) -> None: + assert row_block_size is not None and col_block_size is not None + self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size) + + @classmethod + def from_float(cls, mod, use_precomputed_fake_quant=False): + r"""Create a quantized sparse module from a float module. + + We only care about the convert at this stage, no need for observers just yet. + + TODO(zaf): Need to add the sparse params to the qconfig + """ + assert type(mod) is cls._FLOAT_MODULE, ( + cls._get_name() + ".from_float only works for " + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "sparse_params"), ( + "Expecting the Linear to have `sparse_params`. Make sure you have provided arguments " + 'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.' + ) + sparse_block_shape = mod.sparse_params.get("sparse_block_shape", None) # type: ignore[operator, union-attr] + assert isinstance(sparse_block_shape, (tuple, list)) + assert len(sparse_block_shape) == 2 + # TODO: Need to add options to qconfig to avoid the calibration. 
+ # TODO: Add calibration for the sparsity + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + activation_post_process = mod.activation_post_process + weight_post_process = mod.qconfig.weight() # type: ignore[operator, union-attr] + + # Assumption is that the weight is already sparsified by the + # `sparsifier.convert` + weight = mod.weight + + weight_post_process(weight) + dtype = weight_post_process.dtype + act_scale, act_zp = activation_post_process.calculate_qparams() # type: ignore[operator, union-attr] + assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8" + w_sc, w_zp = weight_post_process.calculate_qparams() + if isinstance(w_zp, torch.Tensor): + assert not torch.any(w_zp.bool()), "All weight zero points must map to 0" + else: + assert w_zp == 0, "Weight zero point must map to 0" + qweight = _quantize_weight(weight.float(), weight_post_process) + + row_block_size = mod.sparse_params["sparse_block_shape"][0] # type: ignore[index] + col_block_size = mod.sparse_params["sparse_block_shape"][1] # type: ignore[index] + qlinear = cls( + mod.in_features, + mod.out_features, + row_block_size, + col_block_size, + dtype=dtype, + ) + qlinear.set_weight_bias( + qweight, + mod.bias, + row_block_size, # type: ignore[arg-type] + col_block_size, # type: ignore[arg-type] + ) + qlinear.scale = float(act_scale) + qlinear.zero_point = int(act_zp) + return qlinear diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2cfd4a5973dfa8a5219f5ca97246424ae17a6308 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/utils.py @@ -0,0 +1,62 @@ +import threading + + +__all__ = ["LinearBlockSparsePattern"] + + +def _is_valid_linear_block_sparse_pattern( + 
# This is a stop-gap measure as current flow does not allow module
# specific block sparse pattern.
# In fact there is no way to convey sparse pattern via module config
# of quantization flow. Thus using the global context to convey
# sparsity pattern.
# Once the flow supports it, this should be removed.
class LinearBlockSparsePattern:
    """Process-wide holder of the linear block-sparse pattern.

    Constructing an instance acquires the class-level re-entrant lock,
    remembers the current block sizes and installs the requested ones;
    leaving the ``with`` block restores the remembered sizes and releases
    the lock.  ``block_size()`` reads the sizes currently installed.
    """

    rlock = threading.RLock()
    row_block_size: int = 1
    col_block_size: int = 4
    prev_row_block_size: int = 1
    prev_col_block_size: int = 4

    def __init__(self, row_block_size: int = 1, col_block_size: int = 4):
        assert _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size)
        shared = LinearBlockSparsePattern
        # The lock is taken here (not in __enter__) and released in __exit__.
        shared.rlock.acquire()
        shared.prev_row_block_size = shared.row_block_size
        shared.prev_col_block_size = shared.col_block_size
        shared.row_block_size = row_block_size
        shared.col_block_size = col_block_size

    def __enter__(self) -> None:
        pass

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        backtrace: object | None,
    ) -> None:
        shared = LinearBlockSparsePattern
        # Put back what was active before this instance was constructed.
        shared.row_block_size = shared.prev_row_block_size
        shared.col_block_size = shared.prev_col_block_size
        shared.rlock.release()

    @staticmethod
    def block_size() -> tuple[int, int]:
        """Return the currently installed ``(row_block_size, col_block_size)``."""
        shared = LinearBlockSparsePattern
        return (shared.row_block_size, shared.col_block_size)
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cfcd51af91e25c13076c8930312ba9c9df4e7bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec6195b14e08e26e450cf6f3c332e41f0dc89354 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f3d2d766dd1db6ffcaf85704ce7ff7d17a0bb4e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite.py new file mode 100644 index 0000000000000000000000000000000000000000..026ac73606e307bedd500a801a76ba1a97c4c655 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite.py @@ 
-0,0 +1,568 @@ +# mypy: allow-untyped-defs +from collections.abc import Callable +from typing import Any + +import torch +import torch.ao.nn.quantized as nnq +import torch.ao.nn.quantized.dynamic as nnqd +import torch.nn as nn +from torch.ao.quantization import prepare +from torch.ao.quantization.quantization_mappings import ( + get_default_compare_output_module_list, +) + + +NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST = { + nnqd.Linear, + nnq.Linear, + nnqd.LSTM, + nn.LSTM, +} + + +def _find_match( + str_list: dict[str, Any] | list[str], + key_str: str, + postfix: str, +) -> str | None: + split_str = key_str.split(".") + if split_str[-1] == postfix: + match_string = "".join(key_str.split(".")[0:-1]) + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == pattern1: + return s2 + if match_string == pattern2: + return s2 + + # For matching "fc.weight" and "fc._packed_params._packed_params" + if postfix == "_packed_params": + match_string = "".join(key_str.split(".")[0:-2]) + if len(match_string) == 0: + return None + for s2 in str_list: + pattern1 = "".join(s2.split(".")[0:-1]) + pattern2 = "".join(s2.split(".")[0:-2]) + if match_string == pattern1: + return s2 + if match_string == pattern2: + return s2 + return None + else: + return None + + +def compare_weights( + float_dict: dict[str, Any], quantized_dict: dict[str, Any] +) -> dict[str, dict[str, torch.Tensor]]: + r"""Compare the weights of the float module with its corresponding quantized + module. Return a dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights. This dict can be used to compare and compute the quantization + error of the weights of float and quantized models. 
+ + Example usage:: + + wt_compare_dict = compare_weights(float_model.state_dict(), qmodel.state_dict()) + for key in wt_compare_dict: + print( + key, + compute_error( + wt_compare_dict[key]["float"], + wt_compare_dict[key]["quantized"].dequantize(), + ), + ) + + Args: + float_dict: state dict of the float model + quantized_dict: state dict of the quantized model + + Return: + weight_dict: dict with key corresponding to module names and each entry being + a dictionary with two keys 'float' and 'quantized', containing the float and + quantized weights + """ + torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_weights") + weight_dict: dict[str, dict] = {} + for key in quantized_dict: + match_key = _find_match(float_dict, key, "weight") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key] + continue + + # For matching "fc.weight" and "fc._packed_params._packed_params" + match_key = _find_match(float_dict, key, "_packed_params") + if match_key is not None: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[match_key] + weight_dict[key]["quantized"] = quantized_dict[key][0] + + # For LSTM + split_str = key.split(".") + if split_str[-1] == "param" and split_str[-3] == "_all_weight_values": + layer = split_str[-2] + module_name = ".".join(split_str[:-3]) + float_weight_ih_key = module_name + ".weight_ih_l" + layer + float_weight_hh_key = module_name + ".weight_hh_l" + layer + if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict: + weight_dict[key] = {} + weight_dict[key]["float"] = float_dict[float_weight_ih_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0] + ) + weight_dict[key]["float"] = float_dict[float_weight_hh_key] + weight_dict[key]["quantized"] = ( + quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0] + ) + + return weight_dict + + +def 
_get_logger_dict_helper( + mod: nn.Module, + target_dict: dict[str, Any], + prefix: str = "", +) -> None: + r"""This is the helper function for get_logger_dict + + Args: + mod: module we want to save all logger stats + prefix: prefix for the current module + target_dict: the dictionary used to save all logger stats + """ + + def get_prefix(prefix): + return prefix if prefix == "" else prefix + "." + + for child in mod.children(): + if isinstance(child, Logger): + target_dict[get_prefix(prefix) + "stats"] = child.stats + break + + for name, child in mod.named_children(): + module_prefix = get_prefix(prefix) + name if prefix else name + _get_logger_dict_helper(child, target_dict, module_prefix) + + +def get_logger_dict(mod: nn.Module, prefix: str = "") -> dict[str, dict]: + r"""Traverse the modules and save all logger stats into target dict. + This is mainly used for quantization accuracy debug. + + Type of loggers supported: + ShadowLogger: used to log the outputs of the quantized module and its matching float shadow module, + OutputLogger: used to log the outputs of the modules + + Args: + mod: module we want to save all logger stats + prefix: prefix for the current module + + Return: + target_dict: the dictionary used to save all logger stats + + """ + torch._C._log_api_usage_once("quantization_api._numeric_suite.get_logger_dict") + + target_dict: dict[str, dict] = {} + _get_logger_dict_helper(mod, target_dict, prefix) + return target_dict + + +class Logger(nn.Module): + r"""Base class for stats logging""" + + def __init__(self): + super().__init__() + self.stats = {} + # We only insert observer if the op is quantized with static quantization, + # which is identified by activation_observer.dtype == quint8. 
This is needed + # when attaching Logger as observer for FX mode + self.dtype = torch.quint8 + + def forward(self, x): + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + + +class ShadowLogger(Logger): + r"""Class used in Shadow module to record the outputs of the original and + shadow modules. + """ + + def __init__(self): + super().__init__() + self.stats["float"] = [] + self.stats["quantized"] = [] + + def forward(self, x, y): # type: ignore[override] + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + if len(x) > 1: + x = x[0] + if len(y) > 1: + y = y[0] + self.stats["quantized"].append(x.detach()) + self.stats["float"].append(y.detach()) + + +class OutputLogger(Logger): + r"""Class used to log the outputs of the module""" + + def __init__(self): + super().__init__() + self.stats["tensor_val"] = [] + + def forward(self, x): + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + self.stats["tensor_val"].append(x) + return x + + +def _convert_tuple_to_list(t: Any) -> Any: + return [_convert_tuple_to_list(x) for x in t] if type(t) is tuple else t + + +def _dequantize_tensor_list(t: Any) -> Any: + return ( + [_dequantize_tensor_list(x) for x in t] + if type(t) is list + else t.dequantize() + if t.is_quantized + else t + ) + + +class Shadow(nn.Module): + r"""Shadow module attaches the float module to its matching quantized module + as the shadow. Then it uses Logger module to process the outputs of both + modules. + + Args: + q_module: module quantized from float_module that we want to shadow + float_module: float module used to shadow q_module + logger_cls: type of logger used to process the outputs of q_module and + float_module. ShadowLogger or custom loggers can be used. 
+ """ + + def __init__(self, q_module, float_module, logger_cls): + super().__init__() + self.orig_module = q_module + self.shadow_module = float_module + self.dequant = nnq.DeQuantize() + self.logger = logger_cls() + + def forward(self, *x) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + xl = _convert_tuple_to_list(x) + output = self.orig_module(*xl) + xl_float = _dequantize_tensor_list(xl) + shadow_output = self.shadow_module(*xl_float) + self.logger(output, shadow_output) + return output + + def add(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + output = self.orig_module.add(x, y) + x = x.dequantize() + y = y.dequantize() + shadow_output = self.shadow_module.add(x, y) + self.logger(output, shadow_output) + return output + + def add_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + output = self.orig_module.add_scalar(x, y) + x = x.dequantize() + shadow_output = self.shadow_module.add_scalar(x, y) + self.logger(output, shadow_output) + return output + + def mul(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + output = self.orig_module.mul(x, y) + x = x.dequantize() + y = y.dequantize() + shadow_output = self.shadow_module.mul(x, y) + self.logger(output, shadow_output) + return output + + def mul_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + output = self.orig_module.mul_scalar(x, y) + x = x.dequantize() + shadow_output = self.shadow_module.mul_scalar(x, y) + self.logger(output, shadow_output) + return output + + def cat(self, x: list[torch.Tensor], dim: int = 0) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + output = 
self.orig_module.cat(x, dim) + x = [y.dequantize() for y in x] + shadow_output = self.shadow_module.cat(x, dim) + self.logger(output, shadow_output) + return output + + def add_relu(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + output = self.orig_module.add_relu(x, y) + x = x.dequantize() + y = y.dequantize() + shadow_output = self.shadow_module.add_relu(x, y) + self.logger(output, shadow_output) + return output + + +def prepare_model_with_stubs( + float_module: nn.Module, + q_module: nn.Module, + module_swap_list: set[type], + logger_cls: Callable, +) -> None: + r"""Prepare the model by attaching the float module to its matching quantized + module as the shadow if the float module type is in module_swap_list. + + Example usage:: + + prepare_model_with_stubs(float_model, q_model, module_swap_list, Logger) + q_model(data) + ob_dict = get_logger_dict(q_model) + + Args: + float_module: float module used to generate the q_module + q_module: module quantized from float_module + module_swap_list: list of float module types to attach the shadow + logger_cls: type of logger to be used in shadow module to process the outputs of + quantized module and its float shadow module + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite.prepare_model_with_stubs" + ) + + float_module_children = dict(float_module.named_children()) + + reassign = {} + for name, mod in q_module.named_children(): + if name not in float_module_children: + continue + + float_mod = float_module_children[name] + + if type(float_mod) not in module_swap_list: + prepare_model_with_stubs(float_mod, mod, module_swap_list, logger_cls) + + # Insert shadow module only if the module is not of the same type as + # the floating point module + if type(float_mod) in module_swap_list and not _is_identical_module_type( + mod, float_mod + ): + reassign[name] = Shadow(mod, float_mod, logger_cls) + + for key, value in 
reassign.items(): + q_module._modules[key] = value + + +def _is_identical_module_type(mod1, mod2): + # Compare if two modules have the same dtype + mod1_module_types = [type(mod) for mod in mod1.modules()] + mod2_module_types = [type(mod) for mod in mod2.modules()] + return mod1_module_types == mod2_module_types + + +def compare_model_stub( + float_model: nn.Module, + q_model: nn.Module, + module_swap_list: set[type], + *data, + logger_cls=ShadowLogger, +) -> dict[str, dict]: + r"""Compare quantized module in a model with its floating point counterpart, + feeding both of them the same input. Return a dict with key corresponding to + module names and each entry being a dictionary with two keys 'float' and + 'quantized', containing the output tensors of quantized and its matching + float shadow module. This dict can be used to compare and compute the module + level quantization error. + + This function first call prepare_model_with_stubs() to swap the quantized + module that we want to compare with the Shadow module, which takes quantized + module, corresponding float module and logger as input, and creates a forward + path inside to make the float module to shadow quantized module sharing the + same input. The logger can be customizable, default logger is ShadowLogger + and it will save the outputs of the quantized module and float module that + can be used to compute the module level quantization error. + + Example usage:: + + module_swap_list = [ + torchvision.models.quantization.resnet.QuantizableBasicBlock + ] + ob_dict = compare_model_stub(float_model, qmodel, module_swap_list, data) + for key in ob_dict: + print( + key, + compute_error( + ob_dict[key]["float"], ob_dict[key]["quantized"].dequantize() + ), + ) + + Args: + float_model: float model used to generate the q_model + q_model: model quantized from float_model + module_swap_list: list of float module types at which shadow modules will + be attached. 
+ data: input data used to run the prepared q_model + logger_cls: type of logger to be used in shadow module to process the outputs of + quantized module and its float shadow module + """ + torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_model_stub") + prepare_model_with_stubs(float_model, q_model, module_swap_list, logger_cls) + q_model(*data) + ob_dict = get_logger_dict(q_model) + return ob_dict + + +def get_matching_activations( + float_module: nn.Module, + q_module: nn.Module, +) -> dict[str, dict[str, torch.Tensor]]: + r"""Find the matching activation between float and quantized modules. + + Args: + float_module: float module used to generate the q_module + q_module: module quantized from float_module + + Return: + act_dict: dict with key corresponding to quantized module names and each + entry being a dictionary with two keys 'float' and 'quantized', containing + the matching float and quantized activations + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite.get_matching_activations" + ) + float_dict = get_logger_dict(float_module) + quantized_dict = get_logger_dict(q_module) + act_dict: dict[str, dict] = {} + for key in quantized_dict: + if len(quantized_dict[key]["tensor_val"]) == 0: + continue + match_key = _find_match(sorted(float_dict, reverse=True), key, "stats") + if match_key is not None: + act_dict[key] = {} + act_dict[key]["float"] = float_dict[match_key]["tensor_val"] + act_dict[key]["quantized"] = quantized_dict[key]["tensor_val"] + return act_dict + + +def prepare_model_outputs( + float_module: nn.Module, + q_module: nn.Module, + logger_cls=OutputLogger, + allow_list=None, +) -> None: + r"""Prepare the model by attaching the logger to both float module + and quantized module if they are in the allow_list. 
+ + Args: + float_module: float module used to generate the q_module + q_module: module quantized from float_module + logger_cls: type of logger to be attached to float_module and q_module + allow_list: list of module types to attach logger + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite.prepare_model_outputs" + ) + if allow_list is None: + allow_list = get_default_compare_output_module_list() + + qconfig_debug = torch.ao.quantization.QConfig(activation=logger_cls, weight=None) + float_module.qconfig = qconfig_debug # type: ignore[assignment] + prepare( + float_module, inplace=True, allow_list=allow_list, prepare_custom_config_dict={} + ) + q_module.qconfig = qconfig_debug # type: ignore[assignment] + prepare( + q_module, + inplace=True, + allow_list=allow_list, + observer_non_leaf_module_list=NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST, + prepare_custom_config_dict={}, + ) + + +def compare_model_outputs( + float_model: nn.Module, + q_model: nn.Module, + *data, + logger_cls=OutputLogger, + allow_list=None, +) -> dict[str, dict[str, torch.Tensor]]: + r"""Compare output activations between float and quantized models at + corresponding locations for the same input. Return a dict with key corresponding + to quantized module names and each entry being a dictionary with two keys + 'float' and 'quantized', containing the activations of quantized model and + float model at matching locations. This dict can be used to compare and + compute the propagation quantization error. 
+ + Example usage:: + + act_compare_dict = compare_model_outputs(float_model, qmodel, data) + for key in act_compare_dict: + print( + key, + compute_error( + act_compare_dict[key]["float"], + act_compare_dict[key]["quantized"].dequantize(), + ), + ) + + Args: + float_model: float model used to generate the q_model + q_model: model quantized from float_model + data: input data used to run the prepared float_model and q_model + logger_cls: type of logger to be attached to float_module and q_module + allow_list: list of module types to attach logger + + Return: + act_compare_dict: dict with key corresponding to quantized module names + and each entry being a dictionary with two keys 'float' and 'quantized', + containing the matching float and quantized activations + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite.compare_model_outputs" + ) + if allow_list is None: + allow_list = get_default_compare_output_module_list() + prepare_model_outputs(float_model, q_model, logger_cls, allow_list) + float_model(*data) + q_model(*data) + act_compare_dict = get_matching_activations(float_model, q_model) + return act_compare_dict diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite_fx.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite_fx.py new file mode 100644 index 0000000000000000000000000000000000000000..1861d0160db152e73debda3bda7f714ca4bbf601 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite_fx.py @@ -0,0 +1,1121 @@ +# mypy: allow-untyped-defs +""" +This module contains tooling to compare weights and activations +across models. 
Example usage:: + + import copy + import torch + import torch.ao.quantization.quantize_fx as quantize_fx + import torch.ao.ns._numeric_suite_fx as ns + + m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)).eval() + mp = quantize_fx.prepare_fx(m, {"": torch.ao.quantization.default_qconfig}) + # We convert a copy because we need the original prepared model + # to be available for comparisons, and `quantize_fx.convert_fx` is inplace. + mq = quantize_fx.convert_fx(copy.deepcopy(mp)) + + # + # Comparing weights + # + + # extract weight pairs + weight_comparison = ns.extract_weights("a", mp, "b", mq) + + # add SQNR for each comparison, inplace + ns.extend_logger_results_with_comparison( + weight_comparison, "a", "b", torch.ao.ns.fx.utils.compute_sqnr, "sqnr" + ) + + # weight_comparison contains the weights from `mp` and `mq` stored + # in pairs, and can be used for further analysis. + + + # + # Comparing activations, with error propagation + # + + # add loggers + mp_ns, mq_ns = ns.add_loggers( + "a", copy.deepcopy(mp), "b", copy.deepcopy(mq), ns.OutputLogger + ) + + # send an example datum to capture intermediate activations + datum = torch.randn(1, 1, 1, 1) + mp_ns(datum) + mq_ns(datum) + + # extract intermediate activations + act_comparison = ns.extract_logger_info(mp_ns, mq_ns, ns.OutputLogger, "b") + + # add SQNR for each comparison, inplace + ns.extend_logger_results_with_comparison( + act_comparison, "a", "b", torch.ao.ns.fx.utils.compute_sqnr, "sqnr" + ) + + # act_comparison contains the activations from `mp_ns` and `mq_ns` stored + # in pairs, and can be used for further analysis. 
+ + # + # Comparing activations, without error propagation + # + + # create shadow model + mp_shadows_mq = ns.add_shadow_loggers( + "a", copy.deepcopy(mp), "b", copy.deepcopy(mq), ns.OutputLogger + ) + + # send an example datum to capture intermediate activations + datum = torch.randn(1, 1, 1, 1) + mp_shadows_mq(datum) + + # extract intermediate activations + shadow_act_comparison = ns.extract_shadow_logger_info( + mp_shadows_mq, ns.OutputLogger, "b" + ) + + # add SQNR for each comparison, inplace + ns.extend_logger_results_with_comparison( + shadow_act_comparison, "a", "b", torch.ao.ns.fx.utils.compute_sqnr, "sqnr" + ) + + # shadow_act_comparison contains the activations from `mp_ns` and `mq_ns` stored + # in pairs, and can be used for further analysis. + +""" + +import collections +from collections.abc import Callable +from typing import Any, TYPE_CHECKING + +import torch +import torch.ao.quantization.quantize_fx as quantize_fx +import torch.nn as nn +from torch.ao.ns.fx.graph_matcher import get_matching_subgraph_pairs +from torch.ao.ns.fx.mappings import get_base_name_to_sets_of_related_ops +from torch.ao.ns.fx.n_shadows_utils import ( + _get_dedup_subgraphs, + create_add_loggers_graph, + create_n_transformed_and_logged_copies_of_subgraph, + create_results_comparison, + extract_weight_comparison, + group_results_by_subgraph, + OutputProp, + print_n_shadows_summary, + SHADOW_WRAPPER_NODE_NAME_PREFIX, +) +from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping +from torch.ao.quantization import QConfigMapping +from torch.ao.quantization.backend_config import BackendConfig +from torch.ao.quantization.backend_config.utils import ( + get_fusion_pattern_to_root_node_getter, +) +from torch.ao.quantization.fx.graph_module import _get_observed_graph_module_attr +from torch.ao.quantization.fx.match_utils import _find_matches +from torch.ao.quantization.fx.qconfig_mapping_utils import ( + _generate_node_name_to_qconfig, +) +from 
torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers +from torch.fx import GraphModule +from torch.fx.graph import Node + +from .fx.graph_passes import add_loggers_to_model, create_a_shadows_b +from .fx.ns_types import NSNodeTargetType, NSResultsType, NSSingleResultValuesType +from .fx.utils import ( + get_target_type_str, + maybe_add_missing_fqns, + rekey_logger_info_on_node_name_of_model, +) +from .fx.weight_utils import extract_weight_from_node + + +if TYPE_CHECKING: + from torch.ao.quantization.qconfig import QConfigAny + +RNNReturnType = tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]] + + +class OutputLogger(nn.Module): + """ + Base class for capturing intermediate values. + """ + + stats: list[torch.Tensor] + stats_rnn: list[RNNReturnType] + + # Mark as impure so that calls to it will not be removed during DCE. + _is_impure = True + + def __init__( + self, + ref_node_name: str, + prev_node_name: str, + model_name: str, + ref_name: str, + prev_node_target_type: str, + ref_node_target_type: str, + results_type: str, + index_within_arg: int, + index_of_arg: int, + fqn: str | None, + qconfig_str: str | None = "", + ): + super().__init__() + self.stats: list[torch.Tensor] = [] + self.stats_rnn: list[RNNReturnType] = [] + + # name of the node which was responsible for adding this logger + # Note: + # - if we are logging node outputs, this is the same as prev_node_name + # - if we are logging node inputs, this is the name of the node + # whose input this logger is logging. 
+ # + # example, where logger1 is logging input of op1 and logger2 is logging + # the output of op1: + # + # x1 -> logger1 -> op1 -> logger2 -> x2 + # + # in this example, + # - logger1's prev_node_name is x1 and ref_node_name is op1 + # - logger2's prev_node_name is op1 and ref_node_name is op1 + self.ref_node_name = ref_node_name + # name of the node whose output this Logger is capturing + self.prev_node_name = prev_node_name + + # name of the model from which the node originated from + self.model_name = model_name + # reference name, used to match loggers from separate models + # to each other + self.ref_name = ref_name + # type of the target of the node whose output this logger is logging + self.prev_node_target_type = prev_node_target_type + # type of the target of the node which was responsible for adding this + # logger + self.ref_node_target_type = ref_node_target_type + # what kind of values are inside of stats + self.results_type = results_type + # index of this node within the arg of the input/output node + # for example, in cat([x1, x2, x3], dim=0), x2 would have index_within_arg == 1 + self.index_within_arg = index_within_arg + # index of this node within the args of the input/output node + # for example, in add(x1, x2), x2 would have index_of_arg == 1 + self.index_of_arg = index_of_arg + # fully qualified name + self.fqn = fqn + # if loggers are added before prepare_fx, but we do not want + # collect results of calibration, only results after convert_fx + # so, we add a flag to control whether this logger collects data + self.enabled = True + # string representation of qconfig + self.qconfig_str = qconfig_str + # this can be turned off to reduce memory usage during calibration + self.save_activations = True + + # Note: cannot annotate the type of x because TorchScript does not support + # the Union type. 
+ def forward(self, x): + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + # TODO(future PR): consider designing this better, as the difference + # between these two flags is subtle and not obvious. + if not self.enabled: + return x + if not self.save_activations: + return x + # TODO(future PR): consider refactoring this to better reuse the parent + # class + if isinstance(x, torch.Tensor): + self.stats.append(x.detach()) + elif isinstance(x, tuple) and len(x) == 2 and len(x[1]) == 2: + new_res = (x[0].detach(), (x[1][0].detach(), x[1][1].detach())) + self.stats_rnn.append(new_res) + return x + + def __repr__(self): + clean_dict = { + k: v + for k, v in self.__dict__.items() + # skip nn.Module keys + if (k != "training") and not k.startswith("_") + } + return f"OutputLogger({clean_dict})" + + +class OutputComparisonLogger(OutputLogger): + """ + Same as OutputLogger, but also requires the original activation + in order to calculate the comparison at calibration time + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # TODO(future PR): make the comparison function configurable + self.comparison_fn = torch.ao.ns.fx.utils.compute_sqnr + self.comparison_fn_name = "sqnr" + # precalculated comparisons of logger output versus reference + self.comparisons = [] + # precalculated comparisons function + + def forward(self, x, x_ref): # type: ignore[override] + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + if not self.enabled: + return x + if not isinstance(x, torch.Tensor): + raise AssertionError("non-tensor inputs not yet supported") + if self.save_activations: + # save the activation, for debugging + self.stats.append(x.detach()) + # save the comparison + self.comparisons.append(self.comparison_fn(x, x_ref)) + return x + + def __repr__(self): + clean_dict = { + k: v + for k, v in self.__dict__.items() + # skip nn.Module keys + if (k != "training") and not k.startswith("_") + } + return 
f"OutputComparisonLogger({clean_dict})" + + +class NSTracer(quantize_fx.QuantizationTracer): + """ + Just like a regular FX quantization tracer, but treats observers and fake_quantize + modules as leaf modules. + """ + + def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool: + # fmt: off + """ + """ # blank docblock to make autodoc happy + # fmt: on + if isinstance(m, torch.ao.quantization.ObserverBase): + return True + elif isinstance(m, torch.ao.quantization.FakeQuantizeBase): + return True + return super().is_leaf_module(m, module_qualified_name) + + +def _extract_weights_one_model( + model_name: str, + model: GraphModule, + nodes_and_names_to_instrument: list[tuple[Node, str]], + results: NSResultsType, + op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]] + | None = None, +) -> None: + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx._extract_weights_one_model" + ) + for node, ref_name in nodes_and_names_to_instrument: + res_type = NSSingleResultValuesType.WEIGHT.value + extracted_weight = extract_weight_from_node( + node, model, op_to_type_to_weight_extraction_fn + ) + if extracted_weight: + if ref_name not in results: + results[ref_name] = {res_type: {}} + results[ref_name][res_type][model_name] = [extracted_weight] + + +def _extract_weights_impl( + model_name_a: str, + gm_a: GraphModule, + model_name_b: str, + gm_b: GraphModule, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, + op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]] + | None = None, +) -> NSResultsType: + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx._extract_weights_impl" + ) + matched_subgraph_pairs = get_matching_subgraph_pairs( + gm_a, gm_b, base_name_to_sets_of_related_ops, unmatchable_types_map + ) + + # split the subgraph pairs into one data structure for each model + 
nodes_and_names_to_instrument_a: list[tuple[Node, str]] = [] + nodes_and_names_to_instrument_b: list[tuple[Node, str]] = [] + for match_name, match in matched_subgraph_pairs.items(): + subgraph_a, subgraph_b = match + nodes_and_names_to_instrument_a.append((subgraph_a.base_op_node, match_name)) + nodes_and_names_to_instrument_b.append((subgraph_b.base_op_node, match_name)) + + # populate the results, one model at a time + results: NSResultsType = {} + _extract_weights_one_model( + model_name_a, + gm_a, + nodes_and_names_to_instrument_a, + results, + op_to_type_to_weight_extraction_fn, + ) + _extract_weights_one_model( + model_name_b, + gm_b, + nodes_and_names_to_instrument_b, + results, + op_to_type_to_weight_extraction_fn, + ) + + # fill in missing fqn entries + maybe_add_missing_fqns(results) + + # rekey on names of nodes in gm_b + results = rekey_logger_info_on_node_name_of_model(results, model_name_b) + + return results + + +def extract_weights( + model_name_a: str, + model_a: nn.Module, + model_name_b: str, + model_b: nn.Module, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, + op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]] + | None = None, +) -> NSResultsType: + """ + Extract weights from model A and model B, and return a comparison. 
+ + Args: + model_name_a: string name of model A to use in results + model_a: model A + model_name_b: string name of model B to use in results + model_b: model B + base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change + unmatchable_types_map: optional override of unmatchable types, subject to change + op_to_type_to_weight_extraction_fn: optional override of function which extracts weight + from a type, subject to change + + Return: + NSResultsType, containing the weight comparisons + """ + + torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.extract_weights") + if base_name_to_sets_of_related_ops is None: + base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops() + + # TODO(future PR): expose these + skipped_module_names: list[str] = [] + skipped_module_classes: list[Callable] = [] + tracer_a = NSTracer(skipped_module_names, skipped_module_classes) + tracer_b = NSTracer(skipped_module_names, skipped_module_classes) + gm_a = GraphModule(model_a, tracer_a.trace(model_a)) + maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr( + model_a, "node_name_to_scope" + ) + if maybe_model_a_node_name_to_scope is not None: + gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope + gm_b = GraphModule(model_b, tracer_b.trace(model_b)) + maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr( + model_b, "node_name_to_scope" + ) + if maybe_model_b_node_name_to_scope is not None: + gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope + return _extract_weights_impl( + model_name_a, + gm_a, + model_name_b, + gm_b, + base_name_to_sets_of_related_ops, + unmatchable_types_map, + op_to_type_to_weight_extraction_fn, + ) + + +def _add_loggers_one_model( + model_name: str, + model: GraphModule, + nodes_and_names_to_instrument_inputs: list[tuple[Node, str, str]], + nodes_and_names_to_instrument_outputs: list[tuple[Node, str, str]], + logger_cls: Callable, +) -> nn.Module: + 
torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx._add_loggers_one_model" + ) + + # TODO(future PR): do not observe nodes we do not care + # about (both fp32, denylist, etc) + node_to_instrument_inputs_to_ref_name: dict[Node, tuple[str, str]] = {} + node_to_instrument_outputs_to_ref_name: dict[Node, tuple[str, str]] = {} + for node, ref_name, ref_node_type in nodes_and_names_to_instrument_inputs: + node_to_instrument_inputs_to_ref_name[node] = (ref_name, ref_node_type) + for node, ref_name, ref_node_type in nodes_and_names_to_instrument_outputs: + node_to_instrument_outputs_to_ref_name[node] = (ref_name, ref_node_type) + + model = add_loggers_to_model( + model, + node_to_instrument_inputs_to_ref_name, + node_to_instrument_outputs_to_ref_name, + logger_cls, + model_name, + ) + return model + + +def _add_loggers_impl( + name_a: str, + gm_a: GraphModule, + name_b: str, + gm_b: GraphModule, + logger_cls: Callable, + should_log_inputs: bool, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, +) -> tuple[nn.Module, nn.Module]: + torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._add_loggers_impl") + matched_subgraph_pairs = get_matching_subgraph_pairs( + gm_a, gm_b, base_name_to_sets_of_related_ops, unmatchable_types_map + ) + nodes_and_names_to_instrument_inputs_a = [] + nodes_and_names_to_instrument_inputs_b = [] + nodes_and_names_to_instrument_outputs_a = [] + nodes_and_names_to_instrument_outputs_b = [] + for match_name, (subgraph_a, subgraph_b) in matched_subgraph_pairs.items(): + ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a) + ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b) + # Note: for matching inputs we use start_node, such as observing + # the input of linear in linear-relu + if should_log_inputs: + nodes_and_names_to_instrument_inputs_a.append( + (subgraph_a.start_node, 
match_name, ref_node_type_a) + ) + nodes_and_names_to_instrument_inputs_b.append( + (subgraph_b.start_node, match_name, ref_node_type_b) + ) + # Note: for matching activations we always use end_node, + # such as observing the output of relu in linear-relu + nodes_and_names_to_instrument_outputs_a.append( + (subgraph_a.end_node, match_name, ref_node_type_a) + ) + nodes_and_names_to_instrument_outputs_b.append( + (subgraph_b.end_node, match_name, ref_node_type_b) + ) + + new_model_a = _add_loggers_one_model( + name_a, + gm_a, + nodes_and_names_to_instrument_inputs_a, + nodes_and_names_to_instrument_outputs_a, + logger_cls, + ) + new_model_b = _add_loggers_one_model( + name_b, + gm_b, + nodes_and_names_to_instrument_inputs_b, + nodes_and_names_to_instrument_outputs_b, + logger_cls, + ) + return (new_model_a, new_model_b) + + +def add_loggers( + name_a: str, + model_a: nn.Module, + name_b: str, + model_b: nn.Module, + logger_cls: Callable, + should_log_inputs: bool = False, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, +) -> tuple[nn.Module, nn.Module]: + """ + Instrument model A and model B with loggers. + + Args: + name_a: string name of model A to use in results + model_a: model A + name_b: string name of model B to use in results + model_b: model B + logger_cls: class of Logger to use + base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change + unmatchable_types_map: optional override of unmatchable types, subject to change + + Return: + Returns a tuple of (model_a_with_loggers, model_b_with_loggers). Modifies both models inplace. 
+ """ + + torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.add_loggers") + # TODO(future PR): expose these + skipped_module_names: list[str] = [] + skipped_module_classes: list[Callable] = [] + tracer_a = NSTracer(skipped_module_names, skipped_module_classes) + tracer_b = NSTracer(skipped_module_names, skipped_module_classes) + gm_a = GraphModule(model_a, tracer_a.trace(model_a)) + maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr( + model_a, "node_name_to_scope" + ) + if maybe_model_a_node_name_to_scope is not None: + gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope + gm_b = GraphModule(model_b, tracer_b.trace(model_b)) + maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr( + model_b, "node_name_to_scope" + ) + if maybe_model_b_node_name_to_scope is not None: + gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope + return _add_loggers_impl( + name_a, + gm_a, + name_b, + gm_b, + logger_cls, + should_log_inputs=should_log_inputs, + base_name_to_sets_of_related_ops=base_name_to_sets_of_related_ops, + unmatchable_types_map=unmatchable_types_map, + ) + + +def _extract_logger_info_one_model( + model: nn.Module, + results: NSResultsType, + logger_cls: Callable, +) -> None: + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx._extract_logger_info_one_model" + ) + for _gm_name, mod in model.named_modules(): + # TODO(future PR): better check when scripted + is_logger = isinstance(mod, logger_cls) or ( # type: ignore[arg-type] + isinstance(mod, torch.jit.RecursiveScriptModule) + and mod.original_name == "OutputLogger" + ) + if is_logger: + key = mod.ref_name + if key not in results: + results[key] = {} + if mod.model_name in results[key]: + raise AssertionError(f"{mod.model_name} is already present in results") + if mod.results_type not in results[key]: + results[key][mod.results_type] = {} + if mod.model_name not in results[key][mod.results_type]: + 
results[key][mod.results_type][mod.model_name] = [] + stats_to_use = mod.stats + if len(mod.stats_rnn) > 0: + stats_to_use = mod.stats_rnn + data = { + "type": mod.results_type, + "values": stats_to_use, + "ref_node_name": mod.ref_node_name, + "ref_node_target_type": mod.ref_node_target_type, + "prev_node_name": mod.prev_node_name, + "prev_node_target_type": mod.prev_node_target_type, + "index_within_arg": mod.index_within_arg, + "index_of_arg": mod.index_of_arg, + "fqn": mod.fqn, + "qconfig_str": mod.qconfig_str, + } + if hasattr(mod, "comparisons"): + data["comparisons"] = mod.comparisons + data["comparison_fn_name"] = mod.comparison_fn_name + else: + data["comparisons"] = [] + data["comparison_fn_name"] = "" + results[key][mod.results_type][mod.model_name].append(data) + # ensure the list stays sorted + results[key][mod.results_type][mod.model_name].sort( + key=lambda res: f"{res['index_of_arg']}:{res['index_within_arg']}" + ) + + +# TODO(future PR): align on naming +# this is equivalent of just the comparison extraction part of `ns.compare_model_outputs` +def extract_logger_info( + model_a: nn.Module, + model_b: nn.Module, + logger_cls: Callable, + model_name_to_use_for_layer_names: str, +) -> NSResultsType: + """ + Traverse all loggers in `model_a` and `model_b`, and extract the logged + information. 
+ + Args: + model_a: model A + model_b: model B + logger_cls: class of Logger to use + model_name_to_use_for_layer_names: string name of model to use for + layer names in the output + + Return: + NSResultsType, containing the logged comparisons + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx.extract_logger_info" + ) + results: NSResultsType = {} + for model in (model_a, model_b): + _extract_logger_info_one_model(model, results, logger_cls) + # fill in missing fqn entries + maybe_add_missing_fqns(results) + # rekey on the name of model b + results = rekey_logger_info_on_node_name_of_model( + results, model_name_to_use_for_layer_names + ) + return results + + +def _add_shadow_loggers_impl( + name_a: str, + gm_a: GraphModule, + name_b: str, + gm_b: GraphModule, + logger_cls: Callable, + should_log_inputs: bool, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + node_type_to_io_type_map: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, +) -> nn.Module: + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx._add_shadow_loggers_impl" + ) + matched_subgraph_pairs = get_matching_subgraph_pairs( + gm_a, gm_b, base_name_to_sets_of_related_ops, unmatchable_types_map + ) + gm_a_shadows_b = create_a_shadows_b( + name_a, + gm_a, + name_b, + gm_b, + matched_subgraph_pairs, + logger_cls, + should_log_inputs=should_log_inputs, + node_type_to_io_type_map=node_type_to_io_type_map, + ) + return gm_a_shadows_b + + +def add_shadow_loggers( + name_a: str, + model_a: nn.Module, + name_b: str, + model_b: nn.Module, + logger_cls: Callable, + should_log_inputs: bool = False, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + node_type_to_io_type_map: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, +) -> nn.Module: + """ + Instrument model 
A and model B with shadow loggers. + + Args: + name_a: string name of model A to use in results + model_a: model A + name_b: string name of model B to use in results + model_b: model B + logger_cls: class of Logger to use + should_log_inputs: whether to log inputs + base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change + unmatchable_types_map: optional override of unmatchable types, subject to change + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx.add_shadow_loggers" + ) + # TODO(future PR): expose these + skipped_module_names: list[str] = [] + skipped_module_classes: list[Callable] = [] + tracer_a = NSTracer(skipped_module_names, skipped_module_classes) + tracer_b = NSTracer(skipped_module_names, skipped_module_classes) + gm_a = GraphModule(model_a, tracer_a.trace(model_a)) + maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr( + model_a, "node_name_to_scope" + ) + if maybe_model_a_node_name_to_scope is not None: + gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope + gm_b = GraphModule(model_b, tracer_b.trace(model_b)) + maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr( + model_b, "node_name_to_scope" + ) + if maybe_model_b_node_name_to_scope is not None: + gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope + return _add_shadow_loggers_impl( + name_a, + gm_a, + name_b, + gm_b, + logger_cls, + should_log_inputs=should_log_inputs, + base_name_to_sets_of_related_ops=base_name_to_sets_of_related_ops, + node_type_to_io_type_map=node_type_to_io_type_map, + unmatchable_types_map=unmatchable_types_map, + ) + + +def extract_shadow_logger_info( + model_a_shadows_b: nn.Module, + logger_cls: Callable, + model_name_to_use_for_layer_names: str, +) -> NSResultsType: + """ + Traverse all loggers in a shadow model, and extract the logged + information. 
+ + Args: + model_a_shadows_b: shadow model + logger_cls: class of Logger to use + model_name_to_use_for_layer_names: string name of model to use for + layer names in the output + + Return: + NSResultsType, containing the logged comparisons + """ + torch._C._log_api_usage_once( + "quantization_api._numeric_suite_fx.extract_shadow_logger_info" + ) + results: NSResultsType = collections.defaultdict(dict) + _extract_logger_info_one_model(model_a_shadows_b, results, logger_cls) + # fill in missing fqn entries + maybe_add_missing_fqns(results) + # rekey on the name of model b + results = rekey_logger_info_on_node_name_of_model( + results, model_name_to_use_for_layer_names + ) + return dict(results) + + +def extend_logger_results_with_comparison( + results: NSResultsType, + model_name_1: str, + model_name_2: str, + comparison_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor], + comparison_name: str, +) -> None: + """ + Compares the logged values from `model_name_2` against the corresponding + values in `model_name_1`, using `comparison_fn`. Records the result + in `model_name_2`'s results under `comparison_name`. Modifies `results` inplace. + + Args: + results: the result data structure from `extract_logger_info` or + `extract_shadow_logger_info`. 
+ model_name_1: string name of model 1 + model_name_2: string name of model 2 + comparison_fn: function to compare two Tensors + comparison_name: string name of model to use for + layer names in the output + """ + for results_type_to_results in results.values(): + for model_name_to_results in results_type_to_results.values(): + if model_name_1 not in model_name_to_results: + raise AssertionError(f"{model_name_1} not found in results") + if model_name_2 not in model_name_to_results: + raise AssertionError(f"{model_name_2} not found in results") + + results_1 = model_name_to_results[model_name_1] + results_2 = model_name_to_results[model_name_2] + + for result_2 in results_2: + index_within_arg_2 = result_2["index_within_arg"] + index_of_arg_2 = result_2["index_of_arg"] + # find corresponding result_1 + result_1 = None + for cur_result_1 in results_1: + index_within_arg_1 = cur_result_1["index_within_arg"] + index_of_arg_1 = cur_result_1["index_of_arg"] + if (index_within_arg_1 == index_within_arg_2) and ( + index_of_arg_1 == index_of_arg_2 + ): + result_1 = cur_result_1 + break + if result_1 is None: + raise AssertionError("Expected result_1 to be not None") + + values_1 = result_1["values"] + values_2 = result_2["values"] + result_2[comparison_name] = [] + for value_1, value_2 in zip(values_1, values_2): + comparison_result = comparison_fn(value_1, value_2) + result_2[comparison_name].append(comparison_result) + + +def prepare_n_shadows_model( + model: torch.nn.Module, + example_inputs: Any, + qconfig_multi_mapping: QConfigMultiMapping, + backend_config: BackendConfig, + custom_prepare_fn: Callable | None = None, + custom_prepare_kwargs: dict[str, Any] | None = None, + custom_tracer: Any = None, +) -> GraphModule: + """ + Given a model with a graph with M ops such as + + + args_kwargs_m -> op_m -> output_m + + + And a set of N qconfigs for each op, creates a new model, with + each of the subgraph of `op_m` transformed into + + .. 
code:: + + |---------> op_m_n -> log_m_n + | / + args_kwargs_m ---------> op_m -> log_m_0 + + Where op_m_n is op_m wrapped in a submodule and transformed with + qconfig_n, and its inner graph looks like + + .. code:: + + args_m -------- op_m_prepared_with_qconfig_n -> out_m_n + / + kwargs_m --- + + This is useful for testing different quantization of multiple layers in + a single pass through the model. + + High level TODOs for future PRs: + * figure out a better way to name the output structure + * return a results data structure instead of printing it out + * add examples to docblocks + """ + + if custom_tracer is None: + tracer = quantize_fx.QuantizationTracer([], []) + else: + tracer = custom_tracer + mt = torch.fx.GraphModule(model, tracer.trace(model)) + # this is necessary to ensure logger FQNs get populated + mt._node_name_to_scope = tracer.node_name_to_scope # type: ignore[assignment] + + # run example input propagation, we need this to call prepare_fx on + # individual subgraphs + output_prop = OutputProp(mt) + output_prop.propagate(*example_inputs) + + # Find the set of subgraphs in the original graph which we need to + # consider. 
+ modules = dict(mt.named_modules(remove_duplicate=False)) + patterns = _get_pattern_to_quantize_handlers(backend_config) + root_node_getter_mapping = get_fusion_pattern_to_root_node_getter(backend_config) + standalone_module_names: list[str] = [] + standalone_module_classes: list[type] = [] + custom_module_classes: list[type] = [] + matches = _find_matches( + mt.graph, + modules, + patterns, + root_node_getter_mapping, + standalone_module_names, + standalone_module_classes, + custom_module_classes, + ) + subgraphs_dedup: dict[str, list[Node]] = _get_dedup_subgraphs(matches) + + # generate node to qconfig for each subgraph + # TODO(future PR): deduplicate repeating entries + list_of_node_name_to_qconfig: list[dict[str, QConfigAny]] = [] + for qconfig_mapping in qconfig_multi_mapping.qconfig_mappings_list: + node_name_to_qconfig = _generate_node_name_to_qconfig( + mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope + ) + list_of_node_name_to_qconfig.append(node_name_to_qconfig) + + # For each region in the model, do the following: + # For each qconfig for that region, do the following: + # 1. create a copy of the region wrapped in a module + # 2. pass original args, original kwargs, and expected output to module + # 3. add an output comparison logger and hook it up to compare + # actual output to expected output + # 4. 
run `prepare_fx` on the module + for subgraph_idx, (match_name, nodes_in_this_subgraph) in enumerate( + subgraphs_dedup.items() + ): + create_n_transformed_and_logged_copies_of_subgraph( + mt, + subgraph_idx, + match_name, + nodes_in_this_subgraph, + qconfig_multi_mapping.qconfig_mappings_list, + list_of_node_name_to_qconfig, + custom_prepare_fn, + custom_prepare_kwargs, # type: ignore[arg-type] + ) + + return mt + + +# TODO(future PR): we should rethink the names of all the PNP APIs +def _prepare_n_shadows_add_loggers_model( + model: torch.nn.Module, + example_inputs: Any, + qconfig_mapping: QConfigMapping, + backend_config: BackendConfig, +) -> torch.nn.Module: + r""" + Note: this API is not recommended for wide usage, it is only + provided for customers who need to migrate from the `add_loggers` + API. + + This creates a model which provides logging for the following + problem: if we quantize `model` with `qconfig_mapping` and feed + the same input through both models, log the comparisons of + corresponding intermediate layers. + + The problem is solved with a single model. Specifically, we + partition `model` into N subgraphs, create a copy of each relevant + subgraph, wrap it in a module, apply the quantization API to that + module, and hook up loggers to measure the comparisons. + + Example starting graph: + + x0 -> op0 -> x1 -> op1 -> x2 + + Example config: quantize op0 to int8, do nothing to op1. + The following graph will be created: + + .. code:: + + x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log + \ \ \ # noqa: W605 + ---> op0_1 -> x1_1 ----> clog -> op1_0 -> x2_1 ----> clog + + Where op0_0 is op0, op0_1 is op0 wrapped in a submodule and quantized + to int8, op1_0 is op1 (appearing in the graph twice), log is a logger, + and clog is a comparison logger. 
+ """ + + tracer = quantize_fx.QuantizationTracer([], []) + mt = torch.fx.GraphModule(model, tracer.trace(model)) + # this is necessary to ensure logger FQNs get populated + mt._node_name_to_scope = tracer.node_name_to_scope # type: ignore[assignment] + + # run example input propagation, we need this to call prepare_fx on + # individual subgraphs + output_prop = OutputProp(mt) + output_prop.propagate(*example_inputs) + + # Find the set of subgraphs in the original graph which we need to + # consider. + modules = dict(mt.named_modules(remove_duplicate=False)) + patterns = _get_pattern_to_quantize_handlers(backend_config) + root_node_getter_mapping = get_fusion_pattern_to_root_node_getter(backend_config) + standalone_module_names: list[str] = [] + standalone_module_classes: list[type] = [] + custom_module_classes: list[type] = [] + matches = _find_matches( + mt.graph, + modules, + patterns, + root_node_getter_mapping, + standalone_module_names, + standalone_module_classes, + custom_module_classes, + ) + subgraphs_dedup: dict[str, list[Node]] = _get_dedup_subgraphs(matches) + + # generate node to qconfig for each subgraph + node_name_to_qconfig = _generate_node_name_to_qconfig( + mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope + ) + + # Now, mutate the graph to be the add_loggers graph with propagation + # error. + create_add_loggers_graph(mt, subgraphs_dedup, qconfig_mapping, node_name_to_qconfig) + + return mt + + +# TODO(future PR): we should rethink the names of all the PNP APIs +def _n_shadows_compare_weights( + model: torch.nn.Module, + example_inputs: Any, + qconfig_mapping: QConfigMapping, + backend_config: BackendConfig, +) -> NSResultsType: + """ + Note: this API is not recommended for wide usage, it is only + provided for customers who need to migrate from the `add_loggers` + API. 
+ """ + qconfig_multi_mapping = QConfigMultiMapping.from_list_qconfig_mapping( + [qconfig_mapping] + ) + mp = prepare_n_shadows_model( + model, example_inputs, qconfig_multi_mapping, backend_config + ) + # passing inputs through the model is necessary to populate + # observers which observe weights with real values + mp(*example_inputs) + mq = convert_n_shadows_model(mp) + weight_comparison = extract_weight_comparison(mq) + return weight_comparison + + +# TODO(future PR): consider aligning API signature with other similar quantization +# functions (enable_fake_quant, etc) +def loggers_set_enabled(model: torch.nn.Module, enabled: bool) -> None: + """ + Sets the `enabled` setting on a `model`'s loggers + """ + for _, child in model.named_modules(): + if isinstance(child, OutputLogger): + child.enabled = enabled + + +# TODO(future PR): consider aligning API signature with other similar quantization +# functions (enable_fake_quant, etc) +def loggers_set_save_activations( + model: torch.nn.Module, + save_activations: bool, +) -> None: + """ + Sets the `save_activations` setting on a `model`'s loggers + """ + for _name, child in model.named_modules(): + if isinstance(child, OutputLogger): + child.save_activations = save_activations + + +def convert_n_shadows_model( + model: GraphModule, + custom_convert_fn: Callable | None = None, + custom_convert_kwargs: dict[str, Any] | None = None, +) -> GraphModule: + """ + Given a model from `prepare_n_shadows_model`, runs `convert_fx` + on each shadow submodule. 
+ """ + for node in model.graph.nodes: + # TODO(future PR): consider matching in a safer way than + # node name string match + if node.name.startswith(SHADOW_WRAPPER_NODE_NAME_PREFIX): + orig_mod = getattr(model, node.name) + if custom_convert_fn is None: + converted_mod = torch.ao.quantization.quantize_fx.convert_fx(orig_mod) + else: + if custom_convert_kwargs is None: + custom_convert_kwargs = {} + converted_mod = custom_convert_fn(orig_mod, **custom_convert_kwargs) + setattr(model, node.name, converted_mod) + + return model + + +def extract_results_n_shadows_model(model: torch.nn.Module) -> NSResultsType: + """ + Extracts logger results from `model`. + """ + results: NSResultsType = {} + _extract_logger_info_one_model(model, results, OutputLogger) + return results + + +def print_comparisons_n_shadows_model(results: NSResultsType) -> None: + """ + Prints a summary of extracted `results`. + """ + results_grouped = group_results_by_subgraph(results) + results_comparison = create_results_comparison(results_grouped) + print_n_shadows_summary(results_comparison) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef5330d2dfed35fc4ebce6ec88ec0448788115f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ad7877ed8be7cdcfc8a99527c6470c731c1e82c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c64e7f16784f5b3262ad614ba382218500395abf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c3b7decef5cea4527ec121108aa3618ec211075 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c36befccf69e1da2ee85ff06462303af6f112e0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d7df37b978fc4ece793d06d4b56e29f7201a722 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee739633fe41f1ccffb1b40bccbb1bdd3a526724 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee6d25c59ae9c398a1d10e3e7a910f538b2d8954 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2969564680c7c0a4de1efa03d7bc78dbd51a5045 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e15a5ad25aefa23e2ce1f558e5e3e581eb505bf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_matcher.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..4fdad3f2d9bc49094c0da3264012cc206c28ab86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_matcher.py @@ -0,0 +1,485 @@ +# mypy: allow-untyped-defs +import collections +import enum +from typing import Any + +import torch +from torch.ao.quantization import FakeQuantizeBase, ObserverBase +from torch.ao.quantization.utils import getattr_from_fqn +from torch.fx import GraphModule +from torch.fx.graph import Graph, Node + +from .mappings import get_base_name_to_sets_of_related_ops, get_unmatchable_types_map +from .ns_types import NSNodeTargetType, NSSubgraph +from .pattern_utils import ( + end_node_matches_reversed_fusion, + get_reversed_fusions, + get_type_a_related_to_b, +) + + +toq = torch.ops.quantized + + +def _get_output_nodes(g: Graph) -> list[Node]: + return [n for n in g.nodes if n.op == "output"] + + +class _NSGraphMatchableSubgraphsIterator: + """ + Iterates through the graph of gm, starting with the output nodes + and continuing backwards. + 1. Returns matchable subgraphs, in order. A subgraph is defined by + (start_node, end_node). + 2. 
def __next__(self) -> NSSubgraph:
    """
    Returns the next matchable subgraph.

    Nodes are popped from ``self.stack`` until one of them yields a
    matchable subgraph. For each popped node that has not been seen yet:

    1. The node is treated as the end node of a candidate subgraph and is
       tested against every reversed fusion pattern. On the first match,
       the start node is found by repeatedly following ``args[0]``, and
       every node stepped over is recorded in ``self.seen_nodes``.
    2. The input nodes of the start node are pushed onto the stack so the
       traversal keeps moving towards the graph inputs.
    3. The candidate is dropped if its base op node is not matchable, or
       if it is a single ``call_module`` node whose module is an observer
       or a fake-quantize module.

    Returns:
        An ``NSSubgraph`` holding the start, end and base op nodes.

    Raises:
        StopIteration: once the stack is empty.
    """
    while len(self.stack) > 0:
        cur_end_node = self.stack.pop()
        if cur_end_node in self.seen_nodes:
            continue

        # for subgraphs which are single nodes, start_node == end_node
        # for subgraphs with more than one node, start node != end_node
        cur_start_node = cur_end_node
        # Subgraphs like linear-relu have the base node as the start node.
        # Subgraphs like dequantize-linear-relu-to(torch.float16) have the
        # base node as the second node.
        # The cur_base_op_node var will move to the actual node during
        # the fusion matching later in this code block.
        cur_base_op_node = cur_end_node

        # Check for potential fusions. For now, we are greedy
        # and always skip all non-base nodes of a fusion. For example,
        # if we match linear-relu backwards, we will always skip the
        # relu node and attempt to match the linear node. This can
        # be made configurable later if needed.
        for _reverse_fusion_ops, base_op_idx in get_reversed_fusions():
            is_match = end_node_matches_reversed_fusion(
                cur_end_node, _reverse_fusion_ops, self.gm, self.seen_nodes
            )
            if is_match:
                # navigate to the base node
                for rev_fusion_idx in range(len(_reverse_fusion_ops) - 1):
                    # pyrefly: ignore [bad-argument-type]
                    self.seen_nodes.add(cur_start_node)
                    # for now, assume that there are no other nodes
                    # which need to be added to the stack
                    cur_start_node = cur_start_node.args[0]  # type: ignore[assignment]
                    # if the base op index matches the current node, set it
                    rev_base_op_idx = len(_reverse_fusion_ops) - 2 - base_op_idx
                    if rev_fusion_idx == rev_base_op_idx:
                        cur_base_op_node = cur_start_node
                # only the first matching fusion pattern is used
                break

        # pyrefly: ignore [bad-argument-type]
        self.seen_nodes.add(cur_start_node)
        # add args of previous nodes to stack
        # pyrefly: ignore [missing-attribute]
        for arg in cur_start_node.all_input_nodes:
            self._recursively_add_node_arg_to_stack(arg)

        # skip unmatchable nodes
        # note: this check is done on the base op node, i.e.
        # if we are matching linear-relu in reverse, this would do the matchable
        # check on the linear
        # pyrefly: ignore [bad-argument-type]
        if not self._is_matchable(cur_base_op_node):
            continue

        # If an observer or a fake_quant was not matched as a part of
        # a pattern of multiple nodes, ignore it. One case where this is
        # relevant is an observer on a graph input, which was added because
        # it is necessary for the next node.
        if cur_end_node.op == "call_module" and cur_start_node is cur_end_node:
            maybe_obs = getattr_from_fqn(self.gm, cur_end_node.target)  # type: ignore[arg-type]
            if isinstance(maybe_obs, (ObserverBase, FakeQuantizeBase)):
                continue

        return NSSubgraph(
            # pyrefly: ignore [bad-argument-type]
            start_node=cur_start_node,
            end_node=cur_end_node,
            # pyrefly: ignore [bad-argument-type]
            base_op_node=cur_base_op_node,
        )

    # the stack is exhausted: there are no more subgraphs to visit
    raise StopIteration
def _get_subgraph_relationship_type(
    subgraph_a: NSSubgraph,
    subgraph_b: NSSubgraph,
    gm_a: GraphModule,
    gm_b: GraphModule,
    type_a_related_to_b: set[tuple[NSNodeTargetType, NSNodeTargetType]],
) -> SubgraphTypeRelationship:
    """
    Classifies how the base ops of two subgraphs relate to each other.

    Args:
        subgraph_a: subgraph taken from ``gm_a``.
        subgraph_b: subgraph taken from ``gm_b``.
        gm_a: module used to resolve ``call_module`` targets of ``subgraph_a``.
        gm_b: module used to resolve ``call_module`` targets of ``subgraph_b``.
        type_a_related_to_b: set of ``(type_a, type_b)`` pairs which are
            known to be related.

    Returns:
        * ``EQUAL``: same known type, and both subgraphs agree on whether
          the base op is also the start node.
        * ``EQUAL_BUT_UKNOWN``: same type, but the pair is not present in
          ``type_a_related_to_b``.
        * ``RELATED_BUT_NOT_EQUAL``: the pair is known, but the types
          differ or only one subgraph has nodes before its base op.
        * ``NOT_RELATED``: everything else, including ops other than
          ``call_function`` / ``call_method`` / ``call_module``.

    Raises:
        AssertionError: for ``call_module`` subgraphs whose base op is not
            the start node, or whose target is not a string.
    """
    node_a = subgraph_a.base_op_node
    node_b = subgraph_b.base_op_node
    functional_ops = ("call_function", "call_method")

    # TODO(next): make this code handle matching by what is before the base op
    # A function may be compared against a method; any other op mismatch
    # means the subgraphs cannot be related.
    if node_a.op != node_b.op and not (
        node_a.op in functional_ops and node_b.op in functional_ops
    ):
        return SubgraphTypeRelationship.NOT_RELATED

    if node_a.op in functional_ops:
        key = (node_a.target, node_b.target)
        same_target = node_a.target == node_b.target

        if key not in type_a_related_to_b:
            if same_target:
                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
            return SubgraphTypeRelationship.NOT_RELATED

        # after this point, we are dealing with known types
        if not same_target:
            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL

        # True when nothing precedes the base op inside the subgraph.
        a_base_is_start = subgraph_a.base_op_node == subgraph_a.start_node
        b_base_is_start = subgraph_b.base_op_node == subgraph_b.start_node
        if a_base_is_start != b_base_is_start:
            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
        # TODO(future PR): check for matches start_op_node and base_op_node
        return SubgraphTypeRelationship.EQUAL

    if node_a.op == "call_module":
        if (
            subgraph_a.base_op_node != subgraph_a.start_node
            or subgraph_b.base_op_node != subgraph_b.start_node
        ):
            raise AssertionError(
                "Matching call_module patterns where base_op_node != start_node is not supported yet"
            )
        # for call_module, we need to look up the modules to do the type check
        if not isinstance(node_a.target, str):
            raise AssertionError(f"Expected str, got {type(node_a.target)}")
        mod_a = getattr_from_fqn(gm_a, node_a.target)
        if not isinstance(node_b.target, str):
            raise AssertionError(f"Expected str, got {type(node_b.target)}")
        mod_b = getattr_from_fqn(gm_b, node_b.target)

        key = (type(mod_a), type(mod_b))
        same_type = type(mod_a) is type(mod_b)

        if key not in type_a_related_to_b:
            if same_type:
                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
            return SubgraphTypeRelationship.NOT_RELATED
        if same_type:
            return SubgraphTypeRelationship.EQUAL
        return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL

    return SubgraphTypeRelationship.NOT_RELATED
some Numeric Suite APIs can be called without having all of the models in memory + + For example, let's say we need to match nodes of + + (1) ... -> linear0 -> relu0 -> ... + + And + + (2) ... -> linear_relu0 -> ... + + Without being able to inspect them together. With the current naming scheme, if + we iterate through both of these graphs in the same order, and assuming the rest + of the graphs match, both of these subgraphs will get the same name without + (1) and (2) knowing anything about each other. + """ + target_type = _get_node_target_type(subgraph_a.base_op_node, gm_a) + target_base_type = None + for base_name, sets_of_related_ops in base_name_to_sets_of_related_ops.items(): + if target_type in sets_of_related_ops: + target_base_type = base_name + target_base_name = "base_op_" + str(target_base_type) + counter = 0 + proposed_name = target_base_name + "_" + str(counter) + while proposed_name in existing_names: + counter += 1 + proposed_name = target_base_name + "_" + str(counter) + existing_names.add(proposed_name) + return proposed_name + + +def _get_node_target_type(node: Node, gm: GraphModule) -> NSNodeTargetType | None: + if node.op in ("call_function", "call_method"): + return node.target + elif node.op == "call_module": + if not isinstance(node.target, str): + raise AssertionError(f"Expected str, got {type(node.target)}") + mod = getattr_from_fqn(gm, node.target) + return type(mod) + return None + + +def get_matching_subgraph_pairs( + gm_a: GraphModule, + gm_b: GraphModule, + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None, + unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None, +) -> dict[str, tuple[NSSubgraph, NSSubgraph]]: + """ + Matches matchable subgraphs of graph_a to graph_b. + + For a node, "matchable" is defined as a node which is not an observer, + fake_quants, quant or dequant. + + A subgraph can contain one or more nodes. 
A subgraph is matchable if + at least one node inside of it is matchable. Currently, all nodes in + a subgraph must be matchable (because we assume no observers will be + inserted in the middle of a fusion). + + A subgraph is defined by (start_node, end_node). We assume that only + start_node and end_node are linked with the surrounding graph, all other + nodes in a subgraph are self-contained. + + A pair of nodes is "related" if both nodes represent the same mathematical + operation across different quantization flavors. For example, + `F.linear` and `torch.ops.quantized.linear` are related, and + `F.linear` and `torch.nn.Conv` are not related. + + For each matchable pair of nodes node_a and node_b, they will match + if node_a and node_b are related. + + For graphs A and B, they will match iff: + 1. the number of matchable subgraphs in A and B is equivalent + 2. when iterating through the matchable subgraphs of A and B in the same order, each + corresponding pair of base nodes is related. + + This enables us to find the corresponding subgraphs between + graphs of related models. 
For example, if we had two graphs such as: + + graph_a: x0 -> conv_0 (type: nn.Conv2d) -> obs_0 -> x1 + w -/ + b -/ + + graph_b: x0 -> quant_0 -> qconv_0 (type: nnq.Conv2d) -> dequant_0 -> x1 + packed_params_0 -/ + + This function will return the following result: + { + 'conv_0': ( # the name of the node in graph_b + (conv_0, conv_0), # (start_node_a, end_node_a) + (qconv_0, qconv_0), # (start_node_b, end_node_b) + ), + } + + Or, if we have a fusion pattern, + + graph_a: x0 -> linear_0 -> relu_0 -> obs_0 -> x1 + w -/ + b -/ + + graph_b: x0 -> quant_0 -> linear_relu_0 -> dequant_0 -> x1 + packed_params_0 -/ + + This function will return the following result: + { + 'linear_relu_0': ( # the name of the node in graph_b + (linear_0, relu_0), # (start_node_a, end_node_a) + (linear_relu_0, linear_relu_0), # (start_node_b, end_node_b) + ), + } + """ + if unmatchable_types_map is None: + unmatchable_types_map = get_unmatchable_types_map() + non_matchable_functions = unmatchable_types_map["funs_unmatchable"] + non_matchable_modules = unmatchable_types_map["mods_unmatchable"] + non_matchable_methods = unmatchable_types_map["meths_unmatchable"] + + graph_a_iterator = _NSGraphMatchableSubgraphsIterator( + gm_a, non_matchable_functions, non_matchable_modules, non_matchable_methods + ) + graph_b_iterator = _NSGraphMatchableSubgraphsIterator( + gm_b, non_matchable_functions, non_matchable_modules, non_matchable_methods + ) + results = collections.OrderedDict() + if base_name_to_sets_of_related_ops is None: + base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops() + type_a_related_to_b = get_type_a_related_to_b(base_name_to_sets_of_related_ops) + + existing_names_a: set[str] = set() + existing_names_b: set[str] = set() + + while True: + # fetch the next subgraphs from a and b + cur_subgraph_a, cur_subgraph_b = None, None + try: + cur_subgraph_a = next(graph_a_iterator) + except StopIteration: + pass + try: + cur_subgraph_b = next(graph_b_iterator) + except 
StopIteration: + pass + + # look up types of a and b for useful error messages + type_start_a, type_start_b = None, None + if cur_subgraph_a is not None: + type_start_a = _get_node_target_type(cur_subgraph_a.start_node, gm_a) + if cur_subgraph_b is not None: + type_start_b = _get_node_target_type(cur_subgraph_b.start_node, gm_b) + + # check for results and determine what to do next + if cur_subgraph_a is not None and cur_subgraph_b is not None: + # both nodes were fetched, check for subgraph_relationship + # note: subgraph_relationship is checked on the start node, i.e. + # if a linear-relu pattern is checked, we would check for subgraph_relationship + # of the linear + subgraph_relationship = _get_subgraph_relationship_type( + cur_subgraph_a, cur_subgraph_b, gm_a, gm_b, type_a_related_to_b + ) + if subgraph_relationship == SubgraphTypeRelationship.NOT_RELATED: + msg = f""" +The subgraphs +({cur_subgraph_a}, {type_start_a}) and +({cur_subgraph_b}, {type_start_b}) +are not related. Please ensure that the two models you pass in have the same number +of subgraphs, and each pair of subgraphs is related to each other.""" + raise GraphMatchingException(msg) + elif subgraph_relationship == SubgraphTypeRelationship.EQUAL_BUT_UKNOWN: + # skip matching but unknown types + continue + key_name_a = _get_name_for_subgraph( + cur_subgraph_a, gm_a, base_name_to_sets_of_related_ops, existing_names_a + ) + key_name_b = _get_name_for_subgraph( + cur_subgraph_b, gm_b, base_name_to_sets_of_related_ops, existing_names_b + ) + if key_name_a != key_name_b: + raise AssertionError( + f"Subgraph names {key_name_a} and {key_name_b} do not match" + ) + results[key_name_a] = (cur_subgraph_a, cur_subgraph_b) + continue + elif cur_subgraph_a is None and cur_subgraph_b is None: + # we reached the end of both graphs + break + else: + # only one node was fetched, no match possible, throw error + msg = f""" +Attempting to match +({cur_subgraph_a}, {type_start_a}) and +({cur_subgraph_b}, 
{type_start_b}), +one of which is empty. Please ensure that the two models you pass in have the same number +of subgraphs.""" + raise GraphMatchingException(msg) + + # The subgraph pairs are originally created by traversing the two graphs + # from the outputs to the inputs. Reverse the results to return the + # subgraphs in their order of execution. + results = collections.OrderedDict(reversed(results.items())) + + # pyrefly: ignore [bad-return] + return results diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_passes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_passes.py new file mode 100644 index 0000000000000000000000000000000000000000..338db28ce41d96ec5d3de38591f5937543d65394 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_passes.py @@ -0,0 +1,1155 @@ +# mypy: allow-untyped-defs +from collections.abc import Callable +from typing import Any + +import torch +from torch.ao.ns.fx.mappings import get_node_type_to_io_type_map +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix +from torch.ao.quantization.observer import _is_activation_post_process +from torch.fx import GraphModule, map_arg +from torch.fx.graph import Graph, Node + +from .ns_types import NSNodeTargetType, NSSingleResultValuesType, NSSubgraph +from .utils import ( + get_arg_indices_of_inputs_to_log, + get_node_first_input_and_output_type, + get_node_input_qparams, + get_normalized_nth_input, + get_number_of_non_param_args, + get_target_type_str, + getattr_from_fqn, + NodeInputOrOutputType, + op_type_supports_shadowing, + return_first_non_observer_node, +) + + +def _maybe_get_fqn(node: Node, gm: GraphModule) -> str | None: + fqn = None + if hasattr(gm, "_node_name_to_scope"): + # fqn on observers is not present, because they do not + # exist when the fqns are created during tracing. If this is + # an observer, get the fqn of the node being observed. 
def _insert_logger_after_node(
    node: Node,
    gm: GraphModule,
    logger_cls: Callable,
    logger_node_name_suffix: str,
    ref_node_name: str,
    model_name: str,
    ref_name: str,
    ref_node_target_type: str,
    results_type: str,
    index_within_arg: int,
    index_of_arg: int,
    fqn: str | None,
) -> Node:
    """
    Attaches a new logger module to ``gm`` and wires it to consume ``node``.

    Starting from

      prev_node -> node -> next_node

    the logger node is created with ``node`` as its only input:

      prev_node -> node -> logger_obj

    Re-pointing ``next_node`` to the returned logger node is left to the
    caller.

    Args:
        node: node whose value should be logged.
        gm: module which receives the logger as a new attribute.
        logger_cls: callable which builds the logger; it is invoked with ten
            positional arguments (see the call below).
        logger_node_name_suffix: appended to ``node.name`` to form the
            prefix of the logger attribute name.
        ref_node_name, model_name, ref_name, ref_node_target_type,
        results_type, index_within_arg, index_of_arg, fqn: forwarded
            unchanged to ``logger_cls``.

    Returns:
        The newly created ``call_module`` node of the logger.
    """
    # pick an attribute name which is not yet taken on gm
    name_factory = get_new_attr_name_with_prefix(node.name + logger_node_name_suffix)
    logger_attr_name = name_factory(gm)

    logger_args = (
        ref_node_name,
        node.name,
        model_name,
        ref_name,
        get_target_type_str(node, gm),
        ref_node_target_type,
        results_type,
        index_within_arg,
        index_of_arg,
        fqn,
    )
    # register the logger on the parent module so the new node can call it
    setattr(gm, logger_attr_name, logger_cls(*logger_args))

    return node.graph.create_node("call_module", logger_attr_name, (node,), {})
+ """ + + new_graph = Graph() + env: dict[str, Any] = {} + + def load_arg(a): + return map_arg(a, lambda node: env[node.name]) + + for node in gm.graph.nodes: + if node.op == "output": + new_graph.output(map_arg(get_normalized_nth_input(node, gm, 0), load_arg)) + continue + + if (node in node_to_instrument_inputs_to_ref_node_name) or ( + node in node_to_instrument_outputs_to_ref_node_name + ): + fqn = _maybe_get_fqn(node, gm) + + if node in node_to_instrument_inputs_to_ref_node_name: + ref_name, ref_node_type = node_to_instrument_inputs_to_ref_node_name[ + node + ] + # Ops such add and mul are special because either + # one or two of the first two arguments can be tensors, + # and if one argument is a tensor it can be first or + # second (x + 1 versus 1 + x). + arg_indices_to_log = get_arg_indices_of_inputs_to_log(node) + for node_arg_idx in arg_indices_to_log: + node_arg = get_normalized_nth_input(node, gm, node_arg_idx) + if type(node_arg) is Node: + # create a single input logger + prev_node = env[node_arg.name] + env[node_arg.name] = _insert_logger_after_node( + prev_node, + gm, + logger_cls, + "_ns_logger_", + node.name, + model_name, + ref_name, + ref_node_type, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=0, + index_of_arg=node_arg_idx, + fqn=fqn, + ) + elif ( + type(node_arg) is torch.fx.immutable_collections.immutable_list + ): + # create N input loggers, one for each node + for arg_idx, arg in enumerate(node_arg): # type: ignore[var-annotated, arg-type] + prev_node = env[arg.name] + env[prev_node.name] = _insert_logger_after_node( + prev_node, + gm, + logger_cls, + "_ns_logger_", + node.name, + model_name, + ref_name, + ref_node_type, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=arg_idx, + index_of_arg=node_arg_idx, + fqn=fqn, + ) + + # ensure env is populated with base node + # Note: runs for both inputs and outputs + env[node.name] = new_graph.node_copy(node, load_arg) + + if node in 
def _insert_dtype_cast_after_node(
    node_a: Node,
    node_c: Node,
    prev_node_c: Node | list[Node],
    gm_a: GraphModule,
    gm_b: GraphModule,
    graph_c: Graph,
    node_name_prefix: str,
    logger_cls: Callable,
    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]],
) -> Node | list[Node]:
    """
    Given a starting graph C (derived from graph B) of

    ... -> prev_node_c -> node_c -> ...

    And a corresponding related node_a, inserts the correct dtype
    cast node after prev_node_c to cast into the dtype expected
    by node_a, resulting in:

                          dtype_cast
                        /
    ... -> prev_node_c -> node_c -> ...

    For example, if node_c is an int8 op and node_a is an fp32 op, this function
    will insert a dequant.

    The cast is chosen from the first-input dtypes of ``node_a`` and
    ``node_c``:

    * a is fp32, c is int8 / fp16 / fp32-or-int8 -> ``torch.dequantize``
    * a and c agree (and are known)            -> ``torch.nn.Identity``
    * a is int8, c is fp32                     -> ``torch.quantize_per_tensor``
      using the input qparams of ``node_a`` (when they can be found)
    * a is fp16, c is fp32                     -> ``.to(torch.float16)``

    Args:
        node_a: node of graph A which will shadow ``node_c``.
        node_c: node of graph C being shadowed.
        prev_node_c: the input of ``node_c``, either a node or a list of nodes.
        gm_a: module owning ``node_a``.
        gm_b: module which receives any new attributes (cast modules, qparams).
        graph_c: graph in which the cast nodes are created.
        node_name_prefix: prefix for the names of the new nodes/attributes.
        logger_cls: forwarded to ``get_node_first_input_and_output_type``.
        node_type_to_io_type_map: forwarded to the dtype lookup helpers.

    Returns:
        The new cast node, or a list with one cast node per element if
        ``prev_node_c`` is a list.

    Raises:
        AssertionError: if the dtype combination is not implemented, if no
            cast could be selected, or if ``prev_node_c`` is neither a
            ``Node`` nor a ``list``.
    """
    dtype_cast_op = None
    dtype_cast_mod_cls = None
    dtype_cast_method = None
    dtype_cast_method_dtype = None
    dtype_cast_scale = None
    dtype_cast_zero_point = None
    node_input_type_a, _node_output_type_a = get_node_first_input_and_output_type(
        node_a, gm_a, logger_cls, node_type_to_io_type_map
    )
    node_input_type_c, _node_output_type_c = get_node_first_input_and_output_type(
        node_c, gm_b, logger_cls, node_type_to_io_type_map
    )

    # TODO(future PR): determine the actual dtype of node_c,
    # the current code only works because dequantize works with
    # multiple input dtypes.
    dequantizable_input_types = (
        NodeInputOrOutputType.INT8,
        NodeInputOrOutputType.FP16,
        NodeInputOrOutputType.FP32_OR_INT8,
    )

    if (
        node_input_type_a == NodeInputOrOutputType.FP32
        and node_input_type_c in dequantizable_input_types
    ):
        dtype_cast_op = torch.dequantize
    elif (
        node_input_type_a == node_input_type_c
        and node_input_type_a != NodeInputOrOutputType.UNKNOWN
    ):
        dtype_cast_mod_cls = torch.nn.Identity
    elif (
        node_input_type_a == NodeInputOrOutputType.INT8
        and node_input_type_c == NodeInputOrOutputType.FP32
    ):
        # int8 shadows fp32, the dtype cast needs to quantize to int8
        # with the right qparams.
        node_a_input_qparams = get_node_input_qparams(
            node_a, gm_a, node_type_to_io_type_map
        )
        if node_a_input_qparams is not None:
            dtype_cast_op = torch.quantize_per_tensor  # type: ignore[assignment]
            dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams
    elif (
        node_input_type_a == NodeInputOrOutputType.FP16
        and node_input_type_c == NodeInputOrOutputType.FP32
    ):
        dtype_cast_method = "to"
        dtype_cast_method_dtype = torch.float16
    else:
        raise AssertionError(
            f"dtype cast from {node_input_type_c} {node_c.format_node()} to "
            + f"{node_input_type_a} {node_a.format_node()} needs to be implemented"
        )

    if isinstance(prev_node_c, Node):
        new_dtype_cast_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
        if dtype_cast_op:
            if dtype_cast_scale is not None and dtype_cast_zero_point is not None:
                return _insert_quantize_per_tensor_node(
                    prev_node_c,
                    node_a,
                    gm_b,
                    graph_c,
                    dtype_cast_scale,
                    dtype_cast_zero_point,
                    new_dtype_cast_name,
                )
            else:
                return graph_c.create_node(
                    "call_function",
                    dtype_cast_op,
                    (prev_node_c,),
                    {},
                    new_dtype_cast_name,
                )
        elif dtype_cast_method:
            return graph_c.create_node(
                "call_method",
                dtype_cast_method,
                (prev_node_c, dtype_cast_method_dtype),
                {},
                new_dtype_cast_name,
            )
        else:
            if not dtype_cast_mod_cls:
                raise AssertionError("Expected dtype_cast_mod_cls to be not None")
            dtype_cast_mod = dtype_cast_mod_cls()
            setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
            return graph_c.create_node(
                "call_module",
                new_dtype_cast_name,
                (prev_node_c,),
                {},
                new_dtype_cast_name,
            )
    elif isinstance(prev_node_c, list):
        results = []
        for prev_node_c_inner in prev_node_c:
            # each element gets its own cast node with its own unique name
            new_dtype_cast_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
            if dtype_cast_op:
                # TODO(future PR): add handling for quantize_per_tensor
                new_dtype_cast_node = graph_c.create_node(
                    "call_function",
                    dtype_cast_op,
                    (prev_node_c_inner,),
                    {},
                    new_dtype_cast_name,
                )
                results.append(new_dtype_cast_node)
            else:
                if not dtype_cast_mod_cls:
                    raise AssertionError("Expected dtype_cast_mod_cls to be not None")
                dtype_cast_mod = dtype_cast_mod_cls()
                setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
                new_dtype_cast_node = graph_c.create_node(
                    "call_module",
                    new_dtype_cast_name,
                    (prev_node_c_inner,),
                    {},
                    new_dtype_cast_name,
                )
                results.append(new_dtype_cast_node)
        return results
    else:
        # the message previously contained a stray "f" in front of the type
        raise AssertionError(f"type {type(prev_node_c)} is not handled")
def _can_insert_copy_of_subgraph_a(
    subgraph_a: NSSubgraph,
    gm_a: GraphModule,
    num_non_param_args_node_a: int,
) -> bool:
    """
    This function returns `False` if the input subgraph cannot be copied by
    `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means
    that there is a corner case logic for which copy is not yet implemented.

    Args:
        subgraph_a: subgraph to check; its nodes are collected by following
            the first input from ``end_node`` back to ``start_node``.
        gm_a: module owning ``subgraph_a``.
        num_non_param_args_node_a: number of non-param inputs of the first
            node of the subgraph. Every later node is treated as having
            exactly one.

    Returns:
        ``True`` if every argument which would have to be copied is
        supported by the copy logic, ``False`` otherwise.
    """
    # populate the list of nodes we need to check, ordered from start to end
    nodes = []
    cur_node = subgraph_a.end_node
    while cur_node != subgraph_a.start_node:
        nodes.append(cur_node)
        cur_node = get_normalized_nth_input(cur_node, gm_a, 0)  # type: ignore[assignment]
    nodes.append(cur_node)
    nodes.reverse()

    def _can_insert(node_a_arg, gm_a):
        if isinstance(node_a_arg, Node):
            # observers are skipped over by the copy logic as well
            arg_a = return_first_non_observer_node(node_a_arg, gm_a)
            if arg_a.op == "call_method":
                return arg_a.target in ("dequantize", "to")
            return arg_a.op == "get_attr"
        if isinstance(node_a_arg, (list, tuple)):
            # The copy step passes lists of plain values through unchanged,
            # but raises on a Node nested inside a list. The previous check
            # was inverted: it accepted lists of Nodes and rejected lists
            # of plain values.
            return not any(isinstance(el, Node) for el in node_a_arg)
        return True

    # For each node, check if we handle the copy behavior. This follows the
    # logic in `_insert_copy_of_subgraph_a_after_input_node_c`.
    for node_a in nodes:
        local_num_non_param_args_node_a = (
            num_non_param_args_node_a if node_a is nodes[0] else 1
        )

        norm_args_kwargs = node_a.normalized_arguments(
            gm_a, normalize_to_only_use_kwargs=True
        )
        if norm_args_kwargs is not None:
            norm_args, norm_kwargs = norm_args_kwargs
        else:
            norm_args, norm_kwargs = node_a.args, node_a.kwargs

        # positions are counted across args first, then kwargs
        all_values = [*norm_args, *norm_kwargs.values()]
        for position, value in enumerate(all_values):
            if position == 0:
                # first non-param input, stitched from the base graph
                continue
            if position == 1 and local_num_non_param_args_node_a == 2:
                # second non-param input, stitched from the base graph
                continue
            if not _can_insert(value, gm_a):
                return False

    return True
def _insert_copy_of_node_a_after_input_node_c(
    input_node_c: Node | list[Node],
    input_node_c_2: Node | list[Node] | None,
    node_a: Node,
    gm_a: GraphModule,
    gm_b: GraphModule,
    node_name_prefix: str,
) -> Node:
    """
    Assume that node_a from graph_a has
      args (input, (input2)?, arg1, ...), and
      kwargs {kw0: kwarg0, ...}

    Note: input2 is optional. If it equals to None, we assume that the op
    has a single non-param input.  If it is specified, we assume that the op
    has two non-param inputs.

    Copies the underlying values of arg1..argn and kwarg0..kwargn into gm_b,
    and creates the corresponding nodes in graph_c. Note: observers are ignored,
    so if an arg is an observer we navigate up until we find a non-observer parent.

    If node_a is a call_module, points the module pointed to by node_a to gm_b.

    Creates the copy of node_a in graph_c, with input as the first arg,
    and all other args and kwargs pointing to the copies of the objects
    in gm_b created above.

    An example in pictures:

    graph A:
    ========

    input -------------> node_a
                         / / /
    (input_2)?----------/ / /
                         / /
    weight -> weight_obs  /
                         /
    bias ----------------

    graph C (derived from B):
    =========================

    input_node_c --> node_a_copy
                     / / /
    (input_node_c_2)? / /
                     / /
    weight_copy ----/ /
                     /
    bias_copy ------/

    Args:
        input_node_c: node (or list of nodes) of graph C used as the first input.
        input_node_c_2: optional second non-param input from graph C.
        node_a: node of graph A to copy.
        gm_a: module owning ``node_a``.
        gm_b: module which receives the copied attributes and modules.
        node_name_prefix: prefix used for the new node / attribute names.

    Returns:
        The node created in graph C.

    Raises:
        AssertionError: if ``input_node_c`` is neither a ``Node`` nor a
            ``list``, if an argument has an unsupported type, if a list
            argument contains a ``Node``, or if ``node_a.op`` is not one of
            ``call_module`` / ``call_function`` / ``call_method``.
    """
    if isinstance(input_node_c, Node):
        graph_c = input_node_c.graph
    else:
        if not isinstance(input_node_c, list):
            raise AssertionError(f"Expected list, got {type(input_node_c)}")
        graph_c = input_node_c[0].graph

    norm_args_kwargs = node_a.normalized_arguments(
        gm_a, normalize_to_only_use_kwargs=True
    )
    if norm_args_kwargs is not None:
        norm_args, norm_kwargs = norm_args_kwargs
    else:
        norm_args, norm_kwargs = node_a.args, node_a.kwargs

    def _copy_arg(arg):
        # copy the other inputs from the other graph
        if isinstance(arg, Node):
            arg = return_first_non_observer_node(arg, gm_a)
            return _copy_node_from_a_to_c(arg, gm_a, gm_b, graph_c)
        if isinstance(arg, (int, float, torch.dtype)):
            return arg
        # This used to inspect the enclosing loop variable `kwarg_val`
        # instead of `arg`, which is unbound while positional args are
        # being processed.
        if isinstance(arg, (list, tuple)):
            for el in arg:
                if isinstance(el, Node):
                    raise AssertionError(
                        "handling of Node inside list is not implemented"
                    )
            return arg
        raise AssertionError(
            f"handling for kwarg of type {type(arg)} is not implemented"
        )

    def _stitch(position, value):
        # The first (and optionally the second) value comes from graph C,
        # everything else is copied over from graph A.
        if position == 0:
            return input_node_c
        if position == 1 and input_node_c_2 is not None:
            return input_node_c_2
        return _copy_arg(value)

    # positions are counted across args first, then kwargs
    new_args = tuple(
        _stitch(position, value) for position, value in enumerate(norm_args)
    )
    new_kwargs = {
        kwarg_name: _stitch(len(norm_args) + offset, kwarg_val)
        for offset, (kwarg_name, kwarg_val) in enumerate(norm_kwargs.items())
    }

    node_a_shadows_c_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)

    if node_a.op == "call_module":
        # if target is a module, we point to the module from gm_b
        new_mod_copy_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
        # fetch the corresponding module from gm_a
        if not isinstance(node_a.target, str):
            raise AssertionError(f"Expected str, got {type(node_a.target)}")
        mod_a = getattr_from_fqn(gm_a, node_a.target)
        setattr(gm_b, new_mod_copy_name, mod_a)
        return graph_c.create_node(
            node_a.op,
            new_mod_copy_name,
            new_args,  # type: ignore[arg-type]
            new_kwargs,  # type: ignore[arg-type]
            node_a_shadows_c_name,
        )

    if node_a.op not in ("call_function", "call_method"):
        raise AssertionError(f"Unexpected op: {node_a.op}")
    return graph_c.create_node(
        node_a.op,
        node_a.target,
        new_args,  # type: ignore[arg-type]
        new_kwargs,  # type: ignore[arg-type]
        node_a_shadows_c_name,
    )
def create_a_shadows_b(
    name_a: str,
    gm_a: GraphModule,
    name_b: str,
    gm_b: GraphModule,
    matched_subgraph_pairs: dict[str, tuple[NSSubgraph, NSSubgraph]],
    logger_cls: Callable,
    should_log_inputs: bool,
    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]] | None = None,
) -> GraphModule:
    """
    Creates a new GraphModule consisting of the graph of C, with the meaningful
    nodes of A shadowing the corresponding nodes of B.  For example,

    Graph A:
    a0 -> op0_fp32 -> a1 -> op1_fp32 -> a2

    Graph B:
    b0 -> op0_int8 -> b1 -> op1_int8 -> b2

    matched_node_pairs: {'op0': (op0_fp32, op0_int8), 'op1': (op1_fp32, op1_int8)}

    Graph C (A shadows B):

        / dequant0 -> op0_fp32 -> logger_a_0  / dequant_1 -> op1_fp32 -> logger_a_1
       /                                     /
    b0 -------------> op0_int8 -> logger_b_0 --------------> op1_int8 -> logger_b_1

    In a nutshell, this function does the following for each node pair:
    * copies the necessary attributes and modules from gm_a to gm_b,
      keeping names unique
    * adds a dtype cast op (dequant, quant, etc)
    * adds a copy of node_a in gm_b's graph
    * adds loggers to the outputs of node_a and node_b

    Args:
        name_a: model name passed to the loggers attached to copies of A.
        gm_a: module whose matched subgraphs are copied in as shadows.
        name_b: model name passed to the loggers attached to nodes of B.
        gm_b: base module; every node of its graph is copied into graph C.
        matched_subgraph_pairs: maps a match name to ``(subgraph_a, subgraph_b)``.
        logger_cls: logger class handed to the logger-insertion helpers.
        should_log_inputs: if True, loggers are also added on subgraph inputs.
        node_type_to_io_type_map: optional override; when None it is obtained
            from ``get_node_type_to_io_type_map()``.

    Returns:
        ``GraphModule(gm_b, graph_c)``.

    Matched nodes that cannot be shadowed (unsupported op type, unknown IO
    dtype, unknown input qparams, unhandled subgraph shape) are reported with
    ``print`` and copied over unchanged.
    """

    if node_type_to_io_type_map is None:
        node_type_to_io_type_map = get_node_type_to_io_type_map()

    # graph_c is the graph created from copying the nodes of graph_b and inserting
    # the shadows with the nodes copied from graph_a
    graph_c = Graph()
    # maps a node name to the node in graph_c which currently stands in for it
    env_c: dict[str, Any] = {}

    def load_arg(a):
        return map_arg(a, lambda node: env_c[node.name])

    # index the matches by the start node and by the end node of subgraph_b, so
    # the main loop below can recognise both ends while walking gm_b's nodes
    start_node_b_to_matched_subgraph_a_and_name = {}
    end_node_b_to_matched_subgraph_a_and_name = {}
    for match_name, match in matched_subgraph_pairs.items():
        subgraph_a, subgraph_b = match
        ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a)
        ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b)
        start_node_b_to_matched_subgraph_a_and_name[subgraph_b.start_node] = (
            subgraph_a,
            match_name,
            ref_node_type_a,
            ref_node_type_b,
        )
        end_node_b_to_matched_subgraph_a_and_name[subgraph_b.end_node] = (
            subgraph_a,
            match_name,
            ref_node_type_a,
            ref_node_type_b,
        )

    for node_b in gm_b.graph.nodes:
        if node_b.op == "output":
            graph_c.output(map_arg(node_b.args[0], load_arg))
            continue

        # calculate the flags to determine what to do with this node
        node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name
        node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name

        if node_b_is_start_node or node_b_is_end_node:
            if node_b_is_start_node:
                (
                    subgraph_a,
                    ref_name,
                    ref_node_type_a,
                    ref_node_type_b,
                ) = start_node_b_to_matched_subgraph_a_and_name[node_b]
            else:
                if not node_b_is_end_node:
                    raise AssertionError("Expected node_b_is_end_node to be not false")
                (
                    subgraph_a,
                    ref_name,
                    ref_node_type_a,
                    ref_node_type_b,
                ) = end_node_b_to_matched_subgraph_a_and_name[node_b]

            all_op_types_support_shadowing = op_type_supports_shadowing(
                subgraph_a.start_node
            ) and op_type_supports_shadowing(node_b)
            if not all_op_types_support_shadowing:
                print(
                    f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
                    + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
                    + ", unsupported"
                )
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                continue

            # For both start_node and end_node verify that we know how to do
            # the dtype cast. If we do not, skip.
            (
                node_input_type_a,
                node_output_type_a,
            ) = get_node_first_input_and_output_type(
                subgraph_a.start_node, gm_a, logger_cls, node_type_to_io_type_map
            )
            (
                node_input_type_b,
                node_output_type_b,
            ) = get_node_first_input_and_output_type(
                node_b, gm_b, logger_cls, node_type_to_io_type_map
            )
            node_io_types_known_a_and_b = (
                node_input_type_a != NodeInputOrOutputType.UNKNOWN
                and node_output_type_a != NodeInputOrOutputType.UNKNOWN
                and node_input_type_b != NodeInputOrOutputType.UNKNOWN
                and node_output_type_b != NodeInputOrOutputType.UNKNOWN
            )
            if not node_io_types_known_a_and_b:
                print(
                    f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
                    + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
                    + ", unknown dtype cast"
                )
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                continue

            # If we are shadowing from fp32 to int8, we need to insert
            # quantize_per_tensor call with qparams from the previous node.
            # Only do this if we are able to infer these qparams from the graph.
            if (
                node_input_type_a == NodeInputOrOutputType.INT8
                and node_input_type_b == NodeInputOrOutputType.FP32
            ):
                node_a_input_qparams = get_node_input_qparams(
                    subgraph_a.start_node, gm_a, node_type_to_io_type_map
                )
                if not node_a_input_qparams:
                    print(
                        f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
                        + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
                        + ", unknown input qparams"
                    )
                    env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                    continue

            num_non_param_args_node_a = get_number_of_non_param_args(
                subgraph_a.start_node, gm_a
            )
            if not _can_insert_copy_of_subgraph_a(
                subgraph_a, gm_a, num_non_param_args_node_a
            ):
                print(
                    f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
                    + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
                    + ", unhandled logic in subgraph copy"
                )
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                continue

            fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a)
            # NOTE(review): `subgraph_b` is not looked up for `node_b` here; it is
            # whatever the last iteration of the `matched_subgraph_pairs` loop
            # above left bound. With several matches this may be the FQN of a
            # different subgraph - confirm whether that is intended.
            fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b)  # type: ignore[possibly-undefined]

            if node_b_is_start_node:
                # if necessary, log the input of node_c
                if should_log_inputs:
                    prev_node_b = get_normalized_nth_input(node_b, gm_b, 0)
                    if isinstance(prev_node_b, Node):
                        prev_node_c = env_c[prev_node_b.name]
                        env_c[prev_node_c.name] = _insert_logger_after_node(
                            prev_node_c,
                            gm_b,
                            logger_cls,
                            "_ns_logger_b_inp_",
                            node_b.name,
                            name_b,
                            ref_name,
                            ref_node_type_b,
                            NSSingleResultValuesType.NODE_INPUT.value,
                            index_within_arg=0,
                            index_of_arg=0,
                            fqn=fqn_base_b,
                        )
                    elif isinstance(prev_node_b, list):
                        # first, save the prev_node instances, because they
                        # will be overwritten in the env after the first logger
                        # is added
                        prev_node_c_list = [env_c[arg.name] for arg in prev_node_b]

                        for arg_idx, prev_node_c in enumerate(prev_node_c_list):
                            env_c[prev_node_c.name] = _insert_logger_after_node(
                                prev_node_c,
                                gm_b,
                                logger_cls,
                                "_ns_logger_b_inp_",
                                node_b.name,
                                name_b,
                                ref_name,
                                ref_node_type_b,
                                NSSingleResultValuesType.NODE_INPUT.value,
                                index_within_arg=arg_idx,
                                index_of_arg=0,
                                fqn=fqn_base_b,
                            )
                    else:
                        # logging of inputs which are not lists is not supported yet
                        raise AssertionError(
                            f"type {type(prev_node_b)} is not handled yet"
                        )
                # subgraph so far:
                #
                # (prev_node_c)+ -> (logger_c_input)?

            # Note: this if statement is always True, spelling it out to clarify code
            # intent.
            if node_b_is_start_node or node_b_is_end_node:
                # ensure env_c is populated with base node
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                node_c = env_c[node_b.name]

                # after this point,
                #
                # node_a is the original node from graph_a, with parent module gm_a
                # node_b is the original node from graph_b, with parent module gm_b
                # node_c is the copy of node_b in graph_c
                #
                # subgraph so far:
                #
                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

            if node_b_is_start_node:
                # cast dtype from the dtype of node_c's input to the dtype of
                # node_a's input (dequant, etc)
                # prev_node_c = node_c.args[0]
                prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)  # type: ignore[possibly-undefined]
                if should_log_inputs:
                    # skip the input logger when inserting a dtype cast
                    # NOTE(review): in the Node case this repeats the lookup made
                    # just above with the same arguments, so the value does not
                    # change - confirm whether `prev_node_c` was meant to be
                    # passed instead of `node_c`.
                    if isinstance(prev_node_c, Node):
                        # pyrefly: ignore [unbound-name]
                        prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)
                    elif isinstance(prev_node_c, list):
                        prev_node_c = [
                            get_normalized_nth_input(arg, gm_b, 0)
                            for arg in prev_node_c
                        ]
                dtype_cast_node = _insert_dtype_cast_after_node(
                    subgraph_a.start_node,
                    # pyrefly: ignore [unbound-name]
                    node_c,
                    prev_node_c,
                    gm_a,
                    gm_b,
                    graph_c,
                    node_b.name + "_dtype_cast_",
                    logger_cls,
                    node_type_to_io_type_map,
                )
                # note: not inserting to env_c because all nodes which use the dtype
                #   casts are copied from graph_a
                #
                # subgraph so far:
                #
                #           (dtype_cast_node)+
                #                  /
                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

                # if input logging is enabled, log the input to the subgraph
                if should_log_inputs:
                    # The reference node name is left empty here and filled in
                    # further below, once the subgraph copy exists.
                    ref_node_name = ""
                    if isinstance(dtype_cast_node, Node):
                        dtype_cast_node = _insert_logger_after_node(
                            dtype_cast_node,
                            gm_b,
                            logger_cls,
                            "_ns_logger_a_inp_",
                            ref_node_name,
                            name_a,
                            ref_name,
                            ref_node_type_a,
                            NSSingleResultValuesType.NODE_INPUT.value,
                            index_within_arg=0,
                            index_of_arg=0,
                            fqn=fqn_base_a,
                        )
                        input_logger: Node | list[Node] = dtype_cast_node
                    else:
                        if not isinstance(dtype_cast_node, list):
                            raise AssertionError(
                                f"Expected list, got {type(dtype_cast_node)}"
                            )
                        new_loggers = []
                        for dtype_cast_idx, dtype_cast_node_inner in enumerate(
                            dtype_cast_node
                        ):
                            dtype_cast_logger = _insert_logger_after_node(
                                dtype_cast_node_inner,
                                gm_b,
                                logger_cls,
                                "_ns_logger_a_inp_",
                                ref_node_name,
                                name_a,
                                ref_name,
                                ref_node_type_a,
                                NSSingleResultValuesType.NODE_INPUT.value,
                                index_within_arg=dtype_cast_idx,
                                index_of_arg=0,
                                fqn=fqn_base_a,
                            )
                            new_loggers.append(dtype_cast_logger)
                        dtype_cast_node = new_loggers
                        input_logger = dtype_cast_node
                    # subgraph so far:
                    #
                    #       (dtype_cast_node)+ -> (logger_a_input)?
                    #                  /
                    # prev_node_c -> (logger_c_input)? -> node_start_c

                # hook up the new mod_a copy to be in the graph, receiving the
                # same inputs as mod_b does, with dtype cast to match a
                # Some ops, such as LSTMs, have two non-param inputs. If we have
                # such an op, pass the second param as well. Note: dtype casting
                # for the second param is not implemented yet, it can be added
                # later if there is a use case.
                node_c_second_non_param_arg = None
                num_non_param_args_node_a = get_number_of_non_param_args(
                    subgraph_a.start_node, gm_a
                )
                if num_non_param_args_node_a == 2:
                    # node_c_second_non_param_arg = node_c.args[1]
                    node_c_second_non_param_arg = get_normalized_nth_input(
                        # pyrefly: ignore [unbound-name]
                        node_c,
                        gm_b,
                        1,
                    )
                node_a_shadows_c = _insert_copy_of_subgraph_a_after_input_node_c(
                    dtype_cast_node,
                    node_c_second_non_param_arg,
                    subgraph_a,
                    gm_a,
                    gm_b,
                    # pyrefly: ignore [unbound-name]
                    node_c.name + "_shadow_copy_",
                )
                env_c[node_a_shadows_c.name] = node_a_shadows_c
                # subgraph so far:
                #
                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy(args/kwargs not shown)
                #                  /
                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

                if should_log_inputs:
                    # When we created the input logger, we left the ref_node_name
                    # as an empty string, because the subgraph copy did not exist
                    # yet. Now that the subgraph copy exists, we modify this name
                    # to its true value.
                    # Note: the alternative to this is to create the input logger
                    #   after creating the subgraph, which is slightly more
                    #   complicated. This is the lesser of two evils.
                    # input_logger = env_c[dtype_cast_node.name]
                    # Find the first node in the subgraph
                    cur_node = node_a_shadows_c
                    while get_normalized_nth_input(cur_node, gm_b, 0) != input_logger:  # type: ignore[possibly-undefined]
                        cur_node = get_normalized_nth_input(cur_node, gm_b, 0)  # type: ignore[assignment]
                    # pyrefly: ignore [unbound-name]
                    if isinstance(input_logger, Node):
                        # pyrefly: ignore [unbound-name]
                        input_logger_mod = getattr(gm_b, input_logger.name)
                        input_logger_mod.ref_node_name = cur_node.name
                    else:
                        # pyrefly: ignore [unbound-name]
                        if not isinstance(input_logger, list):
                            raise AssertionError(
                                # pyrefly: ignore [unbound-name]
                                f"Expected list, got {type(input_logger)}"
                            )
                        # pyrefly: ignore [unbound-name]
                        for input_logger_inner in input_logger:
                            input_logger_mod = getattr(gm_b, input_logger_inner.name)
                            input_logger_mod.ref_node_name = cur_node.name

                # hook up a logger to the mod_a copy
                env_c[node_a_shadows_c.name] = _insert_logger_after_node(
                    env_c[node_a_shadows_c.name],
                    gm_b,
                    logger_cls,
                    "_ns_logger_a_",
                    node_a_shadows_c.name,
                    name_a,
                    ref_name,
                    ref_node_type_a,
                    NSSingleResultValuesType.NODE_OUTPUT.value,
                    index_within_arg=0,
                    index_of_arg=0,
                    fqn=fqn_base_a,
                )
                # subgraph so far:
                #
                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
                #                  /
                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

            if node_b_is_end_node:
                # hook up a logger to the mod_b copy
                env_c[node_b.name] = _insert_logger_after_node(
                    env_c[node_b.name],
                    gm_b,
                    logger_cls,
                    "_ns_logger_b_",
                    node_b.name,
                    name_b,
                    ref_name,
                    ref_node_type_b,
                    NSSingleResultValuesType.NODE_OUTPUT.value,
                    index_within_arg=0,
                    index_of_arg=0,
                    fqn=fqn_base_b,
                )
                # subgraph so far:
                #
                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
                #                  /
                # (prev_node_c+) -> (logger_c_input)? -> node_start_c -> ... -> node_end_c -> logger_c
                #
                # Note: node_start_c may be the same node as node_end_c, or they
                # may have nodes in between.

        else:
            # node_b is not the start or end of any match: copy it unchanged
            env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)

    gm_c = GraphModule(gm_b, graph_c)
    return gm_c
def get_base_name_to_sets_of_related_ops() -> dict[str, set[NSNodeTargetType]]:
    """
    Build the table of op families which are considered related to each other.

    The table starts from a hand written list of floating point ops and is
    then widened with the fused / QAT / reference / lowered counterparts found
    in the native backend config and in the lowering maps.

    Returns:
        A dict keyed by the stringified position of each family in the list
        below, mapping to the set of ops in that family.
    """
    # note: the sets in this list are modified below by items from
    # backend_config; the order of the list defines the returned keys
    families: list[set[NSNodeTargetType]] = [
        # conv modules
        {nn.Conv1d},
        {nn.Conv2d},
        {nn.Conv3d},
        # conv functionals
        {F.conv1d},
        {F.conv2d},
        {F.conv3d},
        # linear modules
        {nn.Linear},
        # linear functionals
        {F.linear},
        # average pool
        {nn.AvgPool1d, torch.avg_pool1d},
        {nn.AvgPool2d, torch._C._nn.avg_pool2d},
        {nn.AvgPool3d, torch._C._nn.avg_pool3d},
        # adaptive average pool
        {nn.AdaptiveAvgPool1d, F.adaptive_avg_pool1d},
        {nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d},
        {nn.AdaptiveAvgPool3d, F.adaptive_avg_pool3d},
        # LSTM
        {nn.LSTM},
        # add
        {torch.add, operator.add},  # x + y
        # cat
        {torch.cat},
        # mul
        {torch.mul, operator.mul},
        # relu
        {F.relu, nn.ReLU, "relu", "relu_", torch.relu},
        # maxpool
        {nn.MaxPool1d, F.max_pool1d},
        {nn.MaxPool2d, F.max_pool2d},
        {nn.MaxPool3d, F.max_pool3d},
        # sigmoid
        {torch.sigmoid, "sigmoid", "sigmoid_", nn.Sigmoid, F.sigmoid},
        # BatchNorm
        {nn.BatchNorm2d},
        {nn.BatchNorm3d},
        # ConvTranspose
        {nn.ConvTranspose1d},
        {nn.ConvTranspose2d},
        {nn.ConvTranspose3d},
        # functional transposed conv
        {F.conv_transpose1d},
        {F.conv_transpose2d},
        {F.conv_transpose3d},
        # ELU
        {nn.ELU},
        # Embedding
        {nn.Embedding},
        # EmbeddingBag
        {nn.EmbeddingBag},
        # GroupNorm
        {nn.GroupNorm},
        # Hardswish
        {nn.Hardswish},
        # InstanceNorm
        {nn.InstanceNorm1d},
        {nn.InstanceNorm2d},
        {nn.InstanceNorm3d},
        # LayerNorm
        {nn.LayerNorm},
        # LeakyReLU
        {nn.LeakyReLU},
        # ReLU6
        {nn.ReLU6, F.relu6},
        # F.elu
        {F.elu},
        # F.hardswish
        {F.hardswish},
        # F.group_norm
        {F.group_norm},
        # F.instance_norm
        {F.instance_norm},
        # F.layer_norm
        {F.layer_norm},
        # F.leaky_relu
        {F.leaky_relu},
        # F.silu
        {nn.SiLU, F.silu},
        # F.mish
        {nn.Mish, F.mish},
        # F.tanh
        {nn.Tanh, F.tanh, torch.tanh, "tanh_", "tanh"},
        # F.hardsigmoid
        {"hardsigmoid_", "hardsigmoid", F.hardsigmoid, nn.Hardsigmoid},
        # F.hardtanh
        {nn.Hardtanh, F.hardtanh, F.hardtanh_},
        # floordiv
        {operator.floordiv},
        # unsqueeze
        {torch.unsqueeze},
        # stack
        {torch.stack},
        # squeeze
        {torch.squeeze},
        # sort
        {torch.sort},
        # repeat_interleave
        {torch.repeat_interleave},
        # min
        {torch.min},
        # mean
        {torch.mean},
        # max
        {torch.max},
        # transpose
        {torch.transpose},
        # flatten
        {torch.flatten},
        # clamp
        {torch.clamp},
        # chunk
        {torch.chunk},
        # interpolate
        {torch.nn.functional.interpolate},
        # dropout
        {nn.Dropout},
        # F.dropout
        {F.dropout},
        # matmul
        {torch.matmul},
        # Softmax
        {nn.Softmax},
        # PReLU
        {nn.PReLU, nnq.PReLU},
        # F.prelu
        {F.prelu, toq.prelu},
        # pixel shuffle
        {nn.PixelShuffle},
        {F.pixel_shuffle},
        # pixel unshuffle
        {nn.PixelUnshuffle},
        {F.pixel_unshuffle},
        # narrow
        {torch.narrow},
    ]

    # for each floating point op, add versions of the op added by
    # backend_config
    native_config = get_native_backend_config()

    links: list[tuple[Callable, Callable]] = [
        # technical debt edge case
        (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear),
    ]

    for pattern, config in native_config._pattern_complex_format_to_config.items():
        # pattern format: (c, (b, a))
        # look from the end, because pattern is in reverse order
        root_op = pattern
        while isinstance(root_op, (list, tuple)):
            root_op = root_op[-1]

        # case 1: pattern fuses a pattern of ops into an op
        #   example: nn.Conv1d, nn.ReLU fused into nni.ConvReLU1d
        # case 2: pattern swaps a module into a QAT module
        #   example: nni.ConvReLU1d swapped into nniqat.ConvReLU1d
        # case 3: reference version of floating point module, such as
        #   nn.Conv2d and nnqr.Conv2d
        for counterpart in (
            config.fused_module,
            config.qat_module,
            config.reference_quantized_module,
        ):
            if counterpart is not None:
                links.append((root_op, counterpart))

    #
    # Add reference module swaps from default lowering path
    #

    for single_target_map in (
        _lower_to_native_backend.STATIC_LOWER_MODULE_MAP,
        _lower_to_native_backend.DYNAMIC_LOWER_MODULE_MAP,
        _lower_to_native_backend.WEIGHT_ONLY_LOWER_MODULE_MAP,
        _lower_to_native_backend.SPECIAL_PATTERN_LOWER_MODULE_MAP,
    ):
        links.extend(single_target_map.items())  # type: ignore[attr-defined]

    for double_target_map in (
        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_MAP,
        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP,
        _lower_to_native_backend.DYNAMIC_LOWER_FUSED_MODULE_MAP,
    ):
        for src, (dst_first, dst_second) in double_target_map.items():  # type: ignore[attr-defined]
            links.extend(((src, dst_first), (src, dst_second)))

    #
    # Add function swaps from default lowering path
    #

    functional_map = _lower_to_native_backend.STATIC_LOWER_FUNCTIONAL_MAP
    for src, (dst_first, dst_second) in functional_map.items():  # type: ignore[assignment]
        # pyrefly: ignore [bad-argument-type]
        links.extend(((src, dst_first), (src, dst_second)))

    for single_target_map in (
        _lower_to_native_backend.QBIN_OP_MAPPING,
        _lower_to_native_backend.QBIN_RELU_OP_MAPPING,
        quantization_mappings.DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS,
    ):
        # pyrefly: ignore [bad-argument-type]
        links.extend(single_target_map.items())  # type: ignore[arg-type]

    #
    # Add other swaps, ideally in the future this could be removed
    # after the lowering code stops using these.
    #
    links.extend(
        quantization_mappings.DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.items()  # type: ignore[arg-type]
    )

    # add the new connections from backend_config: each pair joins the first
    # family which already contains either of its members
    for left, right in links:
        for family in families:
            if left in family or right in family:
                family.add(left)
                family.add(right)
                break

    return {str(index): family for index, family in enumerate(families)}
+ # + for source_to_target in ( + quantization_mappings.DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, + ): + for source, target in source_to_target.items(): # type:ignore[assignment] + new_connections.append((source, target)) + + # add the new connections from backend_config + for item1, item2 in new_connections: + for set_of_related_ops in sets_of_related_ops: + if item1 in set_of_related_ops or item2 in set_of_related_ops: + set_of_related_ops.add(item1) + set_of_related_ops.add(item2) + break + + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] = {} + + for counter, set_of_related_ops in enumerate(sets_of_related_ops): + base_name = str(counter) + base_name_to_sets_of_related_ops[base_name] = set_of_related_ops + + return base_name_to_sets_of_related_ops + + +def get_base_name_for_op( + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]], + op: NSNodeTargetType, +) -> str | None: + for base_name, set_of_related_ops in base_name_to_sets_of_related_ops.items(): + if op in set_of_related_ops: + return base_name + return None + + +def add_op_to_sets_of_related_ops( + base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]], + op: NSNodeTargetType, + related_op: NSNodeTargetType | None, +) -> None: + if related_op is not None: + for set_of_related_ops in base_name_to_sets_of_related_ops.values(): + if related_op in set_of_related_ops: + set_of_related_ops.add(op) + return + # if we got here, related_op was not found + raise AssertionError(f"{related_op} was not found") + else: + counter = 0 + while str(counter) in base_name_to_sets_of_related_ops: + counter += 1 + base_name_to_sets_of_related_ops[str(counter)] = {op} + + +# TODO(future PR): clean this up +def get_node_type_to_io_type_map() -> dict[str, set[NSNodeTargetType]]: + FUNS_IO_TYPE_FP32: set[NSNodeTargetType] = { + F.linear, + F.conv1d, + F.conv2d, + F.conv3d, + torch.cat, + F.elu, + F.hardswish, + F.instance_norm, + F.layer_norm, + F.leaky_relu, + F.dropout, + F.silu, + 
# TODO(future PR): clean this up
def get_node_type_to_io_type_map() -> dict[str, set[NSNodeTargetType]]:
    """
    Return the sets of ops grouped by the dtype of their input and output.

    Returns:
        A dict with the keys ``funs_io_type_fp32``, ``funs_io_type_fp16``,
        ``funs_io_type_int8``, ``funs_io_type_fp32_or_int8``,
        ``mods_io_type_fp32``, ``mods_io_type_int8``,
        ``mods_io_type_fp32_or_int8`` and ``meths_io_type_fp32_or_int8``.
        New sets are built on every call.

    Some ops appear in more than one set (for example ``F.dropout``,
    ``torch.cat`` and ``operator.add``); that is kept as is, since the order
    in which consumers consult the sets is not visible from here.
    """
    FUNS_IO_TYPE_FP32: set[NSNodeTargetType] = {
        F.linear,
        F.conv1d,
        F.conv2d,
        F.conv3d,
        torch.cat,
        F.elu,
        F.hardswish,
        F.instance_norm,
        F.layer_norm,
        F.leaky_relu,
        F.dropout,
        F.silu,
        F.mish,
        operator.add,
        torch.add,
        operator.mul,
        torch.mul,
        torch.sum,
        F.prelu,
    }

    FUNS_IO_TYPE_FP16: set[NSNodeTargetType] = set()

    FUNS_IO_TYPE_INT8: set[NSNodeTargetType] = {
        toq.linear,
        toq.linear_relu,
        toq.conv1d,
        toq.conv1d_relu,
        toq.conv2d,
        toq.conv2d_relu,
        toq.conv3d,
        toq.conv3d_relu,
        toq.cat,
        toq.elu,
        toq.hardswish,
        toq.instance_norm,
        toq.layer_norm,
        toq.leaky_relu,
        toq.dropout,
        toq.prelu,
        # TODO(future PR): implement shadowing for binary ops and
        # uncomment below
        # toq.add,
        # toq.mul,
    }

    FUNS_IO_TYPE_FP32_OR_INT8: set[NSNodeTargetType] = {
        F.relu,
        F.tanh,
        torch.tanh,
        F.sigmoid,
        torch.sigmoid,
        F.hardsigmoid,
        operator.floordiv,
        torch.adaptive_avg_pool1d,
        F.adaptive_avg_pool2d,
        F.adaptive_avg_pool3d,
        F.dropout,
        F.hardtanh,
        F.hardtanh_,
        F.interpolate,
        F.max_pool1d,
        F.max_pool2d,
        F.max_pool3d,
        F.relu6,
        F.pixel_shuffle,
        F.pixel_unshuffle,
        torch.avg_pool1d,
        torch._C._nn.avg_pool2d,
        torch._C._nn.avg_pool3d,
        torch.cat,
        torch.chunk,
        torch.clamp,
        torch.flatten,
        torch.transpose,
        torch.max,
        torch.mean,
        torch.min,
        torch.narrow,
        torch.repeat_interleave,
        torch.sort,
        torch.squeeze,
        torch.stack,
        torch.unsqueeze,
        operator.add,
    }

    MODS_IO_TYPE_FP32: set[NSNodeTargetType] = {
        nn.Linear,
        nnqat.Linear,
        nnqatd.Linear,
        # note: nnqd.Linear is an instance of nnq.Linear, so this
        # check has to happen before the int8 module check
        nnqd.Linear,
        torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
        nn.Conv1d,
        nn.Conv2d,
        nn.Conv3d,
        nnqat.Conv1d,
        nnqat.Conv2d,
        nnqat.Conv3d,
        nnqat.Embedding,
        nnqat.EmbeddingBag,
        nn.LSTM,
        nnqd.LSTM,
        nn.BatchNorm2d,
        nn.BatchNorm3d,
        nn.Dropout,
        nn.ConvTranspose1d,
        nn.ConvTranspose2d,
        nn.ConvTranspose3d,
        nn.ELU,
        nn.GroupNorm,
        nn.InstanceNorm1d,
        nn.InstanceNorm2d,
        nn.InstanceNorm3d,
        nn.LayerNorm,
        nn.Hardswish,
        nn.LeakyReLU,
        nn.ReLU6,
        nn.SiLU,
        nn.Mish,
        nn.Softmax,
        nn.PReLU,
        nni.BNReLU2d,
        nni.BNReLU3d,
        nni.ConvReLU1d,
        nni.ConvReLU2d,
        nni.ConvReLU3d,
        nni.LinearReLU,
        nni.LinearBn1d,
        nni.ConvBn1d,
        nni.ConvBn2d,
        nni.ConvBn3d,
        nniqat.ConvBn1d,
        nniqat.ConvBn2d,
        nniqat.ConvBn3d,
        nniqat.ConvBnReLU1d,
        nniqat.ConvBnReLU2d,
        nniqat.ConvBnReLU3d,
        nniqat.ConvReLU1d,
        nniqat.ConvReLU2d,
        nniqat.ConvReLU3d,
        nniqat.LinearReLU,
        nniqat.LinearBn1d,
        nniqd.LinearReLU,
        nni.LinearLeakyReLU,
        nni.LinearTanh,
        nni.ConvAdd2d,
        nni.ConvAddReLU2d,
    }

    # nnq.Dropout used to be listed twice in this literal; a set keeps a
    # single copy either way, so the second entry was dropped.
    MODS_IO_TYPE_INT8: set[NSNodeTargetType] = {
        nnq.Linear,
        nnq.Conv1d,
        nnq.Conv2d,
        nnq.Conv3d,
        nnq.BatchNorm2d,
        nnq.BatchNorm3d,
        nnq.Dropout,
        nnq.ConvTranspose1d,
        nnq.ConvTranspose2d,
        nnq.ELU,
        nnq.InstanceNorm1d,
        nnq.InstanceNorm2d,
        nnq.InstanceNorm3d,
        nnq.LayerNorm,
        nnq.Hardswish,
        nnq.LeakyReLU,
        nnq.Embedding,
        nnq.EmbeddingBag,
        nnq.Softmax,
        nnq.PReLU,
        nniq.BNReLU2d,
        nniq.BNReLU3d,
        nniq.ConvReLU1d,
        nniq.ConvReLU2d,
        nniq.ConvReLU3d,
        nniq.LinearReLU,
        nniq.LinearLeakyReLU,
        nniq.LinearTanh,
        nniq.ConvAdd2d,
        nniq.ConvAddReLU2d,
    }

    MODS_IO_TYPE_FP32_OR_INT8: set[NSNodeTargetType] = {
        nn.ReLU,
        nn.Tanh,
        nn.Sigmoid,
        nn.Hardsigmoid,
        nn.AdaptiveAvgPool1d,
        nn.AdaptiveAvgPool2d,
        nn.AdaptiveAvgPool3d,
        nn.AvgPool1d,
        nn.AvgPool2d,
        nn.AvgPool3d,
        nn.Dropout,
        nn.Hardtanh,
        nn.Identity,
        nn.MaxPool1d,
        nn.MaxPool2d,
        nn.MaxPool3d,
        nn.PixelShuffle,
        nn.PixelUnshuffle,
        nn.ReLU6,
    }

    METHS_IO_TYPE_FP32_OR_INT8: set[NSNodeTargetType] = {
        "sigmoid_",
        "sigmoid",
        "tanh_",
        "tanh",
        "hardsigmoid_",
        "hardsigmoid",
        "relu_",
        "relu",
    }

    return {
        "funs_io_type_fp32": FUNS_IO_TYPE_FP32,
        "funs_io_type_fp16": FUNS_IO_TYPE_FP16,
        "funs_io_type_int8": FUNS_IO_TYPE_INT8,
        "funs_io_type_fp32_or_int8": FUNS_IO_TYPE_FP32_OR_INT8,
        "mods_io_type_fp32": MODS_IO_TYPE_FP32,
        "mods_io_type_int8": MODS_IO_TYPE_INT8,
        "mods_io_type_fp32_or_int8": MODS_IO_TYPE_FP32_OR_INT8,
        "meths_io_type_fp32_or_int8": METHS_IO_TYPE_FP32_OR_INT8,
    }
def get_unmatchable_types_map() -> dict[str, set[NSNodeTargetType]]:
    """
    Return the functions, modules and method names listed as unmatchable.

    Returns:
        A dict with the keys ``funs_unmatchable``, ``mods_unmatchable`` and
        ``meths_unmatchable``. New sets are built on every call.
    """
    unmatchable: dict[str, set[NSNodeTargetType]] = {}

    unmatchable["funs_unmatchable"] = {
        torch.quantize_per_tensor,
        operator.getitem,
    }

    unmatchable["mods_unmatchable"] = {nn.Identity}

    # tensor methods, referenced by name
    unmatchable["meths_unmatchable"] = {
        "to",
        "dequantize",
        "reshape",
        "view",
        "unsqueeze_",
        "unsqueeze",
        "transpose",
        "squeeze_",
        "squeeze",
        "size",
        "shape",
        "resize_",
        "repeat_interleave",
        "repeat",
        "permute",
        "numel",
        "mean",
        "detach_",
        "detach",
        "contiguous",
        "clamp",
        "chunk",
    }

    return unmatchable
class OutputProp:
    """
    Output propagation (modeled from shape propagation).

    Given a GraphModule and an example input, saves the output flowing
    through each node on `node.traced_result`.

    Code based on the example from
    https://pytorch.org/docs/stable/fx.html#the-interpreter-pattern
    """

    def __init__(self, mod):
        """Remember ``mod``, its graph and a name -> submodule lookup table."""
        self.mod = mod
        self.graph = mod.graph
        self.modules = dict(self.mod.named_modules())

    def propagate(self, *args):
        """
        Run the graph node by node on ``args`` and record tensor results.

        Every node whose result is a ``torch.Tensor`` gets that tensor stored
        on ``node.traced_result``. Results of other types are still passed on
        to later nodes, but are not stored on the node.

        Args:
            *args: example inputs, consumed in order by the placeholder nodes.

        Returns:
            None.

        Raises:
            RuntimeError: if a ``get_attr`` node references an attribute which
                does not exist on the module, or if a node has an op which is
                not handled here.
        """
        args_iter = iter(args)
        # maps node name -> the value computed for that node
        env: dict[str, Any] = {}

        def load_arg(a):
            return torch.fx.graph.map_arg(a, lambda n: env[n.name])

        def fetch_attr(target: str):
            target_atoms = target.split(".")
            attr_itr = self.mod
            for i, atom in enumerate(target_atoms):
                if not hasattr(attr_itr, atom):
                    # include the atom which failed to resolve in the message
                    raise RuntimeError(
                        f"Node referenced nonexistent target {'.'.join(target_atoms[: i + 1])}"
                    )
                attr_itr = getattr(attr_itr, atom)
            return attr_itr

        for node in self.graph.nodes:
            if node.op == "placeholder":
                result = next(args_iter)
            elif node.op == "get_attr":
                result = fetch_attr(node.target)
            elif node.op == "call_function":
                result = node.target(*load_arg(node.args), **load_arg(node.kwargs))
            elif node.op == "call_method":
                self_obj, *method_args = load_arg(node.args)
                kwargs = load_arg(node.kwargs)
                result = getattr(self_obj, node.target)(*method_args, **kwargs)
            elif node.op == "call_module":
                result = self.modules[node.target](
                    *load_arg(node.args), **load_arg(node.kwargs)
                )
            elif node.op == "output":
                # use the value which is really returned, instead of whatever
                # the previously visited node happened to produce
                result = load_arg(node.args[0])
            else:
                raise RuntimeError(f"Unsupported node op: {node.op}")

            if isinstance(result, torch.Tensor):
                node.traced_result = result

            env[node.name] = result

        return None
+ + if isinstance(node_or_tuple, Node): + if node_or_tuple in seen_nodes: + was_seen = True + seen_nodes.add(node_or_tuple) + + else: + if not isinstance(node_or_tuple, tuple): + raise AssertionError(f"Expected tuple, got {type(node_or_tuple)}") + for node in node_or_tuple: + if not isinstance(node, Node): + raise AssertionError(f"Expected Node, got {type(node)}") + if node in seen_nodes: + was_seen = True + seen_nodes.add(node) + + if was_seen: + continue + + # Start with the unusual type, convert it to [op_0, ..., op_n] + list_of_nodes = [] + + if len(cur_match[1]) == 1: + list_of_nodes = cur_match[1] + else: + if len(cur_match[1]) != 2: + raise ValueError( + f"Expected cur_match[1] to have length 2, got {len(cur_match[1])}" + ) + # either (a, b), or ((a, b), c) or (c, (a, b)) + # cannot make any assumptions on order, not clear what the + # _find_matches function is doing to populate this + # TODO(future PR): make this code less confusing, see discussion + # in https://github.com/pytorch/pytorch/pull/80521/files#r975918836 + + def _order_nodes(node_a, node_b, node_c) -> list[Node]: + nodes = [node_a, node_b, node_c] + first_node = None + mid_node = None + last_node = None + for n in nodes: + prev_n = n.args[0] + next_n = next(iter(n.users)) + if prev_n not in nodes: + first_node = n + elif next_n not in nodes: + last_node = n + else: + mid_node = n + if first_node is None or mid_node is None or last_node is None: + raise AssertionError("Expected all nodes to be non-None") + if mid_node.args[0] is not first_node: + raise AssertionError("Expected mid_node.args[0] to be first_node") + if last_node.args[0] is not mid_node: + raise AssertionError("Expected last_node.args[0] to be mid_node") + return [last_node, mid_node, first_node] + + if isinstance(cur_match[1][0], Node) and isinstance(cur_match[1][1], Node): + # (a, b) + list_of_nodes = cur_match[1] + elif isinstance(cur_match[1][0], tuple): + # ((a, b), c) + node_a, node_b = cur_match[1][0] + node_c = 
cur_match[1][1] + list_of_nodes = _order_nodes(node_a, node_b, node_c) + elif isinstance(cur_match[1][1], tuple): + # (a, (b, c)) + node_a, node_b = cur_match[1][1] + node_c = cur_match[1][0] + list_of_nodes = _order_nodes(node_a, node_b, node_c) + + # [node_n, ..., node_0], note that the order is reversed + # to make it chronological for simple subgraphs + list_of_nodes.reverse() + subgraphs_dedup[name] = list_of_nodes + + return subgraphs_dedup + + +def _get_logger_for_subgraph( + model: GraphModule, + first_node: Node, + last_node: Node, + subgraph_idx: int, + subgraph_candidate_idx: int, + qconfig_str: str, + logger_cls: Callable, + fqn: str | None, +) -> torch.nn.Module: + """ + Given a model and a linear subgraph starting from `first_node` and + ending with `last_node`, creates a logger for the end of this + subgraph. + """ + if fqn is None: + fqn = "" + logger_mod_orig = logger_cls( + first_node.name, # ref_node_name + last_node.name, # prev_node_name + f"subgraph_{subgraph_idx}_{subgraph_candidate_idx}", # model_name + "model", # ref_name + get_target_type_str(last_node, model), # prev_node_target_type + get_target_type_str(first_node, model), # ref_node_target_type + NSSingleResultValuesType.NODE_OUTPUT.value, # results_type + 0, # index_within_arg + 0, # index_of_arg + fqn, # fqn + qconfig_str, + ) + # Usually we expect the user to add loggers, then calibrate, then convert, + # and then populate loggers. This is why the loggers start disabled. + # TODO(future PR): reconsider the design to make this more intuitive. + logger_mod_orig.enabled = False + return logger_mod_orig + + +def create_submodule_from_subgraph( + model: torch.nn.Module, + first_node: Node, + last_node: Node, +) -> GraphModule: + """ + Input: a model, and a linear subgraph within the model from first_node to + last_node. 
+ + Output: a new submodule containing a copy of the subgraph, with the inputs + to the first node becoming the inputs to the submodule, and all other + nodes in the subgraph being copied. + + Example inputs: + + `model`: a module with graph + + x0 -> op1 -> x1 -> op2 -> x2 + | + arg1 + + `first_node`: op1 + `last_node`: op2 + + Example output: a new module with graph + + input1 -> op1_copy -> x1 -> op2_copy -> output1 + | + arg1 + """ + + # + # create a blank GraphModule with an empty graph + # + + class M(torch.nn.Module): + def forward(self, x): + pass + + m = M() + gm = torch.fx.symbolic_trace(m) + g = gm.graph + for node in reversed(gm.graph.nodes): + g.erase_node(node) + + # + # modify the graph to have a copy of our subgraph + # + + cur_node_orig = first_node + + cur_name_idx = 0 + + iteration_limit = 100 + cur_iteration = 0 + + while True: + if cur_node_orig is first_node: + # we are at the first node, we need to set up graph inputs + # TODO(future): some graphs could have placeholders which are unrelated + # to the first node, need to handle this + cur_args_copy = [] + cur_kwargs_copy = {} + seen_names: set[str] = set() + old_name_to_new_node: dict[str, Node] = {} + + def _add_placeholder( + g: Graph, node: Node, seen_names, old_name_to_new_node + ): + # note: for graphs starting with patterns such as `y = x + x`, we + # need to ensure we do not add multiple placeholders with the + # same name + counter = 0 + while node.name + "_" + str(counter) in seen_names: + counter += 1 + cur_name = node.name + "_" + str(counter) + seen_names.add(cur_name) + placeholder = g.placeholder(cur_name) + old_name_to_new_node[node.name] = placeholder + return placeholder + + for arg in cur_node_orig.args: + if isinstance(arg, Node): + p = _add_placeholder(g, arg, seen_names, old_name_to_new_node) + cur_args_copy.append(p) + elif isinstance(arg, (list, tuple)): + new_arg = [] + for inner_arg in arg: + if isinstance(inner_arg, Node): + new_arg.append( + _add_placeholder( + g, 
inner_arg, seen_names, old_name_to_new_node + ) + ) + else: + new_arg.append(inner_arg) + cur_args_copy.append(new_arg) + else: + cur_args_copy.append(arg) + + # TODO(future PR): handle non-normalized kwargs + for kwarg_name, kwarg in cur_node_orig.kwargs.items(): + if isinstance(kwarg, Node): + cur_kwargs_copy[kwarg_name] = _add_placeholder( + g, kwarg, seen_names, old_name_to_new_node + ) + elif isinstance(kwarg, (list, tuple)): + new_kwarg = [] + for inner_kwarg in kwarg: + p = _add_placeholder( + g, + inner_kwarg, # type: ignore[arg-type] + seen_names, + old_name_to_new_node, + ) + new_kwarg.append(p) + cur_kwargs_copy[kwarg_name] = new_kwarg + else: + cur_kwargs_copy[kwarg_name] = kwarg + + cur_args_copy = tuple(cur_args_copy) # type: ignore[assignment] + else: + # we are not at first node, first arg is from the previous node, + # and all other args are copied + + # the current implementation is simplistic and cannot handle + # ops with two or more arguments which need to be passed from + # the previous op, so we assert them out + if cur_node_orig.target in BINARY_FUNCTIONS: + raise AssertionError( + f"Unexpected binary function target: {cur_node_orig.target}" + ) + + # at this point in the code, cur_node_copy is pointing to the copy + # of the previous node + # TODO(future PR): this is not handling complicated graphs correctly, need to + # look at actual relationships instead of assuming sequential graph + # TODO(future PR): this is ignoring kwargs, will need to support kwargs + # for any fusion pattern which has them for a node that is not the + # first node. 
+ cur_args_copy = [cur_node_copy] # type: ignore[has-type, possibly-undefined] # noqa: F821 + + if len(cur_node_orig.args) > 1: + for arg in cur_node_orig.args[1:]: + if isinstance(arg, torch.nn.Parameter): + new_arg = arg.detach().clone() # type: ignore[assignment] + mod_name = f"mod_{cur_name_idx}" + cur_name_idx += 1 + setattr(gm, mod_name, new_arg) + new_arg_placeholder = gm.placeholder(mod_name) # type: ignore[operator] + # pyrefly: ignore [missing-attribute] + cur_args_copy.append(new_arg_placeholder) + elif isinstance(arg, (float, int, torch.dtype)): + # pyrefly: ignore [missing-attribute] + cur_args_copy.append(arg) + else: + raise AssertionError(f"arg of type {type(arg)} not handled yet") + cur_args_copy = tuple(cur_args_copy) # type: ignore[assignment] + + # copy the node + if cur_node_orig.op == "call_module": + orig_mod = getattr_from_fqn(model, cur_node_orig.target) # type: ignore[arg-type] + orig_mod_copy = copy.deepcopy(orig_mod) + mod_name = f"mod_{cur_name_idx}" + setattr(gm, mod_name, orig_mod_copy) + cur_name_idx += 1 + cur_node_copy = g.call_module(mod_name, cur_args_copy, cur_kwargs_copy) # type: ignore[possibly-undefined,arg-type] + + elif cur_node_orig.op == "call_function": + cur_node_copy = g.call_function( + cur_node_orig.target, # type: ignore[arg-type] + cur_args_copy, # type: ignore[arg-type] + cur_kwargs_copy, # type: ignore[possibly-undefined] + ) + + elif cur_node_orig.op == "call_method": + cur_node_copy = g.call_method( + cur_node_orig.target, # type: ignore[arg-type] + cur_args_copy, # type: ignore[arg-type] + cur_kwargs_copy, # type: ignore[possibly-undefined] + ) + + else: + raise AssertionError(f"{cur_node_orig.op} not supported yet") + + if cur_node_orig is last_node: + break + + # go to next node + if len(cur_node_orig.users.keys()) != 1: + raise AssertionError( + f"{cur_node_orig} has more than 1 users, not supported yet" + ) + cur_node_orig = next(iter(cur_node_orig.users.keys())) + cur_iteration += 1 + if cur_iteration > 
iteration_limit: + raise AssertionError("iteration limit exceeded") + + # set up outputs + g.output(cur_node_copy) + + gm.recompile() + return gm + + +def create_one_transformed_and_logged_copy_of_subgraph( + mt: GraphModule, + subgraph_idx: int, + subgraph_candidate_idx: int, + first_node: Node, + last_node: Node, + fqn: str | None, + list_of_node_name_to_qconfig: list[dict[str, QConfigAny]], + example_inputs: Any, + last_added_shadow_node_list: list[Node | None], + custom_prepare_fn: Callable | None = None, + custom_prepare_kwargs: dict[str, Any] | None = None, +) -> None: + """ + Given a subgraph in `mt` and a subgraph candidate idx, inserts the + subgraph candidate copy and instruments it with loggers. + + If subgraph_candidate_idx is 0, this is the baseline fp32 subgraph and we just + add a logger to the end. + + If subgraph_candidate_idx is not 0, we create a copy of the subgraph and + prepare it with `prepare_fx`. + """ + + # TODO(future PR): move logger classes to utils to remove circular dependency + from torch.ao.ns._numeric_suite_fx import OutputComparisonLogger, OutputLogger + + if subgraph_candidate_idx == 0: + # idx = 0 is the floating point (original) version of the subgraph + # We keep the subgraph as is, and add a logger at the end + + qconfig_str = "" + logger_mod_orig = _get_logger_for_subgraph( + mt, + first_node, + last_node, + subgraph_idx, + subgraph_candidate_idx, + qconfig_str, + OutputLogger, + fqn, + ) + + attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx) + if hasattr(mt, attr_name): + raise AssertionError(f"Unexpected attribute '{attr_name}' found in {mt}") + setattr(mt, attr_name, logger_mod_orig) + with mt.graph.inserting_after(last_node): + new_node = mt.graph.call_module(attr_name, args=(last_node,), kwargs={}) + last_added_shadow_node_list[0] = new_node + + else: + # idx > 0 means we have a candidate qconfig to try, so we need + # to make a copy of the subgraph, feed it with the right inputs, + # and add a logger at 
the end + + # get the qconfig + # subtract one because the first candidate is the floating point + # version of the subgraph + node_name_to_qconfig = list_of_node_name_to_qconfig[subgraph_candidate_idx - 1] + qconfig = node_name_to_qconfig[first_node.name] + + # if no quantization is requested, skip + # TODO(future PR): deduplicate equivalent qconfigs that come from + # different qconfig mapping objects + if qconfig is None: + return + + qconfig_mapping = QConfigMapping().set_global(qconfig) + + # create a copy of the submodule, wrapped in a separate module + orig_mod_copy_wrapped = create_submodule_from_subgraph( + mt, first_node, last_node + ) + + # add a call to prepare_fx on the wrapper module + if custom_prepare_fn is None: + orig_mod_copy_wrapped = torch.ao.quantization.quantize_fx.prepare_fx( + orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs + ) + else: + if custom_prepare_kwargs is None: + custom_prepare_kwargs = {} + for kwarg_name in [ + "example_inputs", + "prepare_custom_config", + "qconfig_mapping", + ]: + if kwarg_name in custom_prepare_kwargs: + raise AssertionError( + f"cannot specify {kwarg_name} in custom_prepare_kwargs" + ) + prepare_kwargs: dict[str, Any] = { + "example_inputs": example_inputs, + "qconfig_mapping": qconfig_mapping, + } + prepare_kwargs.update(custom_prepare_kwargs) + orig_mod_copy_wrapped = custom_prepare_fn( + orig_mod_copy_wrapped, **prepare_kwargs + ) + + # attach the wrapper to the model + attr_name = _get_attr_wrapper_name(subgraph_idx, subgraph_candidate_idx) + if hasattr(mt, attr_name): + raise AssertionError(f"Unexpected attribute '{attr_name}' found in {mt}") + setattr(mt, attr_name, orig_mod_copy_wrapped) + + # add a call to the wrapper module from the parent graph + insert_after_node = last_added_shadow_node_list[0] + with mt.graph.inserting_after(insert_after_node): + # TODO(future PR): handle fusion patterns where non-first nodes + # need inputs + + # pass in all node args and kwargs + + 
new_args = [] + for arg in first_node.args: + if isinstance(arg, Node): + new_args.append(arg) + elif ( + isinstance(arg, (list, tuple)) + and len(arg) + and isinstance(arg[0], Node) + ): + new_args.extend( + inner_arg for inner_arg in arg if isinstance(inner_arg, Node) + ) + + new_kwargs = {} + for name, old_kwarg in first_node.kwargs.items(): + if isinstance(old_kwarg, Node): + new_kwargs[name] = old_kwarg + elif isinstance(old_kwarg, (list, tuple)) and len(old_kwarg): + # TODO(future PR): clarify why we are adding kwargs to args + new_args.extend(old_kwarg) # type: ignore[arg-type] + + new_args = tuple(new_args) # type: ignore[assignment] + + new_node = mt.graph.call_module(attr_name, args=new_args, kwargs=new_kwargs) # type: ignore[arg-type] + + # add a logger to parent graph to observe the shadow wrapper + logger_mod_orig = _get_logger_for_subgraph( + mt, + first_node, + last_node, + subgraph_idx, + subgraph_candidate_idx, + str(qconfig), + OutputComparisonLogger, + fqn, + ) + + attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx) + if hasattr(mt, attr_name): + raise AssertionError(f"Unexpected attribute '{attr_name}' found in {mt}") + setattr(mt, attr_name, logger_mod_orig) + with mt.graph.inserting_after(new_node): + logger = mt.graph.call_module( + attr_name, args=(new_node, last_node), kwargs={} + ) + last_added_shadow_node_list[0] = logger + + mt.recompile() + + +def create_n_transformed_and_logged_copies_of_subgraph( + mt: GraphModule, + subgraph_idx: int, + match_name: str, + nodes_in_this_subgraph: list[Any], + qconfig_mappings: list[QConfigMapping], + list_of_node_name_to_qconfig: list[dict[str, QConfigAny]], + custom_prepare_fn: Callable | None = None, + custom_prepare_kwargs: dict[str, Any] | None = None, +) -> None: + """ + Given a model `mt` and a subgraph_idx, creates the needed copies + of the subgraph for all qconfigs, and instruments them with loggers. + """ + # for now, assume that + # 1. the first node has one input + # 2. 
the last node has one output + + # for now, ignore all subgraphs that contain non-nodes (tuples, etc) + # TODO(future PR): implement this + if any(not isinstance(node, Node) for node in nodes_in_this_subgraph): + return + + first_node = nodes_in_this_subgraph[0] + last_node = nodes_in_this_subgraph[-1] + # We used output propagation to populate example values on each + # node. Use the example values from the previous node as the input + # to the current node. + prev_node = get_normalized_nth_input(first_node, mt, 0) + if isinstance(prev_node, list): + example_inputs = [x.traced_result for x in prev_node] + elif isinstance(prev_node, tuple): + example_inputs = (x.traced_result for x in prev_node) # type: ignore[assignment] + else: + # currently some customer models do not have a traced_result in + # every node, so we have to guard for this case since we cannot + # quantize without an example input + # TODO(future PR): add a test case for this once we have an easy + # repro, see https://github.com/pytorch/pytorch/pull/80521/files#r975940489 + # for additional context + if hasattr(prev_node, "traced_result"): + example_inputs = (prev_node.traced_result,) # type: ignore[attr-defined, assignment] + else: + print( + "unable to get example input for node " + + f"{first_node.format_node()}, skipping" + ) + return + + # If there are no quantization configs for this subgraph, skip adding + # loggers. This reduces memory usage for models where not all layers are + # quantized. + # TODO(future): consider making this configurable + found_at_least_one_qconfig = False + for subgraph_candidate_idx in range(len(qconfig_mappings) + 1): + if subgraph_candidate_idx == 0: + # fp32 baseline does not need a qconfig + continue + + # a. we have N shadows, so len(qconfig_mappings) is N + # b. we will have the fp32 layer + N shadows, so overall number of + # (original_op) + (*shadows) will be N+1 + # c. 
since `subgraph_candidate_idx` represents (b), we need + # to subtract 1 to query from (a) + node_name_to_qconfig = list_of_node_name_to_qconfig[subgraph_candidate_idx - 1] + qconfig = node_name_to_qconfig[first_node.name] + if qconfig is not None: + found_at_least_one_qconfig = True + break + if not found_at_least_one_qconfig: + print( + "unable to find at least one qconfig for node " + + f"{first_node.format_node()}, skipping" + ) + return + + fqn = _maybe_get_fqn(first_node, mt) + + # We want the results to contain the subgraphs in natural order, + # and the graph to also contain shadow wrappers and shadow loggers + # in natural order. + # If we just iterate in reverse, the graph will be in natural + # order but the eventual results will be in reverse order. + # So, we keep track of the last shadow logger we added and + # always insert after it. + last_added_shadow_node_list: list[Node | None] = [None] + for subgraph_candidate_idx in range(len(qconfig_mappings) + 1): + create_one_transformed_and_logged_copy_of_subgraph( + mt, + subgraph_idx, + subgraph_candidate_idx, + first_node, + last_node, + fqn, + list_of_node_name_to_qconfig, + example_inputs, + last_added_shadow_node_list, + custom_prepare_fn, + custom_prepare_kwargs, + ) + + +def create_add_loggers_graph( + model: GraphModule, + subgraphs_dedup: dict[str, list[Node]], + qconfig_mapping: QConfigMapping, + node_name_to_qconfig: dict[str, QConfigAny], +) -> None: + r""" + Given a model, a model graph partition (currently a set of matched + subgraphs) and instructions how to transform each subgraph + (currently quantizing it according to qconfig_mapping), modifies + the model graph to create an alternate path through the original graph, + with each of the subgraphs quantized. This is useful to compare + propagation error of a transformation such as quantization. + + For example, given layer op0 and op1, there are four cases when handling op1: + 1. op0 and op1 quantized + 2. op0 and op1 unquantized + 3. 
op0 quantized, op1 unquantized + 4. op0 unquantized, op1 quantized + + Example input, case 1: + + .. code:: + + x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log + \ \ \ \ # noqa: W605 + ---> op0_1 -> x1_1 ----> clog op1_1 -> x2_1 ----> clog + + Example output, case 1: + + .. code:: + + x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log + \ \ \ # noqa: W605 + ---> op0_1 -> x1_1 ----> clog -> op1_1 -> x2_1 ----> clog + + """ + # TODO(future PR): move logger classes to utils to remove circular dependency + from torch.ao.ns._numeric_suite_fx import OutputComparisonLogger, OutputLogger + + def _get_subgraph_containing_node(node, subgraphs_dedup): + for subgraph in subgraphs_dedup.values(): + if node in subgraph: + return subgraph + return None + + # First, we need to create shadow branches, going from + # + # x0 -> op0 -> x1 -> ... + # + # + # to + # + # x0 -> op0_0 -> x1_0 -> log -> ... + # \ \ + # -> op0_1 -> x1_1 -> clog + # + # Later, the outputs of each shadow will be rerouted to calculate + # propagation error. + + # Note: we cannot iterate over matched subgraphs because some nodes + # may not be matched. So, we iterate over nodes in the graph, and + # associate them to matched subgraphs if possible. 
+ + nodes_to_skip = set() + # for each subgraph, save a mapping from first node of subgraph + # to first and last node of the shadow of this subgraph + orig_first_node_to_shadow_in_node = {} + orig_first_node_to_shadow_out_node = {} + # need to record original list because we will mutate the graph as we go + orig_nodes = list(model.graph.nodes) # type: ignore[union-attr, arg-type] + cur_subgraph_idx = 0 + for n in orig_nodes: + if n.op in ("placeholder", "get_attr", "output") or n in nodes_to_skip: + continue + + maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup) + insert_submodule_copy = False + if maybe_subgraph is not None: + first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1] + nodes_to_skip.update(maybe_subgraph) + qconfig = node_name_to_qconfig[first_node.name] + if qconfig is not None: + insert_submodule_copy = True + else: + first_node, last_node = n, n + + if insert_submodule_copy: + match_name = first_node.name + create_n_transformed_and_logged_copies_of_subgraph( + model, + cur_subgraph_idx, + match_name, + # pyrefly: ignore [bad-argument-type] + maybe_subgraph, + [qconfig_mapping], + [node_name_to_qconfig], + None, + None, # type: ignore[arg-type] + ) + # find the created shadow module and record it so we + # can find it easily in step 2 + expected_shadow_target = f"shadow_wrapper_{cur_subgraph_idx}_1" + new_shadow_mod = None + for maybe_shadow_mod in model.graph.nodes: + if ( + maybe_shadow_mod.op == "call_module" + and maybe_shadow_mod.target == expected_shadow_target + ): + new_shadow_mod = maybe_shadow_mod + break + if new_shadow_mod is None: + raise AssertionError("Expected new_shadow_mod to be non-None") + orig_first_node_to_shadow_in_node[first_node] = new_shadow_mod + orig_first_node_to_shadow_out_node[first_node] = new_shadow_mod + + else: + # create a copy of the subgraph by only copying FX nodes + # but not copying any parameters, to minimize memory usage + subgraph_to_use = ( + maybe_subgraph if maybe_subgraph is 
not None else [first_node] + ) + + # add a regular logger after last_node + qconfig_str = "" + subgraph_candidate_idx = 0 + fqn = _maybe_get_fqn(first_node, model) + logger_mod_orig = _get_logger_for_subgraph( + model, + first_node, + last_node, + cur_subgraph_idx, + subgraph_candidate_idx, + qconfig_str, + OutputLogger, + fqn, + ) + attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx) + if hasattr(model, attr_name): + raise AssertionError( + f"Unexpected attribute '{attr_name}' found in {model}" + ) + setattr(model, attr_name, logger_mod_orig) + insertion_point = last_node + with model.graph.inserting_after(insertion_point): + logger = model.graph.call_module( + attr_name, args=(last_node,), kwargs={} + ) + insertion_point = logger + + # create a copy of the subgraph + cur_node_orig = first_node + cur_node_copy = None + first_node_copy = None + # pyrefly: ignore [bad-assignment] + while cur_node_orig in subgraph_to_use: + # TODO(future PR): make this support all possible args/kwargs + if cur_node_orig is first_node: + new_args = cur_node_orig.args + new_kwargs = cur_node_orig.kwargs + else: + first_arg_for_copy: Node | None = cur_node_copy + new_args = (first_arg_for_copy, *cur_node_orig.args[1:]) + new_kwargs = cur_node_orig.kwargs + # make a copy of cur_node_orig + with model.graph.inserting_after(insertion_point): + cur_node_copy = model.graph.create_node( + cur_node_orig.op, + cur_node_orig.target, + new_args, + new_kwargs, + # cur_node_orig.name, # TODO(future PR): set name explicitly + ) + if first_node_copy is None: + first_node_copy = cur_node_copy + # since now only linear subgraphs are supported, all nodes + # except the last one must have only one user + if cur_node_orig != last_node: + if len(cur_node_orig.users.keys()) != 1: + raise AssertionError( + f"Expected exactly 1, but got {len(cur_node_orig.users)}" + ) + cur_node_orig = next(iter(cur_node_orig.users.keys())) + if cur_node_orig.name.startswith(SHADOW_NODE_NAME_PREFIX): + raise 
AssertionError( + "cur_node_orig should not start with SHADOW_NODE_NAME_PREFIX" + ) + insertion_point = cur_node_copy + + # add a comparison logger after last_node's copy + subgraph_candidate_idx = 1 + logger_mod_orig = _get_logger_for_subgraph( + model, + first_node, + last_node, + cur_subgraph_idx, + subgraph_candidate_idx, + qconfig_str, + OutputComparisonLogger, + fqn, + ) + attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx) + if hasattr(model, attr_name): + raise AssertionError( + f"Unexpected attribute '{attr_name}' found in {model}" + ) + setattr(model, attr_name, logger_mod_orig) + with model.graph.inserting_after(insertion_point): + logger = model.graph.call_module( + attr_name, args=(cur_node_copy, last_node), kwargs={} + ) + + # save the final node so we can use it in step 2 + orig_first_node_to_shadow_in_node[first_node] = first_node_copy + orig_first_node_to_shadow_out_node[first_node] = cur_node_copy + + cur_subgraph_idx += 1 + + model.recompile() + + # Now, we go from + # + # x0 -> op0_0 -> x1_0 -> log -> x1 -> op1_0 -> ... + # \ \ \ + # -> op0_1 -> x1_1 -> clog -> op1_1 -> ... + # + # to + # + # x0 -> op0_0 -> x1_0 -> log --> x1_0 -> op1_0 -> ... + # \ \ + # -> op0_1 -> x1_1 -> clog -> x1_1 -> op1_1 -> ... 
+ # + # sample values of key internal variables for the example above: + # + # orig_first_node_to_shadow_in_node = {op0_0: op0_1, op1_0: op1_1} + # orig_first_node_to_shadow_out_node = {op0_0: op0_1, op1_0: op1_1} + # + # note: for subgraphs with more than one node, in_node will be different + # compared to out_node + + nodes_to_skip = set() + for n in orig_nodes: + if n.op in ("placeholder", "get_attr", "output") or n in nodes_to_skip: + continue + + maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup) + if maybe_subgraph is not None: + first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1] + nodes_to_skip.update(maybe_subgraph) + else: + first_node, last_node = n, n + + def maybe_remap_node_to_shadow(node): + """ + If unshadowed `node` has a shadow version, return that. If not, + return `node`. + """ + if not isinstance(node, Node): + # handle scalars + return node + + if node.op in ("placeholder", "get_attr"): + return node + + # Find the shadowed version of this arg from the previous + # subgraph. For this, we need to: + # 1. navigate to the first node of the previous subgraph + # 2. get the output of the shadow wrapper which has (1) as an input + + # For now, assume the arg is in matched subgraphs. In the + # future we may have to handle the case where this is not true. 
+ prev_subgraph = _get_subgraph_containing_node(node, subgraphs_dedup) + if prev_subgraph is None: + prev_subgraph = [node] + prev_first_node = prev_subgraph[0] + prev_shadow_output = orig_first_node_to_shadow_out_node[prev_first_node] + return prev_shadow_output + + cur_shadow_input = orig_first_node_to_shadow_in_node[first_node] + if cur_shadow_input is None: + raise AssertionError("Expected cur_shadow_input to be non-None") + cur_shadow_input.args = tree_map( + maybe_remap_node_to_shadow, cur_shadow_input.args + ) + cur_shadow_input.kwargs = tree_map( + maybe_remap_node_to_shadow, cur_shadow_input.kwargs + ) + + model.recompile() + + +def _get_weight_info_from_shadow_wrapper(shadow_wrapper: torch.nn.Module): + # input: shadow wrapper module + # output if shadow wrapper module has a weighted op: + # (quantize_fn, (quantize_fn_args)) + # output if shadow wrapper module doesn't have a weighted op: + # None + + # For now, assume that the weight is the second input + # to the shadow module. If that changes, we can fix it later. 
+ placeholders_seen = 0 + for shadow_n in shadow_wrapper.graph.nodes: # type: ignore[union-attr] + if shadow_n.op != "placeholder": + continue + + placeholders_seen += 1 + if placeholders_seen != 2: + continue + + # the subgraph looks like + # + # _input_scale_1 = self._input_scale_1 + # _input_zero_point_1 = self._input_zero_point_1 + # quantize_per_channel = torch.quantize_per_channel( + # w2_0, _input_scale_1, _input_zero_point_1, + # 0, torch.qint8) + # + # we have `w2_0`, and are navigating this subgraph + # to get `_input_scale_1` and `_input_zero_point_1` + + if len(shadow_n.users) != 1: + raise AssertionError(f"Expected exactly 1, got {len(shadow_n.users)}") + quant_node = next(iter(shadow_n.users.keys())) + new_args: Any = None + if quant_node.target is torch.quantize_per_channel: + _weight, scale_node, zp_node, axis, dtype = quant_node.args + scale_val = getattr_from_fqn(shadow_wrapper, scale_node.target) + zp_val = getattr_from_fqn(shadow_wrapper, zp_node.target) + new_args = (scale_val, zp_val, axis, dtype) + else: + if quant_node.target != torch.quantize_per_tensor: + raise AssertionError( + f"Expected torch.quantize_per_tensor, but got {quant_node.target}" + ) + _weight, scale_node, zp_node, dtype = quant_node.args + scale_val = getattr_from_fqn(shadow_wrapper, scale_node.target) + zp_val = getattr_from_fqn(shadow_wrapper, zp_node.target) + new_args = (scale_val, zp_val, dtype) + return (quant_node.target, new_args) + + return None + + +def extract_weight_comparison(m: GraphModule) -> NSResultsType: + # example graph: + # + # w1 = self.w1 + # b1 = self.b1 + # linear = torch._C._nn.linear(x, w1, b1) + # shadow_0_0 = self.shadow_0_0(linear) + # shadow_wrapper_0_1 = self.shadow_wrapper_0_1(x, w1, b1) + # shadow_0_1 = self.shadow_0_1(shadow_wrapper_0_1, linear) + # + # algorithm: + # 1. for each call_function node matching our allowlist: + # 2. 
if corresponding shadow wrapper exists, extract the weight pair + # + # Note: this is not super robust, but that's ok because this is + # just for legacy customers who depend on the previous two-model version + # of this API. TBD if we need to make this robust. + # Note: modules are not supported, since existing customers only + # use functions. + + # TODO(future PR): move this to config + weighted_ops = { + torch.nn.functional.linear, + } + + results: NSResultsType = {"model": {NSSingleResultValuesType.WEIGHT.value: {}}} + + for n in m.graph.nodes: # type: ignore[union-attr] + if not (n.op == "call_function" and n.target in weighted_ops): + continue + + # Check if we have a corresponding shadow wrapper + # TODO(future PR, if needed): support kwargs + # TODO(future PR, if needed): support multiple shadow users + first_arg = n.args[0] + shadow_wrapper_node = None + for user in first_arg.users: + # TODO(before land): fix string match + if user.op == "call_module" and user.target.startswith("shadow_wrapper"): + shadow_wrapper_node = user + break + + if shadow_wrapper_node is None: + continue + + shadow_wrapper = getattr_from_fqn(m, shadow_wrapper_node.target) # type: ignore[arg-type] + weight_info = _get_weight_info_from_shadow_wrapper(shadow_wrapper) + if weight_info is None: + continue + + # get weight + w_node = n.args[1] + w_obj = getattr_from_fqn(m, w_node.target).detach() + + # get a quantized version of weight + quant_fn, quant_fn_args_except_first = weight_info + new_args = (w_obj, *quant_fn_args_except_first) + w_obj_q = quant_fn(*new_args) + + # add a comparison + ref_node_name = n.name + prev_node_name = n.name + ref_node_type = get_target_type_str(n, m) + prev_node_type = ref_node_type + fqn = None + if hasattr(m, "_node_name_to_scope"): + fqn = m._node_name_to_scope[n.name][0] # type: ignore[index] + comparison = torch.ao.ns.fx.utils.compute_sqnr(w_obj, w_obj_q) + result_fp32 = { + "res_type": NSSingleResultValuesType.WEIGHT.value, + "values": [w_obj], + 
# TODO(future PR): redesign this to make it easier to consume outputs
def group_results_by_subgraph(results: NSResultsType) -> Any:
    """
    Regroup flat n-shadows results by subgraph and candidate index.

    Input::

        {
            'model': {
                'node_output': {
                    'subgraph_0_0': [
                        {
                            'values': [torch.tensor(...), ...],
                            'ref_node_name': ...,
                            'ref_node_target_type': ...,
                            'qconfig_str': ...,
                            'comparisons': [],
                            'comparison_fn_name': '',
                            'fqn': '...',
                        },
                    ],
                    'subgraph_0_1': [{...}],
                    ...
                },
            },
        }

    Output::

        {
            'subgraph_0': {
                '0': {
                    'ref_node_name': '...',
                    'ref_node_target_type': ...,
                    'fqn': '...',
                    'values': [torch.tensor(...), ...],
                    'qconfig_str': ...,
                    'comparisons': [torch.tensor(...), ...],
                    'comparison_fn_name': '...',
                },
                '1': {...},
            },
        }

    Only the first results type found under ``results['model']`` is read, and
    only the first entry of each candidate's result list is kept.

    Raises:
        ValueError: if a result name does not split into exactly three
            ``_``-separated parts (``subgraph_<m>_<n>``).
    """
    copied_keys = (
        "ref_node_name",
        "ref_node_target_type",
        "fqn",
        "values",
        "qconfig_str",
        "comparisons",
        "comparison_fn_name",
    )
    subgraph_name_to_subgraph_results: Any = collections.defaultdict(dict)

    # node_output or weight
    key_to_use = next(iter(results["model"].keys()))

    for subgraph_name_with_idx, subgraph_candidate_results in results["model"][
        key_to_use
    ].items():
        # convert from `subgraph_m_n` to `subgraph_m` and `n`
        subgraph_str, subgraph_idx, subgraph_candidate_idx = (
            subgraph_name_with_idx.split("_")
        )
        subgraph_name = f"{subgraph_str}_{subgraph_idx}"

        first_result = subgraph_candidate_results[0]
        subgraph_name_to_subgraph_results[subgraph_name][subgraph_candidate_idx] = {
            key: first_result[key] for key in copied_keys
        }

    return dict(subgraph_name_to_subgraph_results)


# TODO(future PR): redesign this to make it easier to consume outputs
def create_results_comparison(
    results_grouped,
) -> Any:
    """
    Summarize every non-baseline candidate of every subgraph.

    Input: the output of ``group_results_by_subgraph``, i.e.::

        {
            'subgraph_0': {
                '0': {'ref_node_name': ..., 'comparisons': [], ...},
                '1': {'qconfig_str': ..., 'comparisons': [tensor, ...],
                      'comparison_fn_name': 'sqnr', ...},
            },
        }

    Output::

        {
            'subgraph_0': {
                'ref_node_name': '...',
                'ref_node_target_type': '...',
                'fqn': '...',
                'candidates': {
                    '1': {
                        'qconfig_str': ...,
                        'comparison_fn_name': 'sqnr',
                        'cmp_raw': <stacked comparisons tensor>,
                        'cmp_mean': <mean of cmp_raw>,
                    },
                    ...,
                },
            },
        }

    The reference fields are taken from candidate ``'0'`` (the baseline),
    which must therefore be present in every subgraph.
    """
    results_comparison = {}

    for subgraph_name, subgraph_results in results_grouped.items():
        candidates = {}
        for subgraph_inner_name, subgraph_inner_result in subgraph_results.items():
            # skip comparing baseline to baseline
            if subgraph_inner_name == "0":
                continue

            # we expect the comparisons to be precalculated from
            # calibration, so we just fetch them here
            cmp_raw_tensor = torch.stack(subgraph_inner_result["comparisons"])

            candidates[subgraph_inner_name] = {
                "qconfig_str": subgraph_inner_result["qconfig_str"],
                "comparison_fn_name": subgraph_inner_result["comparison_fn_name"],
                "cmp_raw": cmp_raw_tensor,
                "cmp_mean": torch.mean(cmp_raw_tensor),
            }

        baseline = subgraph_results["0"]
        results_comparison[subgraph_name] = {
            "ref_node_name": baseline["ref_node_name"],
            "ref_node_target_type": baseline["ref_node_target_type"],
            "fqn": baseline["fqn"],
            "candidates": candidates,
        }

    return results_comparison


# TODO(future PR): redesign this to make it easier to consume outputs
def print_n_shadows_summary(
    results_comparison,
) -> None:
    """
    Print one table row per subgraph with the mean comparison of each candidate.

    Input: the output of ``create_results_comparison``, i.e.::

        {
            'subgraph_0': {
                'ref_node_name': 'linear1',
                'ref_node_target_type': '...',
                'fqn': '...',
                'candidates': {
                    '1': {..., 'cmp_mean': 50.0},
                    ...,
                },
            },
        }

    Prints::

        node_name | node_type | fqn | 0    | 1    | ...
        linear1   | ...       | ... | 45.0 | 50.0 | ...

    If ``tabulate`` cannot be imported, an installation hint is printed
    instead and nothing else happens.
    """

    try:
        from tabulate import tabulate
    except ImportError:
        print(
            "`print_tabular` relies on the library `tabulate`, "
            "which could not be found on this machine. Run `pip "
            "install tabulate` to install the library."
        )
        return

    # the three leading columns of every row are fixed; candidate means follow
    fixed_headers = ["node_name", "node_type", "fqn"]

    results = []
    for subgraph_data in results_comparison.values():
        mean_all_candidates = [
            candidate["cmp_mean"] for candidate in subgraph_data["candidates"].values()
        ]
        results.append(
            [
                subgraph_data["ref_node_name"],
                subgraph_data["ref_node_target_type"],
                subgraph_data["fqn"],
                *mean_all_candidates,
            ]
        )

    # One header per candidate column. The count comes from the widest row;
    # the previous code measured the node-type string instead, which produced
    # an unrelated number of headers.
    max_candidate_count = max(
        (len(data_row) - len(fixed_headers) for data_row in results), default=0
    )
    candidate_idx_headers = [str(x) for x in range(max_candidate_count)]

    headers = [*fixed_headers, *candidate_idx_headers]
    print(tabulate(results, headers=headers))
Note: this may differ from prev_node_name if we are logging inputs +# 'ref_node_name': 'linear1', +# # index of this node within the arg of the input/output node +# # for example, in cat([x1, x2, x3], dim=0), x2 would have index_within_arg == 1 +# 'index_within_arg': 0, +# # index of this node within the args of the input/output node +# # for example, in add(x1, x2), x2 would have index_of_arg == 1 +# 'index_of_arg': 0, +# # precomputed comparisons of logger values to reference values +# 'comparisons': [torch.tensor(...), ...] +# # name of function used for precomputed comparisons +# 'comparison_fn_name': 'sqnr', +# # string representation of qconfig responsible for creating this logger +# 'qconfig_str': 'QConfig(...)', +# } +NSSingleResultType = dict[str, Any] + +# { +# 'layer_name_1': { # subgraph name +# 'node_output': { # results type (node_output, node_input, weight) +# 'model_name_a': # model name +# [NSSingleResultType, ...], # results, ordered by index_within_arg +# 'model_name_b': +# [NSSingleResultType, ...], +# }, +# }, +# } +# +NSResultsType = dict[str, dict[str, dict[str, list[NSSingleResultType]]]] + +# Defines the underlying target type of a node, for example: +# `F.conv1d` for a `call_function` conv node +# `nn.Conv1d` for a `call_module` node calling the forward of a `nn.Conv1d` module +# `'sigmoid'` for a `call_method` node calling `x.sigmoid()` +NSNodeTargetType = Union[Callable, str] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/pattern_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/pattern_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d10fdd39da9080144d3f6ef577d3ca5aca313538 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/pattern_utils.py @@ -0,0 +1,214 @@ +from collections.abc import Callable +from typing import Any, Union + +import torch +import torch.nn as nn +import torch.nn.functional 
toq = torch.ops.quantized


def get_type_a_related_to_b(
    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]],
) -> set[tuple[NSNodeTargetType, NSNodeTargetType]]:
    """
    Expand each set of related ops into all ordered pairs of its members.

    Every member is paired with every member of the same set, itself
    included, so the result holds both ``(a, b)`` and ``(b, a)``.
    """
    # TODO(future PR): allow customizations
    # TODO(future PR): reuse existing quantization mappings
    # TODO(future PR): add the rest of modules and ops here
    related_pairs: set[tuple[NSNodeTargetType, NSNodeTargetType]] = {
        (op_a, op_b)
        for related_ops in base_name_to_sets_of_related_ops.values()
        for op_a in related_ops
        for op_b in related_ops
    }
    return related_pairs


NSFusionElType = Union[
    Callable,  # call_function or call_module type, example: F.linear or nn.Conv2d
    str,  # call_method name, example: "dequantize"
    tuple[
        str, Any
    ],  # call_method name and first argument, example: ("to", torch.float16)
]
NSFusionType = Union[
    tuple[NSFusionElType, NSFusionElType],
    tuple[NSFusionElType, NSFusionElType, NSFusionElType, NSFusionElType],
]


def get_reversed_fusions() -> list[tuple[NSFusionType, int]]:
    """
    Build the list of candidate fusion patterns, each in reverse op order.

    The order is reversed to match how fusion patterns are defined in the
    quantization code. Each entry has the form::

        ((fusion_op_0, fusion_op_1, ...), base_op_idx)

    ``base_op_idx`` names the op used to match other related ops. It is
    counted in regular (non-reversed) order: 0 is the first op, 1 the
    second, and so on.
    """
    fusions: list[tuple[NSFusionType, int]] = []
    wrappers = (ObserverBase, FakeQuantizeBase)
    default_base_op_idx = 0

    # Possible syntaxes:
    # * single op: torch.nn.Conv2d
    # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d)
    # For fusions, we only care about patterns composed of multiple ops.
    # TODO(future PR): allow customizations from default patterns.
    for pattern in _get_pattern_to_quantize_handlers(get_native_backend_config()):
        # TODO: this is a temporary hack to flatten the patterns from quantization so
        # that it works with the ns matcher function, maybe we should use `_is_match`
        # in torch.ao.quantization.fx.match_utils to match the patterns
        is_nested_pair = (
            isinstance(pattern, tuple)
            and len(pattern) == 2
            and isinstance(pattern[1], tuple)
            and len(pattern[1]) == 2
        )
        if is_nested_pair:
            # (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d)) -> (nn.ReLU, nn.BatchNorm2d, nn.Conv2d)
            outer, (middle, inner) = pattern
            pattern = (outer, middle, inner)

        is_multi_op = isinstance(pattern, tuple)

        # Single-op patterns are matched without caring about fusions, so
        # only tuples are recorded as-is.
        if is_multi_op:
            fusions.append((pattern, default_base_op_idx))  # type: ignore[arg-type]

        # Every pattern additionally gets a variant ending in an observer
        # and one ending in a fake quant.
        # TODO(future PR): if needed, implement matching for a node
        #   having multiple output observers.
        for wrapper_cls in wrappers:
            wrapped = (wrapper_cls, *pattern) if is_multi_op else (wrapper_cls, pattern)
            fusions.append((wrapped, default_base_op_idx))  # type: ignore[arg-type]

    # After this point, fusions contains values such as
    # [..., ((torch.nn.Relu, torch.nn.Conv2d), 0), ...]

    # Patterns for matching fp16 emulation are not specified in the quantization
    # fusion mappings. For now, define them here.
    fp16_em_base_op_idx = 1
    extra_patterns = [
        # linear-relu fp16 emulation:
        # fp16_to_fp32 -> linear -> relu -> fp32_to_fp16
        (
            (("to", torch.float16), F.relu, F.linear, "dequantize"),
            fp16_em_base_op_idx,
        ),
        # Conv-BN fusion (this happens outside of quantization patterns,
        # which is why it is defined separately here).
        ((nn.BatchNorm1d, nn.Conv1d), default_base_op_idx),
        ((nn.BatchNorm2d, nn.Conv2d), default_base_op_idx),
        ((nn.BatchNorm3d, nn.Conv3d), default_base_op_idx),
        ((nn.ReLU, nn.BatchNorm1d, nn.Conv1d), default_base_op_idx),
        ((nn.ReLU, nn.BatchNorm2d, nn.Conv2d), default_base_op_idx),
        ((nn.ReLU, nn.BatchNorm3d, nn.Conv3d), default_base_op_idx),
    ]
    for ops, base_op_idx in extra_patterns:
        fusions.append((ops, base_op_idx))  # type: ignore[arg-type]
        fusions.extend(
            ((wrapper_cls, *ops), base_op_idx)  # type: ignore[misc]
            for wrapper_cls in wrappers
        )

    return fusions


def end_node_matches_reversed_fusion(
    end_node: Node,
    reversed_fusion: NSFusionType,
    gm: GraphModule,
    seen_nodes: set[Node],
) -> bool:
    """
    Return True if the chain of nodes ending at `end_node` matches the fusion.

    The walk starts at `end_node` and follows the first positional argument
    of each node, comparing each visited node with the next element of
    `reversed_fusion`. It fails as soon as a node was already claimed
    (`seen_nodes`), does not match its element, or has no `Node` as first
    positional argument to continue from.
    """
    node = end_node
    for fusion_el in reversed_fusion:
        # each node can only belong to one matched pattern
        if node in seen_nodes:
            return False

        op = node.op
        if op == "call_function":
            # strings name methods and types name modules; neither is a function
            if isinstance(fusion_el, (str, type)):
                return False
            if node.target != fusion_el:
                return False

        elif op == "call_module":
            if not isinstance(fusion_el, type):
                return False
            if not isinstance(node.target, str):
                raise AssertionError(f"Expected str, got {type(node.target)}")
            if not isinstance(getattr_from_fqn(gm, node.target), fusion_el):
                return False

        elif op == "call_method":
            if isinstance(fusion_el, str):
                # method matched by name only
                if node.target != fusion_el:
                    return False
            elif isinstance(fusion_el, tuple) and len(fusion_el) == 2:
                # method matched by name and by its first explicit argument
                method_name, expected_arg = fusion_el
                if node.target != method_name:
                    return False
                if len(node.args) < 2:
                    return False
                if node.args[1] != expected_arg:
                    return False
            else:
                return False

        else:
            return False

        # step to the producer of the first positional argument
        if len(node.args) > 0 and isinstance(node.args[0], Node):
            node = node.args[0]
        else:
            return False

    return True
__all__ = ["QConfigMultiMapping"]

# maps each QConfigMapping storage attribute to the setter that fills it
_QCONFIG_STYLE_TO_METHOD: dict[str, str] = {
    "global_qconfig": "set_global",
    "object_type_qconfigs": "set_object_type",
    "module_name_regex_qconfigs": "set_module_name_regex",
    "module_name_qconfigs": "set_module_name",
    "module_name_object_type_order_qconfigs": "set_module_name_object_type_order",
}


def _remove_duplicates_and_none(qconfig_list: list[QConfigAny]) -> None:
    """
    Remove, in place, every ``None`` and every repeated qconfig.

    A qconfig counts as repeated when ``qconfig_equals`` reports it equal to
    an entry earlier in the list; the first occurrence is kept.
    """
    to_remove = []
    for index, cur_qconfig in enumerate(qconfig_list):
        if cur_qconfig is None:
            to_remove.append(index)
            # keep scanning: a `break` here would leave every later None and
            # duplicate in the list
            continue
        for checked_qconfig in qconfig_list[:index]:
            if torch.ao.quantization.qconfig_equals(cur_qconfig, checked_qconfig):
                to_remove.append(index)
                break
    # pop from the back so the remaining indices stay valid
    for index in to_remove[::-1]:
        qconfig_list.pop(index)


class QConfigMultiMapping:
    """
    This class, used with the prepare_n_shadows_model API, stores a list of :class:`torch.ao.quantization.QConfigMapping`s
    so that multiple QConfigs can be specified for each QConfig matching style.

    The user can specify QConfigs using the following methods (in increasing match priority):

        ``set_global`` : sets the global (default) QConfigs

        ``set_object_type`` : sets the QConfigs for a given module type, function, or method name

        ``set_module_name_regex`` : sets the QConfigs for modules matching the given regex string

        ``set_module_name`` : sets the QConfigs for modules matching the given module name

        ``set_module_name_object_type_order`` : sets the QConfigs for modules matching a combination
        of the given module name, object type, and the index at which the module appears

    Note: Usage of set methods is the same as in QConfigMapping except with a passed in list of QConfigs rather than a
    single QConfig.

    Note: the set methods modify the list they are given (duplicates and
    ``None`` are removed, then ``None`` padding may be appended).

    Example usage::

        qconfig_mapping = QConfigMultiMapping()
            .set_global([qconfig1, qconfig2])
            .set_object_type(torch.nn.Linear, [qconfig2, qconfig3])
            .set_object_type(torch.nn.ReLU, [qconfig1])
            .set_module_name_regex("foo.*bar.*conv[0-9]+", [qconfig2])
            .set_module_name_regex("foo.*", [qconfig1, qconfig2, qconfig3])
            .set_module_name("module1", [None])
            .set_module_name("module2", [qconfig2])
            .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, [qconfig3])

    """

    def __init__(self) -> None:
        # initialize this with 1 QConfigMapping to avoid corner cases
        self.qconfig_mappings_list: list[QConfigMapping] = [QConfigMapping()]

    def _handle_list_size_mismatch(
        self, qconfig_list: list[QConfigAny], style: str
    ) -> None:
        """
        Make `qconfig_list` and `self.qconfig_mappings_list` the same length.

        Either new QConfigMappings are appended, or `qconfig_list` is padded
        in place with ``None``. `style` is accepted but not used here.
        """
        # this method handles cases where the size of qconfig_list does not match
        # the size of qconfig_mappings_list.
        # Issue: Consider a user inserting global_qconfig A and B first, then inserting
        # qconfig C as an object_type_qconfig for conv ops. If we internally store
        # 1 QConfigMapping with A and C and another with just B, then the
        # second QConfigMapping will match B to conv ops (which is not wanted), since B is global.

        # we avoid this by maintaining the invariant that if any QConfigMapping
        # has a qconfig style+key with a qconfig in it, all QConfigMappings must
        # have either a qconfig or None for that same style+key. In the above
        # example, a None qconfig would prevent the unwanted match in the
        # second QConfigMapping

        if len(qconfig_list) > len(self.qconfig_mappings_list):
            # Case: we have more qconfigs (in qconfig_list) than QConfigMappings

            # Add new QConfigMappings (initialized so we maintain the `invariant`)

            new_qconfig_mapping = QConfigMapping()
            # searches other QConfigMappings for qconfig style+keys
            # that need to be inserted as `None` into the new QConfigMapping
            for qconfig_mapping in self.qconfig_mappings_list:
                # global_qconfig has None by default
                for check_style in _QCONFIG_STYLE_ORDER[1:]:
                    qconfigs_dict = getattr(qconfig_mapping, check_style)
                    target_qconfigs_dict = getattr(new_qconfig_mapping, check_style)
                    for key in qconfigs_dict:
                        target_qconfigs_dict[key] = None
                # by the invariant above every mapping carries the same
                # style+keys, so inspecting the first one is enough
                break

            # insert copies of this new QConfigMapping until all entries
            # in qconfig_list can fit among the QConfigMappings
            while len(qconfig_list) > len(self.qconfig_mappings_list):
                self.qconfig_mappings_list.append(copy.deepcopy(new_qconfig_mapping))
        else:
            # Case: we have fewer qconfigs in qconfig_list than QConfigMappings

            # pad qconfig_list with `None` until length is same
            while len(qconfig_list) < len(self.qconfig_mappings_list):
                qconfig_list.append(None)

    # this function applies the insertion method across each QConfigMapping
    def _insert_qconfig_list(
        self,
        style: str,
        args: list[str | int | Callable],
        qconfig_list: list[QConfigAny],
    ) -> None:
        """
        Insert the i-th qconfig of `qconfig_list` into the i-th QConfigMapping.

        `style` selects the QConfigMapping setter through
        `_QCONFIG_STYLE_TO_METHOD`; `args` are the setter's leading
        positional arguments (the key), passed before the qconfig.
        """
        # we remove duplicates and None to make the ordering of qconfigs
        # deterministic upon insertion.
        _remove_duplicates_and_none(qconfig_list)

        self._handle_list_size_mismatch(qconfig_list, style)
        method_name = _QCONFIG_STYLE_TO_METHOD[style]
        for qconfig_mapping, qconfig in zip(self.qconfig_mappings_list, qconfig_list):
            # uses QConfigMapping set method to insert qconfig
            set_method = getattr(qconfig_mapping, method_name)
            set_method(*args, qconfig)

    def set_global(self, global_qconfig_list: list[QConfigAny]) -> QConfigMultiMapping:
        """
        Set global QConfigs
        see :func:`~torch.ao.quantization.QConfigMapping.set_global()` for more info
        """
        self._insert_qconfig_list("global_qconfig", [], global_qconfig_list)
        return self

    def set_object_type(
        self, object_type: Callable | str, qconfig_list: list[QConfigAny]
    ) -> QConfigMultiMapping:
        """
        Set object type QConfigs
        see :func:`~torch.ao.quantization.QConfigMapping.set_object_type()` for more info
        """
        self._insert_qconfig_list("object_type_qconfigs", [object_type], qconfig_list)
        return self

    def set_module_name_regex(
        self, module_name_regex: str, qconfig_list: list[QConfigAny]
    ) -> QConfigMultiMapping:
        """
        Set module_name_regex QConfigs
        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_regex()` for more info
        """
        self._insert_qconfig_list(
            "module_name_regex_qconfigs", [module_name_regex], qconfig_list
        )
        return self

    def set_module_name(
        self, module_name: str, qconfig_list: list[QConfigAny]
    ) -> QConfigMultiMapping:
        """
        Set module_name QConfigs
        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name()` for more info
        """
        self._insert_qconfig_list("module_name_qconfigs", [module_name], qconfig_list)
        return self

    def set_module_name_object_type_order(
        self,
        module_name: str,
        object_type: Callable,
        index: int,
        qconfig_list: list[QConfigAny],
    ) -> QConfigMultiMapping:
        """
        Set module_name_object_type_order QConfigs
        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_object_type_order()` for more info
        """
        self._insert_qconfig_list(
            "module_name_object_type_order_qconfigs",
            [module_name, object_type, index],
            qconfig_list,
        )
        return self

    def __repr__(self):
        return (
            self.__class__.__name__
            + " ["
            + "".join(
                f"\n{qconfig_mapping!r},"
                for qconfig_mapping in self.qconfig_mappings_list
            )
            + "\n]"
        )

    @classmethod
    def from_list_qconfig_mapping(
        cls, qconfig_mapping_list: list[QConfigMapping]
    ) -> QConfigMultiMapping:
        """
        Creates a QConfigMultiMapping from a list of QConfigMappings
        """
        new_qconfig_multi_mapping = cls()

        new_qconfig_multi_mapping.qconfig_mappings_list = copy.deepcopy(
            qconfig_mapping_list
        )

        # we need to avoid the issue described in _handle_list_size_mismatch,
        # so we reinsert all the qconfigs using the QConfigMultiMapping
        # set methods

        # go through all qconfig styles
        # note: global can be ignored since it is None by default
        for style in _QCONFIG_STYLE_ORDER[1:]:
            # gather all key+qconfigs for current style
            # into qconfig_dict_list
            qconfig_dict_list: dict[Any, list[QConfigAny]] = {}
            for qconfig_mapping in qconfig_mapping_list:
                qconfig_dict = getattr(qconfig_mapping, style)
                for key, qconfig in qconfig_dict.items():
                    qconfig_dict_list.setdefault(key, []).append(qconfig)

            # reinsert all gathered key+qconfigs
            set_method_name = _QCONFIG_STYLE_TO_METHOD[style]
            set_method = getattr(new_qconfig_multi_mapping, set_method_name)
            for key, qconfig_list in qconfig_dict_list.items():
                # tuple keys hold several setter arguments and are unpacked
                if isinstance(key, tuple):
                    set_method(*key, qconfig_list)
                else:
                    set_method(key, qconfig_list)

        return new_qconfig_multi_mapping
toq = torch.ops.quantized


# TODO(future PR): consider deleting this enum and using the torch types
# directly.  This might be tricky because it is not a one to one mapping.
class NodeInputOrOutputType(enum.Enum):
    FP32 = enum.auto()  # torch.float
    INT8 = enum.auto()  # torch.qint8 or torch.quint8
    FP16 = enum.auto()  # torch.float16
    UNKNOWN = enum.auto()  # we cannot determine input/output dtype
    # TODO(future PR): while these functions can support multiple dtypes,
    #   for the purposes of numerical debugging we want to get the actual
    #   dtype used in the model. We will likely need some kind of dtype
    #   propagation to estimate this.
    FP32_OR_INT8 = enum.auto()  # either torch.float or torch.quint8 or torch.qint8
    # TODO(future PRs): dynamic quant, fake quant, etc


def get_node_first_input_and_output_type(
    node: Node,
    gm: GraphModule,
    logger_cls: Callable,
    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]],
) -> tuple[NodeInputOrOutputType, NodeInputOrOutputType]:
    """
    Classify the dtype of `node`'s first input and of its output.

    The node's target is looked up in the sets of
    `node_type_to_io_type_map`. Targets that work on several dtypes
    (loggers, observers, fake quants, and the ``*_fp32_or_int8`` sets)
    take the output type of the node feeding their first input, found
    recursively. Anything unrecognized yields ``(UNKNOWN, UNKNOWN)``.

    Raises:
        KeyError: if any of the eight expected map keys is missing.
        AssertionError: if a first input that must be inspected is not a
            `Node`, if a ``call_module`` target is not a string, or if a
            ``to`` call targets a dtype other than ``torch.float16``.
    """
    # TODO(future PR): clean this up
    # all eight sets are read up front, whatever the node's op is
    io_map = node_type_to_io_type_map
    funs_fp32 = io_map["funs_io_type_fp32"]
    funs_fp16 = io_map["funs_io_type_fp16"]
    funs_int8 = io_map["funs_io_type_int8"]
    funs_fp32_or_int8 = io_map["funs_io_type_fp32_or_int8"]
    mods_fp32 = io_map["mods_io_type_fp32"]
    mods_int8 = io_map["mods_io_type_int8"]
    mods_fp32_or_int8 = io_map["mods_io_type_fp32_or_int8"]
    meths_fp32_or_int8 = io_map["meths_io_type_fp32_or_int8"]

    kinds = NodeInputOrOutputType
    unknown = (kinds.UNKNOWN, kinds.UNKNOWN)

    def output_type_of_first_input() -> NodeInputOrOutputType:
        # output type of the node producing this node's first input
        producer = get_normalized_nth_input(node, gm, 0)
        if not isinstance(producer, Node):
            raise AssertionError(f"Expected Node, got {type(producer)}")
        _producer_in, producer_out = get_node_first_input_and_output_type(
            producer, gm, logger_cls, node_type_to_io_type_map
        )
        return producer_out

    if node.op == "call_function":
        if node.target in funs_fp32:
            return (kinds.FP32, kinds.FP32)
        if node.target in funs_fp16:
            return (kinds.FP16, kinds.FP16)
        if node.target in funs_int8:
            return (kinds.INT8, kinds.INT8)
        if node.target in funs_fp32_or_int8:
            inherited = output_type_of_first_input()
            return (inherited, inherited)
        return unknown

    if node.op == "call_module":
        if node.op != "call_module":
            raise AssertionError(f"Expected call_module, got '{node.op}'")
        if not isinstance(node.target, str):
            raise AssertionError(f"Expected str, but got {type(node.target)}")

        mod = getattr_from_fqn(gm, node.target)

        def is_any_of(module_types) -> bool:
            return any(
                isinstance(mod, target_type)  # type: ignore[arg-type]
                for target_type in module_types
            )

        accepts_fp32_or_int8 = is_any_of(mods_fp32_or_int8)
        if (
            isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase))  # type: ignore[arg-type]
            or accepts_fp32_or_int8
        ):
            # A logger or observer's input and output type is the output
            # type of the preceding node.
            inherited = output_type_of_first_input()
            return (inherited, inherited)

        is_fp32_module = is_any_of(mods_fp32)
        is_int8_module = is_any_of(mods_int8)
        if is_fp32_module:
            return (kinds.FP32, kinds.FP32)
        if is_int8_module:
            return (kinds.INT8, kinds.INT8)
        return unknown

    if node.op == "call_method":
        if node.target == "dequantize":
            # Dequantize is a special node because it allows multiple input types.
            # So, we look up the output type of the previous node and return that
            # as the input type of this node instance.
            return (output_type_of_first_input(), kinds.FP32)

        if node.target == "to":
            # to is a special node because it allows multiple input types.
            # So, we look up the output type of the previous node and return that
            # as the input type of this node instance. We also look up the target
            # of to and return the correct output type.
            input_type = output_type_of_first_input()

            cur_node_dtype_target = get_normalized_nth_input(node, gm, 1)
            if cur_node_dtype_target is not torch.float16:
                raise AssertionError(
                    f"{cur_node_dtype_target} handling needs to be added"
                )
            return (input_type, kinds.FP16)

        if node.target in meths_fp32_or_int8:
            inherited = output_type_of_first_input()
            return (inherited, inherited)

        return unknown

    return unknown
Returns the qparams (scale, zero_point) of the first input to `node`, + if they can be inferred from the graph. + """ + prev_node = get_normalized_nth_input(node, gm, 0) + + if not isinstance(prev_node, Node): + return None + + MODS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["mods_io_type_fp32_or_int8"] + + def _get_scale_zp_from_function_args(node, gm, scale_arg_idx, zp_arg_idx): + scale_node = get_normalized_nth_input(node, gm, scale_arg_idx) + zp_node = get_normalized_nth_input(node, gm, zp_arg_idx) + if not isinstance(scale_node, Node): + raise AssertionError(f"Expected Node, got {type(scale_node)}") + if not isinstance(scale_node.target, str): + raise AssertionError(f"Expected str, got {type(scale_node.target)}") + if not isinstance(zp_node, Node): + raise AssertionError(f"Expected Node, got {type(zp_node)}") + if not isinstance(zp_node.target, str): + raise AssertionError(f"Expected str, got {type(zp_node.target)}") + scale_obj = getattr_from_fqn(gm, scale_node.target) + zp_obj = getattr_from_fqn(gm, zp_node.target) + return (scale_obj, zp_obj) + + if prev_node.op == "call_function": + # quantize - read the args directly + if prev_node.target is torch.quantize_per_tensor: + return _get_scale_zp_from_function_args(prev_node, gm, 1, 2) + elif prev_node.target in (toq.add, toq.add_relu, toq.mul, toq.mul_relu): + return _get_scale_zp_from_function_args(prev_node, gm, 2, 3) + + return None + # TODO(future PR): handle more functionals + # TODO(future PR): handle functional ops which inherit qparams from input + + elif prev_node.op == "call_module": + # get type of the module + if not isinstance(prev_node.target, str): + raise AssertionError(f"Expected str, got {type(prev_node.target)}") + module_obj = getattr_from_fqn(gm, prev_node.target) + if isinstance( + module_obj, + ( + nnq.Linear, + nnq.Conv1d, + nnq.Conv2d, + nniq.ConvReLU2d, + nnq.Conv3d, + nnq.BatchNorm2d, + nnq.BatchNorm3d, + nnq.ConvTranspose1d, + nnq.ConvTranspose2d, + nnq.ELU, + nnq.GroupNorm, + 
nnq.InstanceNorm1d, + nnq.InstanceNorm2d, + nnq.InstanceNorm3d, + nnq.LayerNorm, + nnq.Hardswish, + nnq.LeakyReLU, + nnq.ReLU6, + nniq.BNReLU2d, + nniq.BNReLU3d, + nniq.ConvReLU1d, + nniq.ConvReLU2d, + nniq.ConvReLU3d, + nniq.LinearReLU, + ), + ): + return (module_obj.scale, module_obj.zero_point) # type: ignore[return-value] + + is_known_fp32_or_int8_input_module = any( + isinstance(module_obj, target_type) # type: ignore[arg-type] + for target_type in MODS_IO_TYPE_FP32_OR_INT8 + ) + if is_known_fp32_or_int8_input_module: + return get_node_input_qparams(prev_node, gm, node_type_to_io_type_map) + + return None + + +def return_first_non_observer_node( + node: Node, + gm: GraphModule, +) -> Node: + """ + If node is not an observer, returns it. If node is an observer, + navigates up the graph and returns the first parent which is not an + observer. For example, + + graph: (node_non_obs), node = node_non_obs : returns node_non_obs + graph: (node_non_obs -> obs0), node = obs0 : returns node_non_obs + graph: (node_non_obs -> obs0 -> fq0), node = fq0 : returns node_non_obs + """ + if node.op == "call_module": + node_obj = getattr_from_fqn(gm, node.target) # type: ignore[arg-type] + if _is_activation_post_process(node_obj): + if len(node.args) != 1: + raise AssertionError( + f"Expected node.args to have length 1, got {len(node.args)}" + ) + if not isinstance(node.args[0], Node): + raise AssertionError(f"Expected Node, got {type(node.args[0])}") + node = node.args[0] + # code duplication intended, not worth refactoring + if not isinstance(node.target, str): + raise AssertionError(f"Expected str, got {type(node.target)}") + node_obj = getattr_from_fqn(gm, node.target) + if _is_activation_post_process(node_obj): + if len(node.args) != 1: + raise AssertionError( + f"Expected node.args to have length 1, got {len(node.args)}" + ) + if not isinstance(node.args[0], Node): + raise AssertionError(f"Expected Node, got {type(node.args[0])}") + node = node.args[0] + return node + + 
+def get_number_of_non_param_args( + node: Node, + gm: GraphModule, +) -> int: + """ + Assumes that all non-param args occur first. Returns the number of + non-param args expected for a node. For example, for + + F.linear(x, weight, bias) + + Returns 1, because x is a non-param arg and weight and bias are params. + For + + lstm_mod(x, hid) + + Returns 2, because both x and hid are non-param args. + """ + if node.op == "call_module": + node_obj = getattr_from_fqn(gm, node.target) # type: ignore[arg-type] + if isinstance(node_obj, nn.LSTM): + return 2 + + # default is 1 + return 1 + + +def get_arg_indices_of_inputs_to_log(node: Node) -> list[int]: + """ + Returns the indices of args of the node which we should attach + loggers to, if input logging is enabled. + + For example, + * for (x + y), returns [0, 1] + * for (1 + y), returns [1] + * for (x + 1), returns [0] + * for (linear(x, w, b)) returns [0] + * by default, returns [0] + """ + if len(node.args) == 0: + return [] + if node.op == "call_function" and ( + # TODO(future PR): use relationship map instead of hardcoding + node.target in (torch.add, torch.ops.quantized.add, operator.add) + or node.target in (torch.mul, torch.ops.quantized.mul, operator.mul) + ): + result = [i for i in range(2) if type(node.args[i]) is Node] + return result + return [0] + + +def get_target_type_str(node: Node, gm: GraphModule) -> str: + """ + Returns a string representation of the type of the function or module + pointed to by this node, or '' for other node types. 
+ """ + target_type = "" + if node.op in ("call_function", "call_method"): + target_type = torch.typename(node.target) + elif node.op == "call_module": + if not isinstance(node.target, str): + raise AssertionError(f"Expected str, got {type(node.target)}") + target_mod = getattr_from_fqn(gm, node.target) + target_type = torch.typename(target_mod) + return target_type + + +def rekey_logger_info_on_node_name_of_model( + results: NSResultsType, + model_name: str, +) -> NSResultsType: + """ + Rekeys the layer name of a results dictionary to use node names + from `model_name`. + + For example, transforms + + {'base_op_1_0': {'node_output': {'model_a': + [{'ref_node_name': 'linear1', ...}]}}} + + into + + {'linear1': {'node_output': {'model_a': + [{'ref_node_name': 'linear1', ...}]}}} + + Note: we cannot use these node names directly because they are not + guaranteed to be consistent across models. This is why we extract + the results first and rekey afterwards. + """ + new_results = {} + for old_layer_name, result_type_to_results in results.items(): + new_layer_name = None + for model_name_to_results in result_type_to_results.values(): + for cur_model_name, list_of_results in model_name_to_results.items(): + if cur_model_name == model_name: + if len(list_of_results) == 0: + raise AssertionError("Expected list_of_results to be not empty") + new_layer_name = list_of_results[0]["ref_node_name"] + else: + continue + if new_layer_name is not None: + new_results[new_layer_name] = result_type_to_results + else: + new_results[old_layer_name] = result_type_to_results + return new_results + + +def maybe_add_missing_fqns(results: NSResultsType) -> None: + """ + If `fqn` entries are filled in for one of the models in `results`, copies + them over to any models which do not have them filled out. + + A common use case benefitting from this is comparing a model prepared by + quantization to a quantized model. 
In this case, the model prepared by + quantization would have `fqn` entries, and the quantized model would not. + """ + + # Check in the first result to find any model with fqn entries defined. + model_name_with_fqns = None + for result_type_to_results in results.values(): + for model_name_to_results in result_type_to_results.values(): + for model_name, model_results in model_name_to_results.items(): + if len(model_results) > 0: + if model_results[0]["fqn"] is not None: + model_name_with_fqns = model_name + break + break + break + + if model_name_with_fqns: + for result_type_to_results in results.values(): + for model_name_to_results in result_type_to_results.values(): + ref_model_results = model_name_to_results[model_name_with_fqns] + for model_name, model_results in model_name_to_results.items(): + if model_name == model_name_with_fqns: + continue + + for i in range(len(model_results)): + fqn = ref_model_results[i]["fqn"] + model_results[i]["fqn"] = fqn + + +def maybe_dequantize_first_two_tensor_args_and_handle_tuples(f): + def inner(*args, **kwargs): + a0, a1, *a_other = args + + if (isinstance(a0, tuple) and isinstance(a1, tuple)) or ( + isinstance(a0, list) and isinstance(a1, list) + ): + results = [] + for el0, el1 in zip(a0, a1): + new_args = (el0, el1, *a_other) + results.append(inner(*new_args, **kwargs)) + return results + + elif isinstance(a0, torch.Tensor) and isinstance(a1, torch.Tensor): + if a0.is_quantized: + a0 = a0.dequantize() + if a1.is_quantized: + a1 = a1.dequantize() + + # for the purposes of this util, only handle floats + if a0.dtype != torch.float or a1.dtype != torch.float: + return None + + new_args = (a0, a1, *a_other) + return f(*new_args, **kwargs) + + return inner + + +@maybe_dequantize_first_two_tensor_args_and_handle_tuples +def compute_sqnr(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """ + Computes the SQNR between `x` and `y`. 
+ + Args: + x: Tensor or tuple of tensors + y: Tensor or tuple of tensors + + Return: + float or tuple of floats + """ + Ps = torch.norm(x) + Pn = torch.norm(x - y) + return 20 * torch.log10(Ps / Pn) + + +@maybe_dequantize_first_two_tensor_args_and_handle_tuples +def compute_normalized_l2_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """ + Computes the normalized L2 error between `x` and `y`. + + Args: + x: Tensor or tuple of tensors + y: Tensor or tuple of tensors + + Return: + float or tuple of floats + """ + # pyrefly: ignore [unsupported-operation] + return torch.sqrt(((x - y) ** 2).sum() / (x**2).sum()) + + +@maybe_dequantize_first_two_tensor_args_and_handle_tuples +def compute_cosine_similarity(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """ + Computes the cosine similarity between `x` and `y`. + + Args: + x: Tensor or tuple of tensors + y: Tensor or tuple of tensors + + Return: + float or tuple of floats + """ + # For convolutions, the shape of the quantized weight has one additional + # dimension compared to the shape of the fp32 weight. Match the shapes + # to enable cosine similarity comparison. + x = x.reshape(1, -1) + y = y.reshape(1, -1) + return torch.nn.functional.cosine_similarity(x, y) + + +def op_type_supports_shadowing(node: Node) -> bool: + if node.op == "call_function": + if node.target in ( + torch.add, + torch.mul, + operator.add, + operator.mul, + torch.cat, + torch.stack, + ): + # shadowing for ops with multiple tensor inputs is not implemented yet + return False + return True + + +def get_normalized_nth_input(node: Node, gm: GraphModule, idx: int) -> Node: + """ + Given a node, gets the n'th input to that node, normalizing + args and kwargs to the best of its ability. 
+ """ + try: + norm_args_and_kwargs = node.normalized_arguments( + gm, normalize_to_only_use_kwargs=True + ) + if norm_args_and_kwargs is not None: + norm_args, norm_kwargs = norm_args_and_kwargs + if len(norm_args) + len(norm_kwargs) <= idx: + raise AssertionError( + f"Index {idx} out of range: total = {len(norm_args) + len(norm_kwargs)}" + ) + if idx < len(norm_args): + return norm_args[idx] + else: + # note: in Python 3.7+ dicts are ordered + return list(norm_kwargs.values())[idx] + else: + if len(node.args) + len(node.kwargs) <= idx: + raise AssertionError( + f"Index {idx} out of range: total = {len(node.args) + len(node.kwargs)}" + ) + if idx < len(node.args): + return node.args[idx] # type: ignore[return-value] + else: + kwargs_idx = idx + len(node.args) + return list(node.kwargs.values())[kwargs_idx] # type: ignore[return-value] + except RuntimeError: + # this RuntimeError happens when node argument normalization + # requires typehints to proceed, such as for torch.add where + # either the first, second or both arguments could be tensors + if len(node.args) + len(node.kwargs) <= idx: + raise AssertionError( + f"Index {idx} out of range: total = {len(node.args) + len(node.kwargs)}" + ) from None + if idx < len(node.args): + return node.args[idx] # type: ignore[return-value] + else: + kwargs_idx = idx + len(node.args) + return list(node.kwargs.values())[kwargs_idx] # type: ignore[return-value] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/weight_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/weight_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6bff44215e46174856918883f35aac92b4491c25 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/weight_utils.py @@ -0,0 +1,302 @@ +from collections.abc import Callable + +import torch +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.intrinsic.qat as nniqat +import 
torch.ao.nn.intrinsic.quantized as nniq +import torch.ao.nn.qat as nnqat +import torch.ao.nn.quantized as nnq +import torch.ao.nn.quantized.dynamic as nnqd +import torch.nn as nn +import torch.nn.functional as F +from torch.fx import GraphModule +from torch.fx.graph import Node + +from .ns_types import NSSingleResultType, NSSingleResultValuesType +from .utils import get_target_type_str, getattr_from_fqn, return_first_non_observer_node + + +toq = torch.ops.quantized + + +def mod_weight_detach(mod: nn.Module) -> torch.Tensor: + return mod.weight.detach() # type: ignore[operator] + + +def mod_0_weight_detach(mod: nn.Module) -> torch.Tensor: + return mod[0].weight.detach() # type: ignore[index] + + +def mod_weight_bias_0(mod: nn.Module) -> torch.Tensor: + return mod._weight_bias()[0] # type: ignore[operator] + + +def get_lstm_weight(mod: nn.Module) -> list[torch.Tensor]: + res = [] + for idx, param_name in enumerate(mod._flat_weights_names): # type: ignore[arg-type] + if "weight_ih_l" in param_name or "weight_hh_l" in param_name: + param_value = mod._flat_weights[idx].detach() # type: ignore[index,union-attr] + res.append(param_value) + return res + + +def get_qlstm_weight(mod: nn.Module) -> list[torch.Tensor]: + res = [] + for weight_value in mod._all_weight_values: # type: ignore[union-attr] + res.append(weight_value.param.__getstate__()[0][4][0].__getstate__()[0][0]) + res.append(weight_value.param.__getstate__()[0][4][1].__getstate__()[0][0]) + return res + + +def get_conv_mod_weight(mod: nn.Module) -> torch.Tensor: + if isinstance(mod, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + return mod.weight.detach() + elif isinstance(mod, (nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d)): + return mod[0].weight.detach() # type: ignore[operator] + else: + return mod._weight_bias()[0] # type: ignore[operator] + + +def get_linear_mod_weight(mod: nn.Module) -> torch.Tensor: + if isinstance(mod, nn.Linear): + return mod.weight.detach() + elif isinstance(mod, nni.LinearReLU): + return 
mod[0].weight.detach() # type: ignore[operator] + else: + return mod._weight_bias()[0] # type: ignore[operator] + + +def get_lstm_mod_weights(mod: nn.Module) -> list[torch.Tensor]: + # TODO(future PR): make more generic, handle everything + if isinstance(mod, nn.LSTM): + res = [] + for idx, param_name in enumerate(mod._flat_weights_names): + if "weight_ih_l" in param_name or "weight_hh_l" in param_name: + param_value = mod._flat_weights[idx].detach() # type: ignore[index,union-attr] + res.append(param_value) + return res + else: + if not isinstance(mod, nnqd.LSTM): + raise AssertionError(f"type {type(mod)} not handled yet") + res = [] + for weight_value in mod._all_weight_values: + res.append( + weight_value.param.__getstate__()[0][4][0].__getstate__()[0][0] # type: ignore[index] + ) + res.append( + weight_value.param.__getstate__()[0][4][1].__getstate__()[0][0] # type: ignore[index] + ) + return res + + +def get_conv_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor: + # traverse backwards from the weight arg, accounting for any observers + weight_arg_node = node.args[1] + if not isinstance(weight_arg_node, Node): + raise AssertionError(f"Expected Node, got {type(weight_arg_node)}") + weight_node = return_first_non_observer_node(weight_arg_node, gm) + if not isinstance(weight_node, Node): + raise AssertionError(f"Expected Node, got {type(weight_node)}") + if weight_node.op != "get_attr": + raise AssertionError(f"Expected get_attr, got {weight_node.op}") + weight = getattr_from_fqn(gm, weight_node.target) # type: ignore[arg-type] + return weight.detach() + + +def get_qconv_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor: + # qconv state is arg 1 + qconv_state_node = node.args[1] + if not isinstance(qconv_state_node, Node): + raise AssertionError(f"Expected Node, got {type(qconv_state_node)}") + if qconv_state_node.op != "get_attr": + raise AssertionError(f"Expected get_attr, got {qconv_state_node.op}") + qconv_state_obj = getattr_from_fqn(gm, 
qconv_state_node.target) # type: ignore[arg-type] + return qconv_state_obj.weight() + + +def get_linear_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor: + # traverse backwards from the weight arg, accounting for any observers + # supported patterns: + # weight -> obs -> linear + # weight -> to(torch.float16) -> dequantize -> linear + linear_second_arg = node.args[1] + if not isinstance(linear_second_arg, Node): + raise AssertionError(f"Expected Node, got {type(linear_second_arg)}") + + if linear_second_arg.op == "call_module": + # weight -> obs -> linear + weight_arg_node = node.args[1] + if not isinstance(weight_arg_node, Node): + raise AssertionError(f"Expected Node, got {type(weight_arg_node)}") + weight_node = weight_arg_node.args[0] + if not isinstance(weight_node, Node): + raise AssertionError(f"Expected Node, got {type(weight_node)}") + if weight_node.op != "get_attr": + raise AssertionError(f"Expected get_attr, got {weight_node.op}") + weight = getattr_from_fqn(gm, weight_node.target) # type: ignore[arg-type] + return weight.detach() + elif linear_second_arg.op == "call_method": + # weight -> to(torch.float16) -> dequantize -> linear + if linear_second_arg.op != "call_method": + raise AssertionError(f"Expected call_method, got {linear_second_arg.op}") + dequant_node = node.args[1] + if not isinstance(dequant_node, Node): + raise AssertionError(f"Expected Node, got {type(dequant_node)}") + to_fp16_node = dequant_node.args[0] + if not isinstance(to_fp16_node, Node): + raise AssertionError(f"Expected Node, got {type(to_fp16_node)}") + # extract the dtype, so we can cast to it before returning + target_dtype = to_fp16_node.args[1] + weight_node = to_fp16_node.args[0] + if not isinstance(weight_node, Node): + raise AssertionError(f"Expected Node, got {type(weight_node)}") + if weight_node.op != "get_attr": + raise AssertionError(f"Expected get_attr, got {weight_node.op}") + weight = getattr_from_fqn(gm, weight_node.target) # type: ignore[arg-type] + # 
return the weight with fp16 cast + return weight.detach().to(target_dtype) + else: + if linear_second_arg.op != "get_attr": + raise AssertionError(f"Expected get_attr, got {linear_second_arg.op}") + weight = getattr_from_fqn(gm, linear_second_arg.target) # type: ignore[arg-type] + return weight.detach() + + +def get_qlinear_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor: + # packed weight is arg 1 + packed_weight_node = node.args[1] + if not isinstance(packed_weight_node, Node): + raise AssertionError(f"Expected Node, got {type(packed_weight_node)}") + if packed_weight_node.op != "get_attr": + raise AssertionError(f"Expected get_attr, got {packed_weight_node.op}") + packed_weight = getattr_from_fqn(gm, packed_weight_node.target) # type: ignore[arg-type] + # TODO(future PR): why does packed_weight.unpack() not work? + (weight, _bias), _name = packed_weight.__getstate__() + return weight + + +def get_op_to_type_to_weight_extraction_fn() -> dict[str, dict[Callable, Callable]]: + op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]] = { + "call_module": { + # Conv1d + nn.Conv1d: mod_weight_detach, + nni.ConvReLU1d: mod_0_weight_detach, + nnq.Conv1d: mod_weight_bias_0, + nnqat.Conv1d: mod_weight_detach, + nniqat.ConvBn1d: mod_weight_detach, + nniqat.ConvBnReLU1d: mod_weight_detach, + nniqat.ConvReLU1d: mod_weight_detach, + nniq.ConvReLU1d: mod_weight_bias_0, + # Conv2d + nn.Conv2d: mod_weight_detach, + nni.ConvReLU2d: mod_0_weight_detach, + nnq.Conv2d: mod_weight_bias_0, + nnqat.Conv2d: mod_weight_detach, + nniqat.ConvBn2d: mod_weight_detach, + nniqat.ConvBnReLU2d: mod_weight_detach, + nniqat.ConvReLU2d: mod_weight_detach, + nniq.ConvReLU2d: mod_weight_bias_0, + # Conv3d + nn.Conv3d: mod_weight_detach, + nni.ConvReLU3d: mod_0_weight_detach, + nnq.Conv3d: mod_weight_bias_0, + nnqat.Conv3d: mod_weight_detach, + nniqat.ConvBn3d: mod_weight_detach, + nniqat.ConvBnReLU3d: mod_weight_detach, + nniqat.ConvReLU3d: mod_weight_detach, + nniq.ConvReLU3d: 
mod_weight_bias_0, + # Linear + nn.Linear: mod_weight_detach, + nnq.Linear: mod_weight_bias_0, + nni.LinearReLU: mod_0_weight_detach, + nniq.LinearReLU: mod_weight_bias_0, + nnqat.Linear: mod_weight_detach, + nnqd.Linear: mod_weight_bias_0, + nniqat.LinearReLU: mod_weight_detach, + nniqat.LinearBn1d: mod_weight_detach, + nn.modules.linear.NonDynamicallyQuantizableLinear: mod_weight_detach, + # LSTM + nn.LSTM: get_lstm_weight, + nnqd.LSTM: get_qlstm_weight, + }, + "call_function": { + # Conv + F.conv1d: get_conv_fun_weight, + F.conv2d: get_conv_fun_weight, + F.conv3d: get_conv_fun_weight, + toq.conv1d: get_qconv_fun_weight, + toq.conv2d: get_qconv_fun_weight, + toq.conv3d: get_qconv_fun_weight, + toq.conv1d_relu: get_qconv_fun_weight, + toq.conv2d_relu: get_qconv_fun_weight, + toq.conv3d_relu: get_qconv_fun_weight, + # Linear + F.linear: get_linear_fun_weight, + toq.linear: get_qlinear_fun_weight, + toq.linear_relu: get_qlinear_fun_weight, + }, + } + + return op_to_type_to_weight_extraction_fn + + +def extract_weight_from_node( + node: Node, + gm: GraphModule, + op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]] + | None = None, +) -> NSSingleResultType | None: + res_type = NSSingleResultValuesType.WEIGHT.value + + # Not all graphmodules have _node_name_to_scope, so only fill it + # out if it exists. 
+ fqn = None + if hasattr(gm, "_node_name_to_scope"): + fqn = gm._node_name_to_scope[node.name][0] # type: ignore[index] + + if op_to_type_to_weight_extraction_fn is None: + op_to_type_to_weight_extraction_fn = get_op_to_type_to_weight_extraction_fn() + + ref_node_type = get_target_type_str(node, gm) + # for extracting weights, these are always the same + prev_node_type = ref_node_type + + if node.op == "call_function": + function_mapping = op_to_type_to_weight_extraction_fn["call_function"] + for target_fn_type, weight_extraction_fn in function_mapping.items(): + if node.target == target_fn_type: + weight = weight_extraction_fn(node, gm) + return { + "type": res_type, + "values": [weight], + "prev_node_name": node.name, + "prev_node_target_type": prev_node_type, + "ref_node_name": node.name, + "ref_node_target_type": ref_node_type, + "index_within_arg": 0, + "index_of_arg": 0, + "fqn": fqn, + } + + elif node.op == "call_module": + # for call_module, we need to look up the modules to do the type check + if not isinstance(node.target, str): + raise AssertionError(f"Expected str, got {type(node.target)}") + mod = getattr_from_fqn(gm, node.target) + module_mapping = op_to_type_to_weight_extraction_fn["call_module"] + for target_mod_type, weight_extraction_fn in module_mapping.items(): + if type(mod) is target_mod_type: + weight = weight_extraction_fn(mod) + return { + "type": res_type, + "values": [weight], + "prev_node_name": node.name, + "prev_node_target_type": prev_node_type, + "ref_node_name": node.name, + "ref_node_target_type": ref_node_type, + "index_within_arg": 0, + "index_of_arg": 0, + "fqn": fqn, + } + + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52fc301befd34642d51f1c27e07600a1f3ef26ff --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/__init__.py @@ -0,0 +1,23 @@ +# Variables +from ._mappings import ( + get_dynamic_sparse_quantized_mapping, + get_static_sparse_quantized_mapping, +) + +# Scheduler +from .scheduler.base_scheduler import BaseScheduler +from .scheduler.cubic_scheduler import CubicSL +from .scheduler.lambda_scheduler import LambdaSL + +# Sparsifier +from .sparsifier.base_sparsifier import BaseSparsifier +from .sparsifier.nearly_diagonal_sparsifier import NearlyDiagonalSparsifier + +# Parametrizations +from .sparsifier.utils import ( + FakeSparsity, + fqn_to_module, + get_arg_info_from_tensor_fqn, + module_to_fqn, +) +from .sparsifier.weight_norm_sparsifier import WeightNormSparsifier diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f421363b5dfaa2c3b89e4bfee4ac5635f0edbefe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..646a778f3102a5f8aec972bb027ba221e3b189bf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-312.pyc 
differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21080ae3c00beb650088da033db1f22beff9c9ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a628bcfc8a34c8bcca7f336dba82815663bf49d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e80dac2c4f07b80fec598edae866e965ef28d09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_mappings.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_mappings.py new file mode 100644 index 0000000000000000000000000000000000000000..6fc2c4f10aef5585072f36116282a2048965197a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_mappings.py @@ -0,0 +1,23 @@ +# mypy: allow-untyped-defs +__all__ = [ + "get_static_sparse_quantized_mapping", + "get_dynamic_sparse_quantized_mapping", +] + + +def get_static_sparse_quantized_mapping(): + import torch.ao.nn.sparse + + _static_sparse_quantized_mapping = { + torch.nn.Linear: torch.ao.nn.sparse.quantized.Linear, + } + return _static_sparse_quantized_mapping + + +def get_dynamic_sparse_quantized_mapping(): + import torch.ao.nn.sparse + + _dynamic_sparse_quantized_mapping = { + torch.nn.Linear: torch.ao.nn.sparse.quantized.dynamic.Linear, + } + return _dynamic_sparse_quantized_mapping diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07a54dc243ba800ec67e21234d63332ff759aa09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5f3934407f43c0f3ec7f09ee74185e912f525b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5aab3d7fb0d011c21c1e8ed428e176f008be6f43 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5705d37eaf3391f31c77d9fc48d365cf91eac65f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63ba0376ccd1419ea024c840e86a863b85bdc8ab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ba65948903f8312e624210e7324512a310fa82ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3659b40d55baa55bd7773ef4abb61bcc4dc64763 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4383c7f0a73b5ea7ce5605a5f49165367e727255 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd37ddb155117d3a4bfeabe2c0cf4d24eacf36bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2efc24081b0c13d94b7ab256f635eafce8614543 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__init__.py @@ -0,0 +1,247 @@ +# mypy: allow-untyped-defs + +import sys +from collections.abc import Callable +from typing import Optional, Union + +import torch +from torch import Tensor + +from .fake_quantize import * # noqa: F403 +from .fuse_modules import fuse_modules, fuse_modules_qat # noqa: F403 +from .fuser_method_mappings import * # noqa: F403 +from .observer import * # noqa: F403 +from .pt2e._numeric_debugger import ( # noqa: F401 + compare_results, + CUSTOM_KEY, + extract_results_from_loggers, + generate_numeric_debug_handle, + NUMERIC_DEBUG_HANDLE_KEY, + prepare_for_propagation_comparison, +) +from .pt2e.export_utils import ( + _allow_exported_model_train_eval as allow_exported_model_train_eval, + _move_exported_model_to_eval as move_exported_model_to_eval, + _move_exported_model_to_train as move_exported_model_to_train, +) + +# pyrefly: ignore [deprecated] +from .qconfig import * # noqa: F403 +from .qconfig_mapping import * # noqa: F403 +from .quant_type import * # noqa: F403 +from .quantization_mappings import * # noqa: F403 # type: ignore[no-redef] +from .quantize import * # noqa: F403 +from .quantize_jit import * # noqa: F403 +from .stubs import * # noqa: F403 + + +# ensure __module__ is set correctly for public APIs +if sys.version_info < (3, 12): + ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase] + ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" +else: + from typing import TypeAliasType + + ObserverOrFakeQuantize = TypeAliasType( + "ObserverOrFakeQuantize", ObserverBase | FakeQuantizeBase + ) + +for _f in [ + compare_results, + 
extract_results_from_loggers, + generate_numeric_debug_handle, + prepare_for_propagation_comparison, +]: + _f.__module__ = "torch.ao.quantization" + +__all__ = [ + "DeQuantStub", + "FakeQuantize", + "FakeQuantizeBase", + "FixedQParamsFakeQuantize", + "FixedQParamsObserver", + "FusedMovingAvgObsFakeQuantize", + "HistogramObserver", + "MatchAllNode", + "MinMaxObserver", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "NoopObserver", + "ObserverBase", + "ObserverOrFakeQuantize", + "Pattern", + "PerChannelMinMaxObserver", + "PlaceholderObserver", + "QConfig", + "QConfigAny", + "QConfigDynamic", + "QConfigMapping", + "QuantStub", + "QuantType", + "QuantWrapper", + "RecordingObserver", + "ReuseInputObserver", + "UniformQuantizationObserverBase", + "add_quant_dequant", + "convert", + "convert_dynamic_jit", + "convert_jit", + "default_affine_fixed_qparams_fake_quant", + "default_affine_fixed_qparams_observer", + "default_debug_observer", + "default_dynamic_fake_quant", + "default_dynamic_quant_observer", + "default_embedding_fake_quant", + "default_embedding_fake_quant_4bit", + "default_eval_fn", + "default_fake_quant", + "default_fixed_qparams_range_0to1_fake_quant", + "default_fixed_qparams_range_0to1_observer", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_fixed_qparams_range_neg1to1_observer", + "default_float_qparams_observer", + "default_float_qparams_observer_4bit", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + "default_histogram_observer", + "default_observer", + "default_per_channel_weight_fake_quant", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_reuse_input_observer", + "default_symmetric_fixed_qparams_fake_quant", + "default_symmetric_fixed_qparams_observer", + "default_weight_fake_quant", + "default_weight_observer", + "disable_fake_quant", + "disable_observer", + 
"enable_fake_quant", + "enable_observer", + "fuse_conv_bn", + "fuse_conv_bn_jit", + "fuse_conv_bn_relu", + "fuse_convtranspose_bn", + "fuse_linear_bn", + "fuse_modules", + "fuse_modules_qat", + "fused_per_channel_wt_fake_quant_range_neg_127_to_127", + "fused_wt_fake_quant_range_neg_127_to_127", + "get_combined_dict", + "get_default_compare_output_module_list", + "get_default_custom_config_dict", + "get_default_dynamic_quant_module_mappings", + "get_default_dynamic_sparse_quant_module_mappings", + "get_default_float_to_quantized_operator_mappings", + "get_default_qat_module_mappings", + "get_default_qat_qconfig", + "get_default_qat_qconfig_dict", + "get_default_qat_qconfig_mapping", + "get_default_qconfig", + "get_default_qconfig_dict", + "get_default_qconfig_mapping", + "get_default_qconfig_propagation_list", + "get_default_static_quant_module_mappings", + "get_default_static_quant_reference_module_mappings", + "get_default_static_sparse_quant_module_mappings", + "get_dynamic_quant_module_class", + "get_embedding_qat_module_mappings", + "get_embedding_static_quant_module_mappings", + "get_fuser_method", + "get_fuser_method_new", + "get_observer_state_dict", + "get_quantized_operator", + "get_static_quant_module_class", + "load_observer_state_dict", + "move_exported_model_to_eval", + "move_exported_model_to_train", + "allow_exported_model_train_eval", + "no_observer_set", + "per_channel_weight_observer_range_neg_127_to_127", + "prepare", + "prepare_dynamic_jit", + "prepare_jit", + "prepare_qat", + "propagate_qconfig_", + "qconfig_equals", + "quantize", + "quantize_dynamic", + "quantize_dynamic_jit", + "quantize_jit", + "quantize_qat", + "script_qconfig", + "script_qconfig_dict", + "swap_module", + "weight_observer_range_neg_127_to_127", + "generate_numeric_debug_handle", + "CUSTOM_KEY", + "NUMERIC_DEBUG_HANDLE_KEY", + "prepare_for_propagation_comparison", + "extract_results_from_loggers", + "compare_results", + # from torchao, should be merged with torchao + # in 
the future + "AffineQuantizedObserverBase", + "Granularity", + "MappingType", + "PerAxis", + "PerBlock", + "PerGroup", + "PerRow", + "PerTensor", + "PerToken", + "TorchAODType", + "ZeroPointDomain", + "get_block_size", +] + + +def default_eval_fn(model, calib_data): + r"""Define the default evaluation function. + + Default evaluation function takes a torch.utils.data.Dataset or a list of + input Tensors and run the model on the dataset + """ + for data, _target in calib_data: + model(data) + + +class _DerivedObserverOrFakeQuantize(ObserverBase): + r"""This observer is used to describe an observer whose quantization parameters + are derived from other observers + """ + + def __init__( + self, + dtype: torch.dtype, + obs_or_fqs: list[ObserverOrFakeQuantize], + derive_qparams_fn: Callable[ + [list[ObserverOrFakeQuantize]], tuple[Tensor, Tensor] + ], + quant_min: int | None = None, + quant_max: int | None = None, + qscheme: torch.qscheme | None = None, + ch_axis: int | None = None, + ): + super().__init__(dtype) + self.obs_or_fqs = obs_or_fqs + self.derive_qparams_fn = derive_qparams_fn + self.quant_min = quant_min + self.quant_max = quant_max + self.qscheme = qscheme + self.ch_axis = ch_axis + + from .utils import is_per_channel + + if is_per_channel(self.qscheme): + if self.ch_axis is None: + raise AssertionError( + "Must provide a valid ch_axis if qscheme is per channel" + ) + + def forward(self, x: Tensor) -> Tensor: + return x + + def calculate_qparams(self): # type:ignore[override] + return self.derive_qparams_fn(self.obs_or_fqs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90d024f87660250d10128e8042312fe55d37560a Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50c4b8feb6400d8a6b68f0ff80c7fafef3b556a4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/observer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/observer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f29e5a6cbe8a8adeeeb299289305d20778040733 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/observer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e47f8810eaebcec5d6b00fe50f30bef2621abcb0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-312.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..71c09c388aa7bbf7059b8b5f343f58b54811303c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3e89847b98e212ce24f1d86e400e54ca28f4ee2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0025f83a91e50570d030b1de3decd357d1ed9990 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..081e0f9e3569670ae81232a6ce487149d77731f7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_correct_bias.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_correct_bias.py new file mode 100644 index 0000000000000000000000000000000000000000..4309e4530cb72bd6620a69527cbe87e2a533c323 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_correct_bias.py @@ -0,0 +1,156 @@ +# mypy: allow-untyped-defs +import torch +import torch.ao.nn.quantized as nnq +import torch.ao.ns._numeric_suite as ns +import torch.ao.quantization +import torch.nn as nn + + +__all__ = [ + "get_module", + "parent_child_names", + "get_param", + "MeanShadowLogger", + "bias_correction", +] + +_supported_modules = {nn.Linear, nn.Conv2d} +_supported_modules_quantized = {nnq.Linear, nnq.Conv2d} + + +def get_module(model, name): + """Given name of submodule, this function grabs the submodule from given model.""" + return dict(model.named_modules())[name] + + +def parent_child_names(name): + """Split full name of submodule into parent submodule's full name and submodule's name.""" + split_name = name.rsplit(".", 1) + if len(split_name) == 1: + return "", split_name[0] + else: + return split_name[0], split_name[1] + + +def get_param(module, attr): + """Get the parameter given a module and attribute. + + Sometimes the weights/bias attribute gives you the raw tensor, but sometimes + gives a function that will give you the raw tensor, this function takes care of that logic + """ + param = getattr(module, attr, None) + if callable(param): + return param() + else: + return param + + +class MeanShadowLogger(ns.Logger): + """Mean Logger for a Shadow module. 
+ + A logger for a Shadow module whose purpose is to record the rolling mean + of the data passed to the floating point and quantized models + """ + + def __init__(self): + """Set up initial values for float and quantized stats, count, float sum, and quant sum.""" + super().__init__() + self.stats["float"] = None + self.stats["quantized"] = None + self.count = 0 + self.float_sum = None + self.quant_sum = None + + def forward(self, x, y): # type: ignore[override] + """Compute the average of quantized and floating-point data from modules. + + The inputs x,y are output data from the quantized and floating-point modules. + x is for the quantized module, y is for the floating point module + """ + if x.is_quantized: + x = x.dequantize() + + self.count += 1 + if self.stats["quantized"] is None: + self.stats["quantized"] = x + self.quant_sum = x + else: + self.quant_sum += x + self.stats["quantized"] = self.quant_sum / self.count + + if self.stats["float"] is None: + self.stats["float"] = y + self.float_sum = y + else: + self.float_sum += y + self.stats["float"] = self.float_sum / self.count + + def clear(self): + self.stats["float"] = None + self.stats["quantized"] = None + self.count = 0 + self.float_sum = None + self.quant_sum = None + + +def bias_correction( + float_model, + quantized_model, + img_data, + target_modules=_supported_modules_quantized, + neval_batches=None, +): + """Perform bias correction on a module. + + Using numeric suite shadow module, the expected output of the floating point and quantized modules + is recorded. 
Using that data the bias of supported modules is shifted to compensate for the drift caused + by quantization + Paper reference: https://arxiv.org/pdf/1906.04721.pdf (Section 4.2) + + Args: + float_model: a trained model that serves as a reference to what bias correction should aim for + quantized_model: quantized form of float_model that bias correction is to applied to + img_data: calibration data to estimate the expected output (used to find quantization error) + target_modules: specifies what submodules in quantized_model need bias correction (can be extended to + unquantized submodules) + neval_batches: a cap to the number of batches you want to be used for estimating the expected output + """ + ns.prepare_model_with_stubs( + float_model, quantized_model, _supported_modules, MeanShadowLogger + ) + + uncorrected_modules = { + name: submodule + for name, submodule in quantized_model.named_modules() + if type(submodule) in target_modules + } + + for uncorrected_module in uncorrected_modules: + quantized_submodule = get_module(quantized_model, uncorrected_module) + bias = get_param(quantized_submodule, "bias") + if bias is not None: + for count, data in enumerate(img_data, start=1): + quantized_model(data[0]) + if count == neval_batches: + break + ob_dict = ns.get_logger_dict(quantized_model) + parent_name, _ = parent_child_names(uncorrected_module) + + float_data = ob_dict[parent_name + ".stats"]["float"] + quant_data = ob_dict[parent_name + ".stats"]["quantized"] + + # math for expected_error + quantization_error = quant_data - float_data + dims = list(range(quantization_error.dim())) + # Note: we don't want to take the mean over the output channel dimension + dims.remove(1) + expected_error = torch.mean(quantization_error, dims) + + updated_bias = bias.data - expected_error + + bias.data = updated_bias + + # Resets the data contained in the loggers + for submodule in quantized_model.modules(): + if isinstance(submodule, MeanShadowLogger): + submodule.clear() 
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_equalize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_equalize.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ff327f285aa4c17f05a9cbf61b7323a0536a12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_equalize.py @@ -0,0 +1,279 @@ +# mypy: allow-untyped-defs +import copy +from itertools import chain +from typing import Any + +import torch + + +__all__ = [ + "set_module_weight", + "set_module_bias", + "has_bias", + "get_module_weight", + "get_module_bias", + "max_over_ndim", + "min_over_ndim", + "channel_range", + "get_name_by_module", + "cross_layer_equalization", + "process_paired_modules_list_to_name", + "expand_groups_in_paired_modules_list", + "equalize", + "converged", +] + +_supported_types = {torch.nn.Conv2d, torch.nn.Linear, torch.nn.Conv1d} +_supported_intrinsic_types = { + torch.ao.nn.intrinsic.ConvReLU2d, + torch.ao.nn.intrinsic.LinearReLU, + torch.ao.nn.intrinsic.ConvReLU1d, +} +_all_supported_types = _supported_types.union(_supported_intrinsic_types) + + +def set_module_weight(module, weight) -> None: + if type(module) in _supported_types: + module.weight = torch.nn.Parameter(weight) + else: + module[0].weight = torch.nn.Parameter(weight) + + +def set_module_bias(module, bias) -> None: + if type(module) in _supported_types: + module.bias = torch.nn.Parameter(bias) + else: + module[0].bias = torch.nn.Parameter(bias) + + +def has_bias(module) -> bool: + if type(module) in _supported_types: + return module.bias is not None + else: + return module[0].bias is not None + + +def get_module_weight(module): + if type(module) in _supported_types: + return module.weight + else: + return module[0].weight + + +def get_module_bias(module): + if type(module) in _supported_types: + return module.bias + else: + return module[0].bias + + +def 
max_over_ndim(input, axis_list, keepdim=False): + """Apply 'torch.max' over the given axes.""" + axis_list.sort(reverse=True) + for axis in axis_list: + input, _ = input.max(axis, keepdim) + return input + + +def min_over_ndim(input, axis_list, keepdim=False): + """Apply 'torch.min' over the given axes.""" + axis_list.sort(reverse=True) + for axis in axis_list: + input, _ = input.min(axis, keepdim) + return input + + +def channel_range(input, axis=0): + """Find the range of weights associated with a specific channel.""" + size_of_tensor_dim = input.ndim + axis_list = list(range(size_of_tensor_dim)) + axis_list.remove(axis) + + mins = min_over_ndim(input, axis_list) + maxs = max_over_ndim(input, axis_list) + + if mins.size(0) != input.size(axis): + raise AssertionError( + "Dimensions of resultant channel range does not match size of requested axis" + ) + return maxs - mins + + +def get_name_by_module(model, module): + """Get the name of a module within a model. + + Args: + model: a model (nn.module) that equalization is to be applied on + module: a module within the model + + Returns: + name: the name of the module within the model + """ + for name, m in model.named_modules(): + if m is module: + return name + raise ValueError("module is not in the model") + + +def cross_layer_equalization(module1, module2, output_axis=0, input_axis=1): + """Scale the range of Tensor1.output to equal Tensor2.input. 
+ + Given two adjacent tensors', the weights are scaled such that + the ranges of the first tensors' output channel are equal to the + ranges of the second tensors' input channel + """ + if ( + type(module1) not in _all_supported_types + or type(module2) not in _all_supported_types + ): + raise ValueError( + "module type not supported:", type(module1), " ", type(module2) + ) + + bias = get_module_bias(module1) if has_bias(module1) else None + + weight1 = get_module_weight(module1) + weight2 = get_module_weight(module2) + + if weight1.size(output_axis) != weight2.size(input_axis): + raise TypeError( + "Number of output channels of first arg do not match \ + number input channels of second arg" + ) + + weight1_range = channel_range(weight1, output_axis) + weight2_range = channel_range(weight2, input_axis) + + # producing scaling factors to applied + weight2_range += 1e-9 + scaling_factors = torch.sqrt(weight1_range / weight2_range) + inverse_scaling_factors = torch.reciprocal(scaling_factors) + + if bias is not None: + bias = bias * inverse_scaling_factors + + # formatting the scaling (1D) tensors to be applied on the given argument tensors + # pads axis to (1D) tensors to then be broadcasted + size1 = [1] * weight1.ndim + size1[output_axis] = weight1.size(output_axis) + size2 = [1] * weight2.ndim + size2[input_axis] = weight2.size(input_axis) + + scaling_factors = torch.reshape(scaling_factors, size2) + inverse_scaling_factors = torch.reshape(inverse_scaling_factors, size1) + + weight1 = weight1 * inverse_scaling_factors + weight2 = weight2 * scaling_factors + + set_module_weight(module1, weight1) + if bias is not None: + set_module_bias(module1, bias) + set_module_weight(module2, weight2) + + +def process_paired_modules_list_to_name(model, paired_modules_list): + """Processes a list of paired modules to a list of names of paired modules.""" + + for group in paired_modules_list: + for i, item in enumerate(group): + if isinstance(item, torch.nn.Module): + group[i] = 
get_name_by_module(model, item) + elif not isinstance(item, str): + raise TypeError("item must be a nn.Module or a string") + return paired_modules_list + + +def expand_groups_in_paired_modules_list(paired_modules_list): + """Expands module pair groups larger than two into groups of two modules.""" + new_list = [] + + for group in paired_modules_list: + if len(group) == 1: + raise ValueError("Group must have at least two modules") + elif len(group) == 2: + new_list.append(group) + elif len(group) > 2: + new_list.extend([group[i], group[i + 1]] for i in range(len(group) - 1)) + + return new_list + + +def equalize(model, paired_modules_list, threshold=1e-4, inplace=True): + """Equalize modules until convergence is achieved. + + Given a list of adjacent modules within a model, equalization will + be applied between each pair, this will repeated until convergence is achieved + + Keeps a copy of the changing modules from the previous iteration, if the copies + are not that different than the current modules (determined by converged_test), + then the modules have converged enough that further equalizing is not necessary + + Reference is section 4.1 of this paper https://arxiv.org/pdf/1906.04721.pdf + + Args: + model: a model (nn.Module) that equalization is to be applied on + paired_modules_list (List(List[nn.module || str])): a list of lists + where each sublist is a pair of two submodules found in the model, + for each pair the two modules have to be adjacent in the model, + with only piece-wise-linear functions like a (P)ReLU or LeakyReLU in between + to get expected results. + The list can contain either modules, or names of modules in the model. + If you pass multiple modules in the same list, they will all be equalized together. 
+ threshold (float): a number used by the converged function to determine what degree + of similarity between models is necessary for them to be called equivalent + inplace (bool): determines if function is inplace or not + """ + + paired_modules_list = process_paired_modules_list_to_name( + model, paired_modules_list + ) + + if not inplace: + model = copy.deepcopy(model) + + paired_modules_list = expand_groups_in_paired_modules_list(paired_modules_list) + + name_to_module: dict[str, torch.nn.Module] = {} + previous_name_to_module: dict[str, Any] = {} + name_set = set(chain.from_iterable(paired_modules_list)) + + for name, module in model.named_modules(): + if name in name_set: + name_to_module[name] = module + previous_name_to_module[name] = None + while not converged(name_to_module, previous_name_to_module, threshold): + for pair in paired_modules_list: + previous_name_to_module[pair[0]] = copy.deepcopy(name_to_module[pair[0]]) + previous_name_to_module[pair[1]] = copy.deepcopy(name_to_module[pair[1]]) + + cross_layer_equalization(name_to_module[pair[0]], name_to_module[pair[1]]) + + return model + + +def converged(curr_modules, prev_modules, threshold=1e-4): + """Test whether modules are converged to a specified threshold. 
+ + Tests for the summed norm of the differences between each set of modules + being less than the given threshold + + Takes two dictionaries mapping names to modules, the set of names for each dictionary + should be the same, looping over the set of names, for each name take the difference + between the associated modules in each dictionary + + """ + if curr_modules.keys() != prev_modules.keys(): + raise ValueError( + "The keys to the given mappings must have the same set of names of modules" + ) + + summed_norms = torch.tensor(0.0) + if None in prev_modules.values(): + return False + for name in curr_modules: + curr_weight = get_module_weight(curr_modules[name]) + prev_weight = get_module_weight(prev_modules[name]) + + difference = curr_weight.sub(prev_weight) + summed_norms += torch.norm(difference) + return bool(summed_norms < threshold) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_learnable_fake_quantize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_learnable_fake_quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..00b824f8d1ecfe2086576eb3a4c16c4321e9e892 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_learnable_fake_quantize.py @@ -0,0 +1,199 @@ +# mypy: allow-untyped-defs + +import torch +from torch.nn.parameter import Parameter + + +__all__: list[str] = [] + + +class _LearnableFakeQuantize(torch.ao.quantization.FakeQuantizeBase): + r"""Generalized extension of the FakeQuantize module in fake_quantize.py. + + This is an extension of the FakeQuantize module in fake_quantize.py, which + supports more generalized lower-bit quantization and supports learning of the scale + and zero point parameters through backpropagation. 
+ + In addition to the attributes in the original FakeQuantize module, the _LearnableFakeQuantize + module also includes the following attributes to support quantization parameter learning. + + * :attr:`channel_len` defines the length of the channel when initializing scale and zero point + for the per channel case. + + * :attr:`use_grad_scaling` defines the flag for whether the gradients for scale and zero point are + normalized by the constant, which is proportional to the square root of the number of + elements in the tensor. The related literature justifying the use of this particular constant + can be found here: https://openreview.net/pdf?id=rkgO66VKDS. + + * :attr:`fake_quant_enabled` defines the flag for enabling fake quantization on the output. + + * :attr:`static_enabled` defines the flag for using observer's static estimation for + scale and zero point. + + * :attr:`learning_enabled` defines the flag for enabling backpropagation for scale and zero point. + """ + + def __init__( + self, + observer, + quant_min=0, + quant_max=255, + scale=1.0, + zero_point=0.0, + channel_len=-1, + use_grad_scaling=False, + **observer_kwargs, + ): + super().__init__() + if quant_min >= quant_max: + raise AssertionError("quant_min must be strictly less than quant_max.") + self.quant_min = quant_min + self.quant_max = quant_max + # also pass quant_min and quant_max to observer + observer_kwargs["quant_min"] = quant_min + observer_kwargs["quant_max"] = quant_max + self.use_grad_scaling = use_grad_scaling + if channel_len == -1: + self.scale = Parameter(torch.tensor([scale])) + self.zero_point = Parameter(torch.tensor([zero_point])) + else: + if not (isinstance(channel_len, int) and channel_len > 0): + raise AssertionError("Channel size must be a positive integer.") + self.scale = Parameter(torch.tensor([scale] * channel_len)) + self.zero_point = Parameter(torch.tensor([zero_point] * channel_len)) + + self.activation_post_process = observer(**observer_kwargs) + if 
torch.iinfo(self.activation_post_process.dtype).min > quant_min: + raise AssertionError("quant_min out of bound") + if quant_max > torch.iinfo(self.activation_post_process.dtype).max: + raise AssertionError("quant_max out of bound") + self.dtype = self.activation_post_process.dtype + self.qscheme = self.activation_post_process.qscheme + self.ch_axis = ( + self.activation_post_process.ch_axis + if hasattr(self.activation_post_process, "ch_axis") + else -1 + ) + self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.uint8)) + self.register_buffer("static_enabled", torch.tensor([1], dtype=torch.uint8)) + self.register_buffer("learning_enabled", torch.tensor([0], dtype=torch.uint8)) + + bitrange = torch.tensor(quant_max - quant_min + 1).double() + self.bitwidth = int(torch.log2(bitrange).item()) + self.register_buffer("eps", torch.tensor([torch.finfo(torch.float32).eps])) + + @torch.jit.export + def enable_param_learning(self): + r"""Enable parameter learning over static observer estimates. + + Enables learning of quantization parameters and + disables static observer estimates. Forward path returns fake quantized X. + """ + self.toggle_qparam_learning(enabled=True).toggle_fake_quant( + enabled=True + ).toggle_observer_update(enabled=False) + return self + + @torch.jit.export + def enable_static_estimate(self): + """Enable static estimates of quantization parameters. + + Enables static observer estimates and disables learning of + quantization parameters. Forward path returns fake quantized X. + """ + self.toggle_qparam_learning(enabled=False).toggle_fake_quant( + enabled=True + ).toggle_observer_update(enabled=True) + + @torch.jit.export + def enable_static_observation(self): + """Enable accumulation of data without updating quantization parameters. + + Enables static observer accumulating data from input but doesn't + update the quantization parameters. Forward path returns the original X. 
+ """ + self.toggle_qparam_learning(enabled=False).toggle_fake_quant( + enabled=False + ).toggle_observer_update(enabled=True) + + @torch.jit.export + def toggle_observer_update(self, enabled=True): + self.static_enabled[0] = int(enabled) # type: ignore[operator] + return self + + @torch.jit.export + def enable_observer(self, enabled=True): + self.toggle_observer_update(enabled) + + @torch.jit.export + def toggle_qparam_learning(self, enabled=True): + self.learning_enabled[0] = int(enabled) # type: ignore[operator] + self.scale.requires_grad = enabled + self.zero_point.requires_grad = enabled + return self + + @torch.jit.export + def toggle_fake_quant(self, enabled=True): + self.fake_quant_enabled[0] = int(enabled) + return self + + @torch.jit.export + def observe_quant_params(self): + print(f"_LearnableFakeQuantize Scale: {self.scale.detach()}") + print(f"_LearnableFakeQuantize Zero Point: {self.zero_point.detach()}") + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + self.scale.data.clamp_(min=self.eps.item()) # type: ignore[operator] + scale = self.scale.detach() + zero_point = ( + self.zero_point.detach() + .round() + .clamp(self.quant_min, self.quant_max) + .long() + ) + return scale, zero_point + + def forward(self, X): + if self.static_enabled[0] == 1: # type: ignore[index] + self.activation_post_process(X.detach()) + _scale, _zero_point = self.activation_post_process.calculate_qparams() + _scale = _scale.to(self.scale.device) + _zero_point = _zero_point.to(self.zero_point.device) + self.scale.data.copy_(_scale) + self.zero_point.data.copy_(_zero_point) + else: + self.scale.data.clamp_(min=self.eps.item()) # type: ignore[operator] + + if self.fake_quant_enabled[0] == 1: + if self.qscheme in ( + torch.per_channel_symmetric, + torch.per_tensor_symmetric, + ): + self.zero_point.data.zero_() + + if self.use_grad_scaling: + grad_factor = 1.0 / (X.numel() * self.quant_max) ** 0.5 + else: + grad_factor = 1.0 + if self.qscheme in 
(torch.per_channel_symmetric, torch.per_channel_affine): + X = torch._fake_quantize_learnable_per_channel_affine( + X, + self.scale, + self.zero_point, + self.ch_axis, + self.quant_min, + self.quant_max, + grad_factor, + ) + else: + X = torch._fake_quantize_learnable_per_tensor_affine( + X, + self.scale, + self.zero_point, + self.quant_min, + self.quant_max, + grad_factor, + ) + + return X diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fake_quantize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fake_quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..c4a380946c8a06dd884680fc52cf1350f49772f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fake_quantize.py @@ -0,0 +1,663 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +"""Implements modules used to perform fake quantization.""" + +import re +from abc import ABC, abstractmethod +from typing import Any + +import torch +from torch.ao.quantization.observer import ( + _with_args, + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, + FixedQParamsObserver, + HistogramObserver, + MovingAverageMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, +) +from torch.nn import Module + + +__all__ = [ + "FakeQuantizeBase", + "FakeQuantize", + "FixedQParamsFakeQuantize", + "FusedMovingAvgObsFakeQuantize", + "disable_fake_quant", + "disable_observer", + "enable_fake_quant", + "enable_observer", + "default_fake_quant", + "default_weight_fake_quant", + "default_dynamic_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_fixed_qparams_range_0to1_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_per_channel_weight_fake_quant", + "default_embedding_fake_quant", + "default_embedding_fake_quant_4bit", + 
"default_histogram_fake_quant", + "default_fused_act_fake_quant", + "default_fused_wt_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "fused_wt_fake_quant_range_neg_127_to_127", + "fused_per_channel_wt_fake_quant_range_neg_127_to_127", +] + + +def _is_per_channel(qscheme: "torch.qscheme") -> bool: + return qscheme in [ + torch.per_channel_symmetric, + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ] + + +def _is_per_tensor(qscheme: "torch.qscheme") -> bool: + return qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine] + + +def _is_symmetric_quant(qscheme: "torch.qscheme") -> bool: + return qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric] + + +def _is_float_qparams(qscheme: "torch.qscheme") -> bool: + return qscheme == torch.per_channel_affine_float_qparams + + +class FakeQuantizeBase(ABC, Module): + r"""Base fake quantize module. + + Base fake quantize module + Any fake quantize implementation should derive from this class. + + Concrete fake quantize module should follow the same API. In forward, they will update + the statistics of the observed Tensor and fake quantize the input. They should also provide a + `calculate_qparams` function that computes the quantization parameters given + the collected statistics. + + """ + + fake_quant_enabled: torch.Tensor + observer_enabled: torch.Tensor + + def __init__(self) -> None: + """Set fake_quant_enabled and observer_enabled.""" + super().__init__() + # fake_quant_enabled and observer_enabled are buffers to support their + # replication in DDP. Data type is uint8 because NCCL does not support + # bool tensors. 
+ self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.uint8)) + self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.uint8)) + + @abstractmethod + def forward(self, x): + pass + + @abstractmethod + def calculate_qparams(self, **kwargs): + pass + + @torch.jit.export + def enable_fake_quant(self, enabled: bool = True) -> None: + self.fake_quant_enabled[0] = 1 if enabled else 0 + + @torch.jit.export + def disable_fake_quant(self): + self.enable_fake_quant(False) + + @torch.jit.export + def enable_observer(self, enabled: bool = True) -> None: + self.observer_enabled[0] = 1 if enabled else 0 + + @torch.jit.export + def disable_observer(self): + self.enable_observer(False) + + @classmethod + def with_args(cls, **kwargs): + fake_quant_constructor = _with_args(cls, **kwargs) + # need to assign the correct module to fake_quantize + # constructors to satisfy public v private requirements + fake_quant_constructor.__module__ = "torch.ao.quantization.fake_quantize" + return fake_quant_constructor + + +class FakeQuantize(FakeQuantizeBase): + r"""Simulate the quantize and dequantize operations in training time. + + The output of this module is given by:: + + x_out = ( + clamp(round(x / scale + zero_point), quant_min, quant_max) - zero_point + ) * scale + + * :attr:`is_dynamic` indicates whether the fake quantie is a placeholder for dynamic quantization + operators (choose_qparams -> q -> dq) or static quantization operators (q -> dq) + + * :attr:`scale` defines the scale factor used for quantization. + + * :attr:`zero_point` specifies the quantized value to which 0 in floating point maps to + + * :attr:`fake_quant_enabled` controls the application of fake quantization on tensors, note that + statistics can still be updated. 
+ + * :attr:`observer_enabled` controls statistics collection on tensors + + * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization, + allowable values are torch.qint8 and torch.quint8. + + Args: + + observer (module): Module for observing statistics on input tensors and calculating scale + and zero-point. + observer_kwargs (optional): Arguments for the observer module + + Attributes: + activation_post_process (Module): User provided module that collects statistics on the input tensor and + provides a method to calculate scale and zero-point. + + """ + + scale: torch.Tensor + zero_point: torch.Tensor + + def __init__( + self, + observer=MovingAverageMinMaxObserver, + quant_min=None, + quant_max=None, + is_dynamic=False, + **observer_kwargs, + ): + super().__init__() + # Populate quant_min/quant_max to observer_kwargs if valid + if quant_min is not None and quant_max is not None: + if quant_min > quant_max: + raise AssertionError( + "quant_min must be less than or equal to quant_max" + ) + dtype = observer_kwargs.get("dtype", torch.quint8) + if hasattr(observer, "p"): + # In case observer is _PartialWrapper, dtype can be stored in + # observer.p.keywords["dtype"] + dtype = getattr(getattr(observer, "p", {}), "keywords", {}).get( + "dtype", dtype + ) + # pyrefly: ignore [bad-argument-type] + if torch.iinfo(dtype).min > quant_min: + raise AssertionError("quant_min out of bound") + # pyrefly: ignore [bad-argument-type] + if quant_max > torch.iinfo(dtype).max: + raise AssertionError("quant_max out of bound") + observer_kwargs.update({"quant_min": quant_min, "quant_max": quant_max}) + observer_kwargs["is_dynamic"] = is_dynamic + self.activation_post_process = observer(**observer_kwargs) + # TODO: keeping self.quant_min/max for BC; remove after a couple releases + # Users should use self.activation_post_process.quant_min + self.quant_min = self.activation_post_process.quant_min + self.quant_max = self.activation_post_process.quant_max 
+ self.is_dynamic = self.activation_post_process.is_dynamic + if _is_float_qparams(self.activation_post_process.qscheme): + zero_point_dtype = torch.float + else: + zero_point_dtype = torch.int + self.register_buffer("scale", torch.tensor([1.0], dtype=torch.float)) + self.register_buffer("zero_point", torch.tensor([0], dtype=zero_point_dtype)) + self.dtype = self.activation_post_process.dtype + self.qscheme = self.activation_post_process.qscheme + self.ch_axis = ( + self.activation_post_process.ch_axis + if hasattr(self.activation_post_process, "ch_axis") + else -1 + ) + if not (_is_per_channel(self.qscheme) or _is_per_tensor(self.qscheme)): + raise AssertionError( + "Only per channel and per tensor quantization are supported in fake quantize" + + " got qscheme: " + + str(self.qscheme) + ) + self.is_per_channel = _is_per_channel(self.qscheme) + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + return self.activation_post_process.calculate_qparams() + + def forward(self, X): + if self.observer_enabled[0] == 1: + self.activation_post_process(X.detach()) + _scale, _zero_point = self.calculate_qparams() + _scale, _zero_point = ( + _scale.to(self.scale.device), + _zero_point.to(self.zero_point.device), + ) + if self.scale.shape != _scale.shape: + self.scale.resize_(_scale.shape) + self.zero_point.resize_(_zero_point.shape) + self.scale.copy_(_scale) + self.zero_point.copy_(_zero_point) + + if self.fake_quant_enabled[0] == 1: + if self.is_per_channel: + X = torch.fake_quantize_per_channel_affine( + X, + self.scale, + self.zero_point, + self.ch_axis, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, + ) + else: + X = torch.fake_quantize_per_tensor_affine( + X, + self.scale, + self.zero_point, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, + ) + return X + + @torch.jit.export + def extra_repr(self): + return ( + f"fake_quant_enabled={self.fake_quant_enabled}, 
observer_enabled={self.observer_enabled}, " + f"quant_min={self.activation_post_process.quant_min}, quant_max={self.activation_post_process.quant_max}, " + f"dtype={self.dtype}, qscheme={self.qscheme}, ch_axis={self.ch_axis}, " + f"scale={self.scale}, zero_point={self.zero_point}" + ) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + # We cannot currently register scalar values as buffers, so need to manually + # specify serialization here. + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "scale"] = self.scale + destination[prefix + "zero_point"] = self.zero_point + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + # Removing this function throws an error that the size of the loaded tensor does not match the original size + # i.e., These buffers start out with numel 0 and become numel 1 once they have their first forward pass. + local_state = ["scale", "zero_point"] + for name in local_state: + key = prefix + name + if key in state_dict: + val = state_dict[key] + # Custom handling to allow loading scale and zero_point + # of size N into uninitialized buffers of size 0. The + # buffers are resized here, and the values are copied in + # the default state_dict loading code of the parent. 
+ if name == "scale": + self.scale.resize_(val.shape) + else: + if name != "zero_point": + raise AssertionError( + "Expected 'zero_point' but got different state key" + ) + self.zero_point.resize_(val.shape) + # For torchscript module we need to update the attributes here since we do not + # call the `_load_from_state_dict` function defined module.py + if torch.jit.is_scripting(): + if name == "scale": + self.scale.copy_(val) + else: + if name != "zero_point": + raise AssertionError( + "Expected 'zero_point' but got different state key" + ) + self.zero_point.copy_(val) + elif strict: + missing_keys.append(key) + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + +class FixedQParamsFakeQuantize(FakeQuantize): + """Simulate quantize and dequantize in training time. + + Simulate quantize and dequantize with fixed quantization + parameters in training time. Only per tensor quantization + is supported. + """ + + # TODO: rename observer to observer_ctr + def __init__(self, observer): + super().__init__(observer=observer) + if type(self.activation_post_process) is not FixedQParamsObserver: + raise AssertionError( + f"{self.__class__.__name__}'s observer must be a {FixedQParamsObserver.__name__}" + ) + self._observer_ctr = observer + self.scale = self.activation_post_process.scale + self.zero_point = self.activation_post_process.zero_point + if not _is_per_tensor(self.qscheme): + raise AssertionError( + "Only per tensor quantization is supported" + + " FixedQParamsFakeQuantize module, got qscheme:" + + str(self.qscheme) + ) + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + return self.scale, self.zero_point + + @torch.jit.export + def extra_repr(self): + """Define a string representation of the object's attributes.""" + return ( + f"fake_quant_enabled={self.fake_quant_enabled}, observer_enabled={self.observer_enabled}, " + f"scale={self.scale}, 
zero_point={self.zero_point}, " + f"dtype={self.dtype}, quant_min={self.activation_post_process.quant_min}, " + f"quant_max={self.activation_post_process.quant_max}, qscheme={self.qscheme}" + ) + + +class FusedMovingAvgObsFakeQuantize(FakeQuantize): + r"""Define a fused module to observe the tensor. + + Fused module that is used to observe the input tensor (compute min/max), compute + scale/zero_point and fake_quantize the tensor. + This module uses calculation similar MovingAverageMinMaxObserver for the inputs, + to compute the min/max values in order to compute the scale/zero_point. + The qscheme input in the observer is used to differentiate between symmetric/affine + quantization scheme. + + The output of this module is given by + x_out = (clamp(round(x/scale + zero_point), quant_min, quant_max)-zero_point)*scale + + Similar to :class:`~torch.ao.quantization.FakeQuantize`, and accepts the same attributes as the + base class. + + """ + + def __init__( + self, + observer: Any = MovingAverageMinMaxObserver, + quant_min: int = 0, + quant_max: int = 255, + **observer_kwargs: Any, + ) -> None: + super().__init__(observer, quant_min, quant_max, **observer_kwargs) + if not isinstance( + self.activation_post_process, + (MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver), + ): + raise AssertionError( + "Fused observer+fake_quant module only works with MovingAverageMinMaxObserver" + ) + self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.long)) + self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.long)) + self.is_symmetric_quant = _is_symmetric_quant( + self.activation_post_process.qscheme + ) + + @torch.jit.export + def calculate_qparams(self) -> tuple[torch.Tensor, torch.Tensor]: # type: ignore[override] + return self.activation_post_process.calculate_qparams() + + @torch.jit.export + def extra_repr(self) -> str: + return ( + f"fake_quant_enabled={self.fake_quant_enabled}, 
observer_enabled={self.observer_enabled}, " + f"scale={self.scale}, zero_point={self.zero_point}, dtype={self.dtype}, " + f"quant_min={self.activation_post_process.quant_min}, quant_max={self.activation_post_process.quant_max}, " + f"qscheme={self.qscheme}, reduce_range={self.activation_post_process.reduce_range}" + ) + + def forward(self, X: torch.Tensor) -> torch.Tensor: + return torch.fused_moving_avg_obs_fake_quant( + X, + self.observer_enabled, + self.fake_quant_enabled, + self.activation_post_process.min_val, + self.activation_post_process.max_val, + self.scale, + self.zero_point, + self.activation_post_process.averaging_constant, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, + self.ch_axis, + self.is_per_channel, + self.is_symmetric_quant, + ) + + +default_fake_quant = FakeQuantize.with_args( + observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255, + dtype=torch.quint8, + qscheme=torch.per_tensor_affine, + reduce_range=True, +) +""" +Default fake_quant for activations. +""" + +default_weight_fake_quant = FakeQuantize.with_args( + observer=MovingAverageMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, + reduce_range=False, +) +""" +Default fake_quant for weights. +Observer is memoryless since averaging_constant is 1. +""" + +default_dynamic_fake_quant = FakeQuantize.with_args( + observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255, + is_dynamic=True, + dtype=torch.quint8, + averaging_constant=1, +) +""" +Default dynamic fake_quant for activations. 
+""" + +default_fixed_qparams_range_neg1to1_fake_quant = FixedQParamsFakeQuantize.with_args( + observer=default_fixed_qparams_range_neg1to1_observer +) +default_fixed_qparams_range_0to1_fake_quant = FixedQParamsFakeQuantize.with_args( + observer=default_fixed_qparams_range_0to1_observer +) +# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases +default_symmetric_fixed_qparams_fake_quant = ( + default_fixed_qparams_range_neg1to1_fake_quant +) +default_affine_fixed_qparams_fake_quant = default_fixed_qparams_range_0to1_fake_quant + +default_per_channel_weight_fake_quant = FakeQuantize.with_args( + observer=MovingAveragePerChannelMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_channel_symmetric, + reduce_range=False, + ch_axis=0, +) +""" +Default fake_quant for per-channel weights. +Observer is memoryless since averaging_constant is 1. +""" +default_embedding_fake_quant = FakeQuantize.with_args( + observer=MovingAveragePerChannelMinMaxObserver, + qscheme=torch.per_channel_affine_float_qparams, + dtype=torch.quint8, + quant_min=0, + quant_max=255, + ch_axis=0, + averaging_constant=1, +) +""" +Default fake_quant for embeddings. +Observer is memoryless since averaging_constant is 1. +""" + +default_embedding_fake_quant_4bit = FakeQuantize.with_args( + observer=MovingAveragePerChannelMinMaxObserver, + qscheme=torch.per_channel_affine_float_qparams, + ch_axis=0, + dtype=torch.quint4x2, + averaging_constant=1, +) + +default_histogram_fake_quant = FakeQuantize.with_args( + observer=HistogramObserver, + quant_min=0, + quant_max=255, + dtype=torch.quint8, + qscheme=torch.per_tensor_affine, + reduce_range=True, +) +""" +Fake_quant for activations using a histogram.. 
+""" + + +default_fused_act_fake_quant = FusedMovingAvgObsFakeQuantize.with_args( + observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255, + dtype=torch.quint8, +) + +""" +Fused version of `default_fake_quant`, with improved performance. +""" + + +default_fused_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args( + observer=MovingAverageMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, +) +""" +Fused version of `default_weight_fake_quant`, with improved performance. +""" + +default_fused_per_channel_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args( + observer=MovingAveragePerChannelMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_channel_symmetric, +) +""" +Fused version of `default_per_channel_weight_fake_quant`, with improved performance. +""" + +fused_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args( + observer=MovingAverageMinMaxObserver, + quant_min=-127, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, + eps=2**-12, +) +""" +Fused version of `default_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128. +""" + +fused_per_channel_wt_fake_quant_range_neg_127_to_127 = ( + FusedMovingAvgObsFakeQuantize.with_args( + observer=MovingAveragePerChannelMinMaxObserver, + quant_min=-127, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_channel_symmetric, + eps=2**-12, + ) +) + +""" +Fused version of `default_per_channel_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128. 
+""" + + +def _is_fake_quant_script_module(mod): + """Return true if given mod is an instance of FakeQuantize script module.""" + if isinstance(mod, torch.jit.RecursiveScriptModule): + # qualified name looks like '__torch__.torch.ao.quantization.fake_quantize.___torch_mangle_2.FakeQuantize' + suffix = mod._c.qualified_name.split(".", 1)[1] + name = re.sub(r"\.___torch_mangle_\d+", "", suffix) + return ( + name == "torch.ao.quantization.fake_quantize.FakeQuantize" + or name + == "torch.ao.quantization.fake_quantize.FusedMovingAvgObsFakeQuantize" + ) + return False + + +def disable_fake_quant(mod): + """Disable fake quantization for the module. + + Disable fake quantization for this module, if applicable. Example usage:: + + # model is any PyTorch model + model.apply(torch.ao.quantization.disable_fake_quant) + + """ + if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod): + mod.disable_fake_quant() + + +def enable_fake_quant(mod): + """Enable fake quantization for the module. + + Enable fake quantization for this module, if applicable. Example usage:: + + # model is any PyTorch model + model.apply(torch.ao.quantization.enable_fake_quant) + + """ + if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod): + mod.enable_fake_quant() + + +def disable_observer(mod): + """Disable observation for this module. + + Disable observation for this module, if applicable. Example usage:: + + # model is any PyTorch model + model.apply(torch.ao.quantization.disable_observer) + + """ + if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod): + mod.disable_observer() + + +def enable_observer(mod): + """Enable observation for this module. + + Enable observation for this module, if applicable. 
Example usage:: + + # model is any PyTorch model + model.apply(torch.ao.quantization.enable_observer) + + """ + if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod): + mod.enable_observer() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuse_modules.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuse_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..4f664c699144917d3314eee7bdf5dd92f9697108 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuse_modules.py @@ -0,0 +1,215 @@ +# mypy: allow-untyped-defs +import copy + +import torch.nn as nn + +# for backward compatibility +from torch.ao.quantization.fuser_method_mappings import ( # noqa: F401 # noqa: F401 + fuse_conv_bn, + fuse_conv_bn_relu, + get_fuser_method, +) +from torch.nn.utils.parametrize import type_before_parametrizations + + +__all__ = [ + "fuse_known_modules", + "fuse_modules", + "fuse_modules_qat", +] + + +# Generalization of getattr +def _get_module(model, submodule_key): + tokens = submodule_key.split(".") + cur_mod = model + for s in tokens: + cur_mod = getattr(cur_mod, s) + return cur_mod + + +# Generalization of setattr +def _set_module(model, submodule_key, module): + tokens = submodule_key.split(".") + sub_tokens = tokens[:-1] + cur_mod = model + for s in sub_tokens: + cur_mod = getattr(cur_mod, s) + + setattr(cur_mod, tokens[-1], module) + + +def fuse_known_modules(mod_list, is_qat, additional_fuser_method_mapping=None): + r"""Return a list of known fuse modules. + + Returns a list of modules that fuses the operations specified + in the input module list. + + Fuses only the following sequence of modules: + conv, bn + conv, bn, relu + conv, relu + linear, bn + linear, relu + For these sequences, the first element in the output module list performs + the fused operation. 
The rest of the elements are set to nn.Identity() + """ + types = tuple(type_before_parametrizations(m) for m in mod_list) + fuser_method = get_fuser_method(types, additional_fuser_method_mapping) + if fuser_method is None: + raise NotImplementedError(f"Cannot fuse modules: {types}") + new_mod: list[nn.Module | None] = [None] * len(mod_list) + fused = fuser_method(is_qat, *mod_list) + # NOTE: forward hooks not processed in the two following for loops will be lost after the fusion + # Move pre forward hooks of the base module to resulting fused module + for pre_hook_fn in mod_list[0]._forward_pre_hooks.values(): + fused.register_forward_pre_hook(pre_hook_fn) + mod_list[0]._forward_pre_hooks.clear() + # Move post forward hooks of the last module to resulting fused module + for hook_fn in mod_list[-1]._forward_hooks.values(): + fused.register_forward_hook(hook_fn) + mod_list[-1]._forward_hooks.clear() + new_mod[0] = fused + + for i in range(1, len(mod_list)): + identity = nn.Identity() + identity.training = mod_list[0].training + new_mod[i] = identity + + return new_mod + + +def _fuse_modules_helper( + model, + modules_to_fuse, + is_qat, + fuser_func=fuse_known_modules, + fuse_custom_config_dict=None, +): + if fuse_custom_config_dict is None: + fuse_custom_config_dict = {} + additional_fuser_method_mapping = fuse_custom_config_dict.get( + "additional_fuser_method_mapping", {} + ) + mod_list = [_get_module(model, item) for item in modules_to_fuse] + + # Fuse list of modules + new_mod_list = fuser_func(mod_list, is_qat, additional_fuser_method_mapping) + + # Replace original module list with fused module list + for i, item in enumerate(modules_to_fuse): + _set_module(model, item, new_mod_list[i]) + + +def _fuse_modules( + model, + modules_to_fuse, + is_qat, + inplace=False, + fuser_func=fuse_known_modules, + fuse_custom_config_dict=None, +): + if not inplace: + model = copy.deepcopy(model) + + if all(isinstance(module_element, str) for module_element in 
modules_to_fuse): + # Handle case of modules_to_fuse being a list + _fuse_modules_helper( + model, modules_to_fuse, is_qat, fuser_func, fuse_custom_config_dict + ) + else: + # Handle case of modules_to_fuse being a list of lists + for module_list in modules_to_fuse: + _fuse_modules_helper( + model, module_list, is_qat, fuser_func, fuse_custom_config_dict + ) + return model + + +def fuse_modules( + model, + modules_to_fuse, + inplace=False, + fuser_func=fuse_known_modules, + fuse_custom_config_dict=None, +): + r"""Fuse a list of modules into a single module. + + Fuses only the following sequence of modules: + conv, bn + conv, bn, relu + conv, relu + linear, relu + bn, relu + All other sequences are left unchanged. + For these sequences, replaces the first item in the list + with the fused module, replacing the rest of the modules + with identity. + + Args: + model: Model containing the modules to be fused + modules_to_fuse: list of list of module names to fuse. Can also be a list + of strings if there is only a single list of modules to fuse. + inplace: bool specifying if fusion happens in place on the model, by default + a new model is returned + fuser_func: Function that takes in a list of modules and outputs a list of fused modules + of the same length. For example, + fuser_func([convModule, BNModule]) returns the list [ConvBNModule, nn.Identity()] + Defaults to torch.ao.quantization.fuse_known_modules + `fuse_custom_config_dict`: custom configuration for fusion + + .. code-block:: python + + # Example of fuse_custom_config_dict + fuse_custom_config_dict = { + # Additional fuser_method mapping + "additional_fuser_method_mapping": { + (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn + }, + } + + Returns: + model with fused modules. A new copy is created if inplace=True. 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> m = M().eval() + >>> # m is a module containing the sub-modules below + >>> modules_to_fuse = [ ['conv1', 'bn1', 'relu1'], ['submodule.conv', 'submodule.relu']] + >>> fused_m = torch.ao.quantization.fuse_modules(m, modules_to_fuse) + >>> output = fused_m(input) + + >>> m = M().eval() + >>> # Alternately provide a single list of modules to fuse + >>> modules_to_fuse = ['conv1', 'bn1', 'relu1'] + >>> fused_m = torch.ao.quantization.fuse_modules(m, modules_to_fuse) + >>> output = fused_m(input) + + """ + return _fuse_modules( + model, + modules_to_fuse, + is_qat=False, + inplace=inplace, + fuser_func=fuser_func, + fuse_custom_config_dict=fuse_custom_config_dict, + ) + + +def fuse_modules_qat( + model, + modules_to_fuse, + inplace=False, + fuser_func=fuse_known_modules, + fuse_custom_config_dict=None, +): + """QAT version for `fuse_modules`.""" + return _fuse_modules( + model, + modules_to_fuse, + is_qat=True, + inplace=inplace, + fuser_func=fuser_func, + fuse_custom_config_dict=fuse_custom_config_dict, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuser_method_mappings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuser_method_mappings.py new file mode 100644 index 0000000000000000000000000000000000000000..d72a3579438bc3e5e2687982ab4b550c680d2110 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuser_method_mappings.py @@ -0,0 +1,314 @@ +# mypy: allow-untyped-defs +import itertools +from collections.abc import Callable +from typing import Any + +import torch.ao.nn.intrinsic as nni +import torch.nn as nn +from torch.ao.quantization.utils import get_combined_dict, MatchAllNode, Pattern + + +__all__ = [ + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_linear_bn", + "fuse_convtranspose_bn", + "get_fuser_method", + "get_fuser_method_new", +] + + +def fuse_conv_bn(is_qat, conv, bn): + 
r"""Return the fused the conv and bn modules. + Given the conv and bn modules, fuses them and returns the fused module + + Args: + is_qat: a flag for whether we are using quantization aware training fusion + or post training quantization fusion + conv: Module instance of type conv2d/conv3d + bn: Spatial BN instance that needs to be fused with the conv + + Examples:: + + >>> m1 = nn.Conv2d(10, 20, 3) + >>> b1 = nn.BatchNorm2d(20) + >>> # xdoctest: +SKIP + >>> m2 = fuse_conv_bn(m1, b1) + """ + if conv.training != bn.training: + raise AssertionError( + "Conv and BN both must be in the same mode (train or eval)." + ) + + fused_module_class_map = { + nn.Conv1d: nni.ConvBn1d, + nn.Conv2d: nni.ConvBn2d, + nn.Conv3d: nni.ConvBn3d, + } + + if is_qat: + if bn.num_features != conv.out_channels: + raise AssertionError( + "Output channel of Conv2d must match num_features of BatchNorm2d." + ) + if not bn.affine: + raise AssertionError( + "Only support fusing BatchNorm2d with affine set to True" + ) + if not bn.track_running_stats: + raise AssertionError( + "Only support fusing BatchNorm2d with tracking_running_stats set to True" + ) + fused_module_class = fused_module_class_map.get((type(conv)), None) + if fused_module_class is not None: + return fused_module_class(conv, bn) + else: + raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn)}") + else: + return nn.utils.fuse_conv_bn_eval(conv, bn) + + +def fuse_conv_bn_relu(is_qat, conv, bn, relu): + r"""Return the fused conv and bv modules. 
+ + Given the conv and bn modules, fuses them and returns the fused module + + Args: + is_qat: a flag for whether we are using quantization aware training fusion + or post training quantization fusion + conv: Module instance of type conv2d/conv3d + bn: Spatial BN instance that needs to be fused with the conv + + Examples:: + + >>> m1 = nn.Conv2d(10, 20, 3) + >>> b1 = nn.BatchNorm2d(20) + >>> r1 = nn.ReLU(inplace=False) + >>> # xdoctest: +SKIP + >>> m2 = fuse_conv_bn_relu(m1, b1, r1) + """ + if not (conv.training == bn.training == relu.training): + raise AssertionError( + "Conv and BN both must be in the same mode (train or eval)." + ) + fused_module: type[nn.Sequential] | None = None + if is_qat: + map_to_fused_module_train = { + nn.Conv1d: nni.ConvBnReLU1d, + nn.Conv2d: nni.ConvBnReLU2d, + nn.Conv3d: nni.ConvBnReLU3d, + } + if bn.num_features != conv.out_channels: + raise AssertionError( + "Output channel of Conv2d must match num_features of BatchNorm2d" + ) + if not bn.affine: + raise AssertionError( + "Only support fusing BatchNorm2d with affine set to True" + ) + if not bn.track_running_stats: + raise AssertionError( + "Only support fusing BatchNorm2d with tracking_running_stats set to True" + ) + fused_module = map_to_fused_module_train.get(type(conv), None) + if fused_module is not None: + return fused_module(conv, bn, relu) + else: + raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, relu)}") + else: + map_to_fused_module_eval = { + nn.Conv1d: nni.ConvReLU1d, + nn.Conv2d: nni.ConvReLU2d, + nn.Conv3d: nni.ConvReLU3d, + } + fused_module = map_to_fused_module_eval.get(type(conv), None) + if fused_module is not None: + fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn) + return fused_module(fused_conv, relu) + else: + raise NotImplementedError(f"Cannot fuse eval modules: {(conv, bn, relu)}") + + +def fuse_linear_bn(is_qat, linear, bn): + r"""Return the fused linear and bn modules. 
+ Given the linear and bn modules, fuses them and returns the fused module + + Args: + is_qat: a flag for whether we are using quantization aware training fusion + or post training quantization fusion + linear: Module instance of type Linear + bn: BatchNorm1d instance that needs to be fused with the linear layer + + Examples:: + + >>> m1 = nn.Linear(20, 10) + >>> b1 = nn.BatchNorm1d(10) + >>> # xdoctest: +SKIP + >>> m2 = fuse_linear_bn(m1, b1) + """ + if linear.training != bn.training: + raise AssertionError( + "Linear and BN both must be in the same mode (train or eval)." + ) + + if is_qat: + if bn.num_features != linear.out_features: + raise AssertionError( + "Output features of Linear must match num_features of BatchNorm1d" + ) + if not bn.affine: + raise AssertionError( + "Only support fusing BatchNorm1d with affine set to True" + ) + if not bn.track_running_stats: + raise AssertionError( + "Only support fusing BatchNorm1d with tracking_running_stats set to True" + ) + return nni.LinearBn1d(linear, bn) + else: + return nn.utils.fusion.fuse_linear_bn_eval(linear, bn) + + +def fuse_convtranspose_bn(is_qat, convt, bn): + r"""Return the fused ConvTranspose and bn modules. + Given ConvTranspose and bn modules, fuses them and returns the fused module + + Args: + convt: Module instance of type ConvTransposeNd + bn: BatchNormNd instance that needs to be fused with the linear layer. + batch norm N should match the ConvTranspose N + + Examples:: + + >>> m1 = nn.ConvTranspose2d(10, 20, 3) + >>> b1 = nn.BatchNorm2d(20) + >>> # xdoctest: +SKIP + >>> m2 = fuse_convtranspose_bn(m1, b1) + """ + if convt.training != bn.training: + raise AssertionError( + "ConvTranspose and BN both must be in the same mode (train or eval)." + ) + + if is_qat: + raise Exception( # noqa: TRY002 + "Fusing ConvTranspose+BatchNorm not yet supported in QAT." 
+ ) + else: + return nn.utils.fusion.fuse_conv_bn_eval(convt, bn, transpose=True) + + +def _sequential_wrapper2(sequential): + """Return a sequential wrapped that for is_qat and two modules. + Given a sequential class for two modules, return a function that takes + is_qat, and then two modules as argument, that ignores the is_qat flag + and always returns the sequential that combines the two input modules + """ + + def fuser_method(is_qat, m1, m2): + return sequential(m1, m2) + + return fuser_method + + +_DEFAULT_OP_LIST_TO_FUSER_METHOD: dict[tuple, nn.Sequential | Callable] = { + (nn.Conv1d, nn.BatchNorm1d): fuse_conv_bn, + (nn.Conv1d, nn.BatchNorm1d, nn.ReLU): fuse_conv_bn_relu, + (nn.Conv2d, nn.BatchNorm2d): fuse_conv_bn, + (nn.Conv2d, nn.BatchNorm2d, nn.ReLU): fuse_conv_bn_relu, + (nn.Conv3d, nn.BatchNorm3d): fuse_conv_bn, + (nn.Conv3d, nn.BatchNorm3d, nn.ReLU): fuse_conv_bn_relu, + (nn.Conv1d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU1d), + (nn.Conv2d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU2d), + (nn.Conv3d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU3d), + (nn.Linear, nn.BatchNorm1d): fuse_linear_bn, + (nn.Linear, nn.ReLU): _sequential_wrapper2(nni.LinearReLU), + (nn.BatchNorm2d, nn.ReLU): _sequential_wrapper2(nni.BNReLU2d), + (nn.BatchNorm3d, nn.ReLU): _sequential_wrapper2(nni.BNReLU3d), + (nn.ConvTranspose1d, nn.BatchNorm1d): fuse_convtranspose_bn, + (nn.ConvTranspose2d, nn.BatchNorm2d): fuse_convtranspose_bn, + (nn.ConvTranspose3d, nn.BatchNorm3d): fuse_convtranspose_bn, +} + + +def get_fuser_method(op_list, additional_fuser_method_mapping=None): + """Get fuser method for the given list of module types. 
+ + Get fuser method for the given list of module types, + return None if fuser method does not exist + """ + if additional_fuser_method_mapping is None: + additional_fuser_method_mapping = {} + all_mappings = get_combined_dict( + _DEFAULT_OP_LIST_TO_FUSER_METHOD, additional_fuser_method_mapping + ) + fuser_method = all_mappings.get(op_list, None) + if fuser_method is None: + raise AssertionError(f"did not find fuser method for: {op_list} ") + return fuser_method + + +def _reverse2(f): + def reversed(is_qat, x, y): + return f(is_qat, y, x) + + return reversed + + +def _reverse3(f): + def reversed(is_qat, x, w): + y, z = w + return f(is_qat, z, y, x) + + return reversed + + +def _get_valid_patterns(op_pattern): + """Return a list of valid patterns generated from the op_pattern. + + Returns a list of valid patterns generated from the op_pattern, + since MatchAllNode can match all types of nodes, + e.g. pattern (torch.nn.Conv2d, torch.add) should also be able to match keys like + (MatchAllNode, torch.add) and (torch.nn.Conv2d, MatchAllNode) + + Example Input: + (torch.add, (torch.nn.ReLU, torch.nn.Conv2d)) + + Example Output: + [(torch.add, (torch.nn.ReLU, torch.nn.Conv2d)), + (torch.add, (torch.nn.ReLU, MatchAllNode)), + (torch.add, (MatchAllNode, torch.nn.Conv2d)), + (torch.add, (MatchAllNode, MatchAllNode)), + (MatchAllNode, (torch.nn.ReLU, torch.nn.Conv2d)), + (MatchAllNode, (torch.nn.ReLU, MatchAllNode)), + (MatchAllNode, (MatchAllNode, torch.nn.Conv2d)), + (MatchAllNode, (MatchAllNode, MatchAllNode)), + ] + """ + result: list[Any] + if isinstance(op_pattern, (tuple, list)): + sub_combs = [_get_valid_patterns(sub_pattern) for sub_pattern in op_pattern] + result = list(itertools.product(*sub_combs)) + else: + result = [op_pattern, MatchAllNode] + return result + + +def get_fuser_method_new( + op_pattern: Pattern, + fuser_method_mapping: dict[Pattern, nn.Sequential | Callable], +): + """Get fuser method. 
+ + This will be made default after we deprecate the get_fuser_method + Would like to implement this first and have a separate PR for deprecation + """ + op_patterns = _get_valid_patterns(op_pattern) + fuser_method = None + for op_pattern in op_patterns: + fuser_method = fuser_method_mapping.get(op_pattern) + if fuser_method is not None: + break + if fuser_method is None: + raise AssertionError(f"did not find fuser method for: {op_pattern} ") + return fuser_method diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/observer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/observer.py new file mode 100644 index 0000000000000000000000000000000000000000..abb81c2a54d0091e16ff7cbbf6ef6bb2112485de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/observer.py @@ -0,0 +1,2155 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +# temporarily skip RUF for this file for now, we can re-enable +# after move the affine quantization related things to torchao +# noqa: RUF +""" +This module implements observers which are used to collect statistics about +the values observed during calibration (PTQ) or training (QAT). 
+""" + +import operator +import re +import warnings +from abc import ABCMeta, abstractmethod +from collections import OrderedDict +from functools import partial +from typing import Any + +import torch +import torch.nn as nn +from torch.ao.quantization.utils import ( + calculate_qmin_qmax, + check_min_max_valid, + is_per_channel, + is_per_tensor, + validate_qmin_qmax, +) +from torch.fx import Node + + +__all__ = [ + "default_affine_fixed_qparams_observer", + "default_debug_observer", + "default_dynamic_quant_observer", + "default_fixed_qparams_range_0to1_observer", + "default_fixed_qparams_range_neg1to1_observer", + "default_float_qparams_observer", + "default_float_qparams_observer_4bit", + "default_histogram_observer", + "default_observer", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_reuse_input_observer", + "default_symmetric_fixed_qparams_observer", + "default_weight_observer", + "get_observer_state_dict", + "load_observer_state_dict", + "per_channel_weight_observer_range_neg_127_to_127", + "weight_observer_range_neg_127_to_127", + "FixedQParamsObserver", + "HistogramObserver", + "MinMaxObserver", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "NoopObserver", + "ObserverBase", + "PerChannelMinMaxObserver", + "PlaceholderObserver", + "RecordingObserver", + "ReuseInputObserver", + "UniformQuantizationObserverBase", + "AffineQuantizedObserverBase", + "Granularity", + "MappingType", + "PerAxis", + "PerBlock", + "PerGroup", + "PerRow", + "PerTensor", + "PerToken", + "TorchAODType", + "ZeroPointDomain", + "get_block_size", +] + + +class _PartialWrapper: + def __init__(self, p): + self.p = p + self.callable_args = {} + + def __call__(self, *args, **keywords): + # call each arg in callable_args and add them partial, then run with keywords + # skip if arg_name in keywords so its possible to overwrite + for arg_name in self.callable_args: + if arg_name not in keywords: + keywords = {**keywords, 
arg_name: self.callable_args[arg_name]()} + return self.p(*args, **keywords) + + def __repr__(self): + return self.p.__repr__() + self.callable_args.__repr__() + + def with_args(self, **kwargs): + return _with_args(self, **kwargs) + + def with_callable_args(self, **kwargs): + result = _PartialWrapper(p=self.p) + result.callable_args = {**self.callable_args, **kwargs} + return result + + +def _with_args(cls_or_self, **kwargs): + r"""Wrapper that allows creation of class factories. + + This can be useful when there is a need to create classes with the same + constructor arguments, but different instances. Can be used in conjunction with + _callable_args + + Example:: + + >>> # xdoctest: +SKIP("Undefined vars") + >>> Foo.with_args = classmethod(_with_args) + >>> foo_builder = Foo.with_args(a=3, b=4).with_args(answer=42) + >>> foo_instance1 = foo_builder() + >>> foo_instance2 = foo_builder() + >>> id(foo_instance1) == id(foo_instance2) + False + """ + r = _PartialWrapper(partial(cls_or_self, **kwargs)) + return r + + +def _with_callable_args(cls_or_self, **kwargs): + r"""Wrapper that allows creation of class factories args that need to be + called at construction time. + + This can be useful when there is a need to create classes with the same + constructor arguments, but different instances and those arguments should only + be calculated at construction time. 
Can be used in conjunction with _with_args + + Example:: + + >>> # xdoctest: +SKIP("Undefined vars") + >>> Foo.with_callable_args = classmethod(_with_callable_args) + >>> Foo.with_args = classmethod(_with_args) + >>> foo_builder = Foo.with_callable_args(cur_time=get_time_func).with_args(name="dan") + >>> foo_instance1 = foo_builder() + >>> # wait 50 + >>> foo_instance2 = foo_builder() + >>> id(foo_instance1.creation_time) == id(foo_instance2.creation_time) + False + """ + r = _PartialWrapper(partial(cls_or_self)) + return r.with_callable_args(**kwargs) + + +ABC: Any = ABCMeta("ABC", (object,), {}) # compatible with Python 2 *and* 3: + + +class ObserverBase(ABC, nn.Module): + r"""Base observer Module. + Any observer implementation should derive from this class. + + Concrete observers should follow the same API. In forward, they will update + the statistics of the observed Tensor. And they should provide a + `calculate_qparams` function that computes the quantization parameters given + the collected statistics. + + Args: + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec. + is_dynamic: indicator for whether the observer is a placeholder for dynamic quantization + or static quantization + """ + + def __init__(self, dtype, is_dynamic: bool = False): + super().__init__() + self.dtype = dtype + self.is_dynamic = is_dynamic + + @abstractmethod + def forward(self, x): + pass + + @abstractmethod + def calculate_qparams(self, **kwargs): + pass + + with_args = classmethod(_with_args) + with_callable_args = classmethod(_with_callable_args) + + +class UniformQuantizationObserverBase(ObserverBase): + r"""Common base for all observers using uniform quantization to calculate + scale and zero_point. + + Args: + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec. + qscheme: Quantization scheme to be used. + reduce_range: Reduces the range of the quantized data type by 1 bit. 
+ This is sometimes required to avoid instruction overflow. + quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. + quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. + + .. warning:: + + :attr:`dtype` can only take ``torch.qint8`` or ``torch.quint8``. + or `torch.int8` or `torch.uint8` + + .. warning:: + + :attr:`qscheme` can only take one of the following options: + + - ``torch.per_tensor_affine`` + - ``torch.per_tensor_symmetric`` + - ``torch.per_channel_affine`` + - ``torch.per_channel_symmetric`` + """ + + # Note: the version is shared by all observer types + # + # Version 1/None + # self + # + # Version 2 (base class only, does not include child class buffers) + # self + # |--- eps : Tensor + # + # Version 3 + # for HistogramObserver only, changed the shape of uninitialized + # min_val and max_val buffers from torch.Size([0]) to torch.Size([]) + # for PerChannelObservers, changed the name of the buffers from min_vals + # to min_val and from max_vals to max_val. + _version = 3 + + eps: torch.Tensor + + def __init__( + self, + dtype=torch.quint8, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, + is_dynamic=False, + **kwargs, + ) -> None: + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + super().__init__(dtype=dtype, is_dynamic=is_dynamic, **kwargs) + self.qscheme = qscheme + if reduce_range: + warnings.warn( + "Please use quant_min and quant_max to specify the range for observers. 
\ + reduce_range will be deprecated in a future release of PyTorch.", + stacklevel=2, + ) + self.reduce_range = reduce_range + self.register_buffer("eps", torch.tensor([eps], **factory_kwargs)) + if self.qscheme not in ( + torch.per_tensor_affine, + torch.per_tensor_symmetric, + torch.per_channel_affine, + torch.per_channel_symmetric, + torch.per_channel_affine_float_qparams, + ): + raise AssertionError( + "Default Observer only works for per_tensor_affine, per_tensor_symmetric, " + "per_channel_affine, per_channel_symmetric and per_channel_float_qparams quantization scheme" + ) + + _ALLOWED_DTYPES = ( + torch.qint8, + torch.quint8, + torch.quint4x2, + torch.qint32, + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.float8_e5m2, + torch.float8_e4m3fn, + torch.uint16, + ) + + if self.dtype not in _ALLOWED_DTYPES: + raise AssertionError( + f"Default Observer only works for {_ALLOWED_DTYPES} data type" + ) + self.has_customized_qrange = (quant_min is not None) and (quant_max is not None) + if self.has_customized_qrange: + # pyrefly: ignore [bad-argument-type] + validate_qmin_qmax(quant_min, quant_max) + self.quant_min, self.quant_max = calculate_qmin_qmax( + # pyrefly: ignore [bad-argument-type] + quant_min, + # pyrefly: ignore [bad-argument-type] + quant_max, + self.has_customized_qrange, + self.dtype, + self.reduce_range, + ) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + + if version is None or version == 1: + # eps was moved to a buffer in version 2 + eps = torch.tensor([torch.finfo(torch.float32).eps]) + state_dict[prefix + "eps"] = eps + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + @torch.jit.export + def _validate_qmin_qmax(self, quant_min: int, quant_max: int) -> None: + r"""Validates that the 
user-specified quantization range is properly initialized + and within the given bound supported by the observer dtype. + + To accommodate lower-bit quantization with respect to the existing torch.qint8 and + torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing + in a tuple of initial qmin and qmax values. One use case is these customized qmin and qmax + values are used to calculate static estimates of the scale and zero point for aggressive lower-bit + fake quantization. These estimates are compared against parameters learned through backpropagation. + The related literatures for scale and zero point via backpropagation are as follows: + + Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS + Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf + """ + # The variable names are prefixed with "initial" because their values (qmin and qmax) might be adjusted + # based on whether quantization range is reduced and the datatype (signed/unsigned) used by the observer. + if not quant_min <= 0 <= quant_max: + raise AssertionError("Used-specified quantization range must include 0.") + if quant_min >= quant_max: + raise AssertionError( + "qmin must be strictly less than qmax for user-specified quantization range." + ) + + @torch.jit.export + def _calculate_qparams( + self, min_val: torch.Tensor, max_val: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + r"""Calculates the quantization parameters, given min and max + value tensors. Works for both per tensor and per channel cases + + Args: + min_val: Minimum values per channel + max_val: Maximum values per channel + + Returns: + scales: Scales tensor of shape (#channels,) + zero_points: Zero points tensor of shape (#channels,) + """ + # Functionally equivalent to 'determine_qparams' in utils.py. Observers must be torchscriptable however and qscheme + # as far as I can tell is not allowed to passed as a parameter in torchscript functions. 
This makes refactoring observer + # to use this utility a massive pain and very gross. For now Im opting just to duplicate as this code + # seems unlikely to change (last update over 1 year ago) and when torchscript is fully deprecated we can refactor. + # TODO(jakeszwe, jerryzh168) + if not check_min_max_valid(min_val, max_val): + return torch.tensor([1.0], device=min_val.device.type), torch.tensor( + [0], device=min_val.device.type + ) + + quant_min, quant_max = self.quant_min, self.quant_max + min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) + max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) + + device = min_val_neg.device + scale = torch.ones(min_val_neg.size(), dtype=torch.float32, device=device) + zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device) + + if ( + self.qscheme == torch.per_tensor_symmetric + or self.qscheme == torch.per_channel_symmetric + ): + max_val_pos = torch.max(-min_val_neg, max_val_pos) + scale = max_val_pos / (float(quant_max - quant_min) / 2) + scale = torch.max(scale, self.eps) + if self.dtype in [torch.quint8, torch.uint8]: + if self.has_customized_qrange: + # When customized quantization range is used, down-rounded midpoint of the range is chosen. 
+ zero_point = zero_point.new_full( + zero_point.size(), (quant_min + quant_max) // 2 + ) + else: + zero_point = zero_point.new_full(zero_point.size(), 128) + elif self.dtype == torch.uint16: + zero_point = zero_point.new_full(zero_point.size(), 2**15) + elif self.qscheme == torch.per_channel_affine_float_qparams: + scale = (max_val - min_val) / float(quant_max - quant_min) + scale = torch.where(scale > self.eps, scale, torch.ones_like(scale)) + # We use the quantize function + # xq = Round(Xf * inv_scale + zero_point), + # setting zero_point to (-1 * min *inv_scale) we get + # Xq = Round((Xf - min) * inv_scale) + zero_point = -1 * min_val / scale + else: + scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min) + scale = torch.max(scale, self.eps) + zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int) + zero_point = torch.clamp(zero_point, quant_min, quant_max) + + # For scalar values, cast them to Tensors of size 1 to keep the shape + # consistent with default values in FakeQuantize. + if len(scale.shape) == 0: + # TODO: switch to scale.item() after adding JIT support + scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device) + if len(zero_point.shape) == 0: + # TODO: switch to zero_point.item() after adding JIT support + zero_point = torch.tensor( + [int(zero_point)], dtype=zero_point.dtype, device=device + ) + if self.qscheme == torch.per_channel_affine_float_qparams: + zero_point = torch.tensor( + [float(zero_point)], dtype=zero_point.dtype, device=device + ) + + return scale, zero_point + + @torch.jit.export + def reset_min_max_vals(self): + raise NotImplementedError("Cannot reset min/max values in the given observer.") + + +# Originally, this class was called `_ObserverBase`. Keeping the old name around +# for backwards compatibility. 
+# TODO(after v1.13): delete this +_ObserverBase = UniformQuantizationObserverBase + + +class MinMaxObserver(UniformQuantizationObserverBase): + r"""Observer module for computing the quantization parameters based on the + running min and max values. + + This observer uses the tensor min/max statistics to compute the quantization + parameters. The module records the running minimum and maximum of incoming + tensors, and uses this statistic to compute the quantization parameters. + + Args: + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec. + qscheme: Quantization scheme to be used + reduce_range: Reduces the range of the quantized data type by 1 bit + quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. + quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. + + Given running min/max as :math:`x_\text{min}` and :math:`x_\text{max}`, + scale :math:`s` and zero point :math:`z` are computed as: + + The running minimum/maximum :math:`x_\text{min/max}` is computed as: + + .. math:: + + \begin{array}{ll} + x_\text{min} &= \begin{cases} + \min(X) & \text{if~}x_\text{min} = \text{None} \\ + \min\left(x_\text{min}, \min(X)\right) & \text{otherwise} + \end{cases}\\ + x_\text{max} &= \begin{cases} + \max(X) & \text{if~}x_\text{max} = \text{None} \\ + \max\left(x_\text{max}, \max(X)\right) & \text{otherwise} + \end{cases}\\ + \end{array} + + where :math:`X` is the observed tensor. + + The scale :math:`s` and zero point :math:`z` are then computed as: + + .. 
math:: + + \begin{aligned} + \text{if Symmetric:}&\\ + &s = 2 \max(|x_\text{min}|, x_\text{max}) / + \left( Q_\text{max} - Q_\text{min} \right) \\ + &z = \begin{cases} + 0 & \text{if dtype is qint8} \\ + 128 & \text{otherwise} + \end{cases}\\ + \text{Otherwise:}&\\ + &s = \left( x_\text{max} - x_\text{min} \right ) / + \left( Q_\text{max} - Q_\text{min} \right ) \\ + &z = Q_\text{min} - \text{round}(x_\text{min} / s) + \end{aligned} + + where :math:`Q_\text{min}` and :math:`Q_\text{max}` are the minimum and + maximum of the quantized data type. + + .. warning:: :attr:`dtype` can only take ``torch.qint8`` or ``torch.quint8``. + + .. note:: If the running minimum equals to the running maximum, the scale + and zero_point are set to 1.0 and 0. + """ + + min_val: torch.Tensor + max_val: torch.Tensor + + def __init__( + self, + dtype=torch.quint8, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, + is_dynamic=False, + **kwargs, + ) -> None: + if not is_per_tensor(qscheme): + raise NotImplementedError( + "MinMaxObserver's qscheme only support torch.per_tensor_symmetric \ + and torch.per_tensor_affine." + ) + # TODO: MinMaxObserver by itself doesn't support dynamic quantization, but + # if it's inherited by MovingAverageObserver, and averaging_constant is 1, it + # supports dynamic quantization, we may need to better error checking here + + # For x86 quantized kernels, we need to ensure that the vpmaddubsw + # instruction does not overflow. We allow for a reduce_range argument to + # observers that reduces the quantized range to (0,127) or (-64, 63). + # For more details see aten/src/ATen/native/quantized/cpu/qconv.cpp + # This is not an optimal choice for non x86 backends as it loses a bit + # of precision for activations. 
+ super().__init__( + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + factory_kwargs=factory_kwargs, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) + self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) + if ( + self.qscheme == torch.per_tensor_symmetric + and self.reduce_range + and self.dtype == torch.quint8 + ): + raise NotImplementedError( + "Cannot reduce range for symmetric \ + quantization for quint8" + ) + + def forward(self, x_orig): + r"""Records the running minimum and maximum of ``x``.""" + if x_orig.numel() == 0: + return x_orig + x = x_orig.detach() # avoid keeping autograd tape + x = x.to(self.min_val.dtype) + min_val_cur, max_val_cur = torch.aminmax(x) + min_val = torch.min(min_val_cur, self.min_val) + max_val = torch.max(max_val_cur, self.max_val) + self.min_val.copy_(min_val) + self.max_val.copy_(max_val) + return x_orig + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + r"""Calculates the quantization parameters.""" + return self._calculate_qparams(self.min_val, self.max_val) + + @torch.jit.export + def extra_repr(self): + return f"min_val={self.min_val}, max_val={self.max_val}" + + @torch.jit.export + def reset_min_max_vals(self): + """Resets the min/max values.""" + self.min_val.copy_(torch.tensor(float("inf"))) + self.max_val.copy_(torch.tensor(float("-inf"))) + + +class MovingAverageMinMaxObserver(MinMaxObserver): + r"""Observer module for computing the quantization parameters based on the + moving average of the min and max values. + + This observer computes the quantization parameters based on the moving + averages of minimums and maximums of the incoming tensors. 
The module + records the average minimum and maximum of incoming tensors, and uses this + statistic to compute the quantization parameters. + + Args: + averaging_constant: Averaging constant for min/max. + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec. + qscheme: Quantization scheme to be used + reduce_range: Reduces the range of the quantized data type by 1 bit + quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. + quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. + + The moving average min/max is computed as follows + + .. math:: + + \begin{array}{ll} + x_\text{min} = \begin{cases} + \min(X) & \text{if~}x_\text{min} = \text{None} \\ + (1 - c) x_\text{min} + c \min(X) & \text{otherwise} + \end{cases}\\ + x_\text{max} = \begin{cases} + \max(X) & \text{if~}x_\text{max} = \text{None} \\ + (1 - c) x_\text{max} + c \max(X) & \text{otherwise} + \end{cases}\\ + \end{array} + + where :math:`x_\text{min/max}` is the running average min/max, :math:`X` is + is the incoming tensor, and :math:`c` is the ``averaging_constant``. + + The scale and zero point are then computed as in + :class:`~torch.ao.quantization.observer.MinMaxObserver`. + + .. note:: Only works with ``torch.per_tensor_affine`` quantization scheme. + + .. note:: If the running minimum equals to the running maximum, the scale + and zero_point are set to 1.0 and 0. + """ + + def __init__( + self, + averaging_constant=0.01, + dtype=torch.quint8, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + eps=torch.finfo(torch.float32).eps, + is_dynamic=False, + **kwargs, + ) -> None: + if not is_per_tensor(qscheme): + raise NotImplementedError( + f"MovingAverageMinMaxObserver's qscheme only support \ + torch.per_tensor_symmetric and torch.per_tensor_affine. 
\ + but got: {qscheme}" + ) + self.averaging_constant = averaging_constant + if is_dynamic and self.averaging_constant != 1: + raise NotImplementedError( + "MovingAverageMinMaxObserver doesn't support dynamic quantization for " + f"averaging constant of {self.averaging_constant}" + ) + super().__init__( + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + + def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig + x = x_orig.detach() # avoid keeping autograd tape + x = x.to(self.min_val.dtype) + min_val = self.min_val + max_val = self.max_val + if min_val == float("inf") and max_val == float("-inf"): + min_val, max_val = torch.aminmax(x) + else: + min_val_cur, max_val_cur = torch.aminmax(x) + min_val = min_val + self.averaging_constant * (min_val_cur - min_val) + max_val = max_val + self.averaging_constant * (max_val_cur - max_val) + self.min_val.copy_(min_val) + self.max_val.copy_(max_val) + return x_orig + + +class PerChannelMinMaxObserver(UniformQuantizationObserverBase): + r"""Observer module for computing the quantization parameters based on the + running per channel min and max values. + + This observer uses the tensor min/max statistics to compute the per channel + quantization parameters. The module records the running minimum and maximum + of incoming tensors, and uses this statistic to compute the quantization + parameters. + + Args: + ch_axis: Channel axis + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec. + qscheme: Quantization scheme to be used + reduce_range: Reduces the range of the quantized data type by 1 bit + quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. + quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. 
+ + The quantization parameters are computed the same way as in + :class:`~torch.ao.quantization.observer.MinMaxObserver`, with the difference + that the running min/max values are stored per channel. + Scales and zero points are thus computed per channel as well. + + .. note:: If the running minimum equals to the running maximum, the scales + and zero_points are set to 1.0 and 0. + """ + + min_val: torch.Tensor + max_val: torch.Tensor + + def __init__( + self, + ch_axis=0, + dtype=torch.quint8, + qscheme=torch.per_channel_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, + is_dynamic=False, + **kwargs, + ) -> None: + if not is_per_channel(qscheme): + raise NotImplementedError( + "PerChannelMinMaxObserver's qscheme only support \ + torch.per_channel_symmetric, torch.per_channel_affine and torch.per_channel_affine_float_qparams." + ) + if is_dynamic: + raise NotImplementedError( + "PerChannelMinMaxObserver doesn't support dynamic quantization" + ) + super().__init__( + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + factory_kwargs=factory_kwargs, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + self.ch_axis = ch_axis + self.register_buffer("min_val", torch.tensor([], **factory_kwargs)) + self.register_buffer("max_val", torch.tensor([], **factory_kwargs)) + if ( + self.qscheme == torch.per_channel_symmetric + and self.reduce_range + and self.dtype == torch.quint8 + ): + raise NotImplementedError( + "Cannot reduce range for symmetric quantization for quint8" + ) + + def forward(self, x_orig): + return self._forward(x_orig) + + def _forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig + x = x_orig.detach() # avoid keeping autograd tape + min_val = self.min_val + max_val = self.max_val + x_dim = x.size() + + new_axis_list = [i for i in range(len(x_dim))] # 
noqa: C416 + new_axis_list[self.ch_axis] = 0 + new_axis_list[0] = self.ch_axis + y = x.permute(new_axis_list) + # Need to match dtype of min/max because the updates to buffers + # are done in place and types need to match for comparisons + y = y.to(self.min_val.dtype) + y = torch.flatten(y, start_dim=1) + if min_val.numel() == 0 or max_val.numel() == 0: + min_val, max_val = torch.aminmax(y, dim=1) + else: + min_val_cur, max_val_cur = torch.aminmax(y, dim=1) + min_val = torch.min(min_val_cur, min_val) + max_val = torch.max(max_val_cur, max_val) + self.min_val.resize_(min_val.shape) + self.max_val.resize_(max_val.shape) + self.min_val.copy_(min_val) + self.max_val.copy_(max_val) + return x_orig + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + return self._calculate_qparams(self.min_val, self.max_val) + + def extra_repr(self): + return f"min_val={self.min_val}, max_val={self.max_val}" + + def _load_from_state_dict( + self, + state_dict: dict[str, Any], + prefix: str, + local_metadata: dict[str, torch.Tensor], + strict: bool, + missing_keys: list[str], + unexpected_keys: list[str], + error_msgs: list[str], + ): + version = local_metadata.get("version") + if version is not None and version < 3: + local_state = ["min_vals", "max_vals"] + expected_min_name = "min_vals" + expected_max_name = "max_vals" + else: + local_state = ["min_val", "max_val"] + expected_min_name = "min_val" + expected_max_name = "max_val" + for name in local_state: + key = prefix + name + if key in state_dict: + val = state_dict[key] + # Custom handling to allow loading min_val or max_val + # of size N into uninitialized buffers of size 0. The + # buffers are resized here, and the values are copied in + # the default state_dict loading code of the parent. 
+ if name == expected_min_name: + self.min_val.resize_(val.shape) + elif name == expected_max_name: + self.max_val.resize_(val.shape) + else: + warnings.warn( + f"Observer load_from_state_dict got unexpected name {name}", + stacklevel=2, + ) + # For torchscript module we need to update the attributes here since we do not + # call the `_load_from_state_dict` function defined module.py + if torch.jit.is_scripting(): + if name == expected_min_name: + self.min_val.copy_(val) + elif name == expected_max_name: + self.max_val.copy_(val) + else: + warnings.warn( + f"Observer load_from_state_dict got unexpected name {name}", + stacklevel=2, + ) + elif strict: + missing_keys.append(key) + + if not torch.jit.is_scripting(): + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + False, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def _load_from_state_dict_script( + self, + state_dict: dict[str, Any], + prefix: str, + local_metadata: dict[str, torch.Tensor], + strict: bool, + missing_keys: list[str], + unexpected_keys: list[str], + error_msgs: list[str], + ): + self._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + @torch.jit.export + def reset_min_max_vals(self): + """Resets the min/max values.""" + # This used to be torch.ones but that does not work because + # JIT compiler can optimize it via common subexpression elimination + # in which case both min_val and max_val point to the same tensor. + self.min_val = torch.rand( + 0, + ) + self.max_val = torch.rand( + 0, + ) + + +class MovingAveragePerChannelMinMaxObserver(PerChannelMinMaxObserver): + r"""Observer module for computing the quantization parameters based on the + running per channel min and max values. + + This observer uses the tensor min/max statistics to compute the per channel + quantization parameters. 
The module records the running minimum and maximum + of incoming tensors, and uses this statistic to compute the quantization + parameters. + + Args: + averaging_constant: Averaging constant for min/max. + ch_axis: Channel axis + dtype: Quantized data type + qscheme: Quantization scheme to be used + reduce_range: Reduces the range of the quantized data type by 1 bit + quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. + quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. + + The quantization parameters are computed the same way as in + :class:`~torch.ao.quantization.observer.MovingAverageMinMaxObserver`, with the + difference that the running min/max values are stored per channel. + Scales and zero points are thus computed per channel as well. + + .. note:: If the running minimum equals to the running maximum, the scales + and zero_points are set to 1.0 and 0. + """ + + def __init__( + self, + averaging_constant=0.01, + ch_axis=0, + dtype=torch.quint8, + qscheme=torch.per_channel_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + eps=torch.finfo(torch.float32).eps, + is_dynamic=False, + **kwargs, + ) -> None: + if not is_per_channel(qscheme): + raise NotImplementedError( + "MovingAveragePerChannelMinMaxObserver's qscheme only support \ + torch.per_channel_symmetric, torch.per_channel_affine and torch.per_channel_affine_float_qparams." 
+ ) + if is_dynamic: + raise NotImplementedError( + "MovingAveragePerChannelMinMaxObserver doesn't support dynamic quantization" + ) + super().__init__( + ch_axis=ch_axis, + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + self.averaging_constant = averaging_constant + + def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig + x = x_orig.detach() # avoid keeping autograd tape + x = x.to(self.min_val.dtype) + min_val = self.min_val + max_val = self.max_val + x_dim = x.size() + + new_axis_list = [i for i in range(len(x_dim))] # noqa: C416 + new_axis_list[self.ch_axis] = 0 + new_axis_list[0] = self.ch_axis + y = x.permute(new_axis_list) + y = torch.flatten(y, start_dim=1) + if min_val.numel() == 0 or max_val.numel() == 0: + min_val, max_val = torch.aminmax(y, dim=1) + else: + min_val_cur, max_val_cur = torch.aminmax(y, dim=1) + min_val = min_val + self.averaging_constant * (min_val_cur - min_val) + max_val = max_val + self.averaging_constant * (max_val_cur - max_val) + self.min_val.resize_(min_val.shape) + self.max_val.resize_(max_val.shape) + self.min_val.copy_(min_val) + self.max_val.copy_(max_val) + return x_orig + + +class HistogramObserver(UniformQuantizationObserverBase): + r""" + The module records the running histogram of tensor values along with + min/max values. ``calculate_qparams`` will calculate scale and zero_point. + + Args: + bins: Number of bins to use for the histogram + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec + qscheme: Quantization scheme to be used + reduce_range: Reduces the range of the quantized data type by 1 bit + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. + + The scale and zero point are computed as follows: + + 1. Create the histogram of the incoming inputs. 
+ The histogram is computed continuously, and the ranges per bin change + with every new tensor observed. + 2. Search the distribution in the histogram for optimal min/max values. + The search for the min/max values ensures the minimization of the + quantization error with respect to the floating point model. + 3. Compute the scale and zero point the same way as in the + :class:`~torch.ao.quantization.MinMaxObserver` + """ + + histogram: torch.Tensor + min_val: torch.Tensor + max_val: torch.Tensor + + def __init__( + self, + bins: int = 2048, + dtype: torch.dtype = torch.quint8, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, + is_dynamic=False, + **kwargs, + ) -> None: + if not is_per_tensor(qscheme): + raise NotImplementedError( + "HistogramObserver's qscheme only support torch.per_tensor_symmetric \ + and torch.per_tensor_affine." + ) + if is_dynamic: + raise NotImplementedError( + "HistogramObserver doesn't support dynamic quantization" + ) + # bins: The number of bins used for histogram calculation. 
+ super().__init__( + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + factory_kwargs=factory_kwargs, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + self.bins = bins + self.register_buffer("histogram", torch.zeros(self.bins, **factory_kwargs)) + self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) + self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) + self.dst_nbins = 2 ** torch.iinfo(self.dtype).bits + self.upsample_rate = ( + 16 # used to reduce quantization errors when upscaling histogram + ) + + def _get_norm( + self, delta_begin: torch.Tensor, delta_end: torch.Tensor, density: torch.Tensor + ) -> torch.Tensor: + r""" + Compute the norm of the values uniformaly distributed between + delta_begin and delta_end. + Currently only L2 norm is supported. + + norm = density * (integral_{begin, end} x^2) + = density * (end^3 - begin^3) / 3 + """ + norm = ( + delta_end * delta_end * delta_end - delta_begin * delta_begin * delta_begin + ) / 3 + return density * norm + + def _compute_quantization_error(self, next_start_bin: int, next_end_bin: int): + r""" + Compute the quantization error if we use start_bin to end_bin as the + min and max to do the quantization. + """ + bin_width = (self.max_val.item() - self.min_val.item()) / self.bins + + dst_bin_width = bin_width * (next_end_bin - next_start_bin + 1) / self.dst_nbins + if dst_bin_width == 0.0: + return 0.0 + + src_bin = torch.arange(self.bins, device=self.histogram.device) + # distances from the beginning of first dst_bin to the beginning and + # end of src_bin + src_bin_begin = (src_bin - next_start_bin) * bin_width + src_bin_end = src_bin_begin + bin_width + + # which dst_bins the beginning and end of src_bin belong to? 
+ dst_bin_of_begin = torch.clamp( + torch.div(src_bin_begin, dst_bin_width, rounding_mode="floor"), + 0, + self.dst_nbins - 1, + ) + dst_bin_of_begin_center = (dst_bin_of_begin + 0.5) * dst_bin_width + + dst_bin_of_end = torch.clamp( + torch.div(src_bin_end, dst_bin_width, rounding_mode="floor"), + 0, + self.dst_nbins - 1, + ) + density = self.histogram / bin_width + + norm = torch.zeros(self.bins, device=self.histogram.device) + + delta_begin = src_bin_begin - dst_bin_of_begin_center + delta_end = dst_bin_width / 2 + norm += self._get_norm( + delta_begin, + torch.ones(self.bins, device=self.histogram.device) * delta_end, + density, + ) + + norm += (dst_bin_of_end - dst_bin_of_begin - 1) * self._get_norm( + torch.tensor(-dst_bin_width / 2), torch.tensor(dst_bin_width / 2), density + ) + + dst_bin_of_end_center = dst_bin_of_end * dst_bin_width + dst_bin_width / 2 + + delta_begin = -dst_bin_width / 2 + delta_end = src_bin_end - dst_bin_of_end_center + norm += self._get_norm(torch.tensor(delta_begin), delta_end, density) + + return norm.sum().item() + + def _non_linear_param_search(self) -> tuple[torch.Tensor, torch.Tensor]: + r"""Non-linear parameter search. + + An approximation for L2 error minimization for selecting min/max. + By selecting new min/max, we filter out outliers in input distribution. 
+ This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in + caffe2/quantization/server/norm_minimization.cc + """ + if self.histogram.size()[0] != self.bins: + raise AssertionError("bins mismatch") + bin_width = (self.max_val - self.min_val) / self.bins + + # cumulative sum + total = torch.sum(self.histogram).item() + cSum = torch.cumsum(self.histogram, dim=0) + + stepsize = 1e-5 # granularity + alpha = 0.0 # lower bound + beta = 1.0 # upper bound + start_bin = 0 + end_bin = self.bins - 1 + norm_min = float("inf") + + while alpha < beta: + # Find the next step + next_alpha = alpha + stepsize + next_beta = beta - stepsize + + # find the left and right bins between the quantile bounds + l = start_bin + r = end_bin + while l < end_bin and cSum[l] < next_alpha * total: + l = l + 1 + while r > start_bin and cSum[r] > next_beta * total: + r = r - 1 + + # decide the next move + next_start_bin = start_bin + next_end_bin = end_bin + if (l - start_bin) > (end_bin - r): + # move the start bin + next_start_bin = l + alpha = next_alpha + else: + # move the end bin + next_end_bin = r + beta = next_beta + + if next_start_bin == start_bin and next_end_bin == end_bin: + continue + + # calculate the quantization error using next_start_bin and next_end_bin + norm = self._compute_quantization_error(next_start_bin, next_end_bin) + + if norm > norm_min: + break + norm_min = norm + start_bin = next_start_bin + end_bin = next_end_bin + + new_min = self.min_val + bin_width * start_bin + new_max = self.min_val + bin_width * (end_bin + 1) + return new_min, new_max + + def _upscale_histogram( + self, + histogram: torch.Tensor, + orig_min: torch.Tensor, + orig_max: torch.Tensor, + update_min: torch.Tensor, + update_max: torch.Tensor, + ): + # this turns the histogram into a more fine-coarsed histogram to reduce + # bin quantization errors + histogram = histogram.repeat_interleave(self.upsample_rate) / self.upsample_rate + bin_size = (orig_max - orig_min) / 
(self.bins * self.upsample_rate) + mid_points_histogram = ( + torch.linspace( + orig_min, + orig_max, + self.bins * self.upsample_rate + 1, + device=orig_min.device, + )[:-1].to(histogram.device) + + 0.5 * bin_size + ) + boundaries_new_histogram = torch.linspace( + update_min, update_max, self.bins + 1, device=update_min.device + ).to(histogram.device) + # this maps the mid-points of the histogram to the new histogram's space + bucket_assignments = ( + torch.bucketize(mid_points_histogram, boundaries_new_histogram, right=True) + - 1 + ) + # this then maps the histogram mid-points in the new space, weighted by the original histogram's values + # this is just the old histogram in the new histogram's space + + # In case due to numerical issues the values land higher/lower than the maximum/minimum + bucket_assignments[bucket_assignments >= self.bins] = self.bins - 1 + bucket_assignments[bucket_assignments < 0] = 0 + + update_histogram = torch.bincount( + bucket_assignments, weights=histogram, minlength=self.bins + ) + return update_histogram + + def _combine_histograms( + self, + orig_hist: torch.Tensor, + orig_min: torch.Tensor, + orig_max: torch.Tensor, + update_hist: torch.Tensor, + update_min: torch.Tensor, + update_max: torch.Tensor, + ) -> torch.Tensor: + # If the new min and max are the same as the current min and max, + # we can just add the new histogram to the original histogram + if update_min == orig_min and update_max == orig_max: + return orig_hist + update_hist + + # If the orig hist only has one value (i.e., the min and max are the same) + # we can just add it into new histogram + if orig_min == orig_max: + bin_value = torch.sum(orig_hist) + transformed_orig_hist = ( + torch.histc(orig_min, bins=self.bins, min=update_min, max=update_max) # type: ignore[arg-type] + * bin_value + ) + return transformed_orig_hist + update_hist + + # We assume the update_hist is already in the target range, we will map the orig_max to it + if update_min > orig_min: + raise 
AssertionError("update_min must be <= orig_min") + if update_max < orig_max: + raise AssertionError("update_max must be >= orig_max") + + # Now we need to turn the old_histogram, into the range of the new histogram + transformed_orig_hist = self._upscale_histogram( + orig_hist, + orig_min, + orig_max, + update_min, + update_max, + ) + + return update_hist + transformed_orig_hist + + def reset_histogram( + self, x: torch.Tensor, min_val: torch.Tensor, max_val: torch.Tensor + ) -> None: + self.min_val.resize_(min_val.shape) + self.min_val.copy_(min_val) + self.max_val.resize_(max_val.shape) + self.max_val.copy_(max_val) + if min_val.numel() != 1 or max_val.numel() != 1: + raise AssertionError("histogram min/max values must be scalar.") + new_histogram = torch.histc(x, self.bins, min=min_val, max=max_val) # type: ignore[arg-type] + self.histogram.detach_().resize_(new_histogram.shape) + self.histogram.copy_(new_histogram) + + def forward(self, x_orig: torch.Tensor) -> torch.Tensor: # pyre-ignore[14] + if x_orig.numel() == 0: + return x_orig + x = x_orig.detach() + x_min, x_max = torch.aminmax(x) + # want to ignore torch.inf since we don't actually + # want to make our quantization range infinite + # and in practice those values will be clamped + if x_min == -torch.inf or x_max == torch.inf: + warnings.warn( + "torch.inf detected in input tensor, ignoring input", stacklevel=2 + ) + x = x[x.abs() != torch.inf] + if x.numel() == 0: + return x_orig + x_min, x_max = torch.aminmax(x) + + current_min = self.min_val + current_max = self.max_val + + is_uninitialized = self.min_val == float("inf") or self.max_val == float("-inf") + if is_uninitialized: + self.reset_histogram(x, x_min, x_max) + else: + update_min, update_max = x_min, x_max + new_min = torch.min(current_min, update_min) + new_max = torch.max(current_max, update_max) + + # TODO: For some reason, this is required for it to pass torchscript test + # new_min and new_max should already have requires_grad set to False 
+ new_min, new_max = new_min.detach(), new_max.detach() + update_histogram = torch.histc( + x, + self.bins, + min=new_min, # type: ignore[arg-type] + max=new_max, # type: ignore[arg-type] + ).to(self.histogram.device) + if new_min == current_min and new_max == current_max: + combined_histogram = self.histogram + update_histogram + self.histogram.detach_().resize_(combined_histogram.shape) + self.histogram.copy_(combined_histogram) + else: + combined_histogram = self._combine_histograms( + self.histogram, + current_min, + current_max, + update_histogram, + new_min, + new_max, + ) + self.histogram.detach_().resize_(combined_histogram.shape) + self.histogram.copy_(combined_histogram) + self.min_val.detach_().resize_(new_min.shape) + self.min_val.copy_(new_min) + self.max_val.detach_().resize_(new_max.shape) + self.max_val.copy_(new_max) + + return x_orig + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + is_uninitialized = self.min_val == float("inf") and self.max_val == float( + "-inf" + ) + if is_uninitialized: + warnings.warn( + "must run observer before calling calculate_qparams.\ + Returning default scale and zero point ", + stacklevel=2, + ) + return torch.tensor([1.0], device=self.min_val.device.type), torch.tensor( + [0], device=self.min_val.device.type + ) + if self.bins != len(self.histogram): + raise AssertionError( + "The number of bins in histogram should be equal to the number of bins " + "supplied while making this observer" + ) + + new_min, new_max = self._non_linear_param_search() + + return self._calculate_qparams(new_min, new_max) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + "min_val"] = self.min_val + destination[prefix + "max_val"] = self.max_val + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = 
local_metadata.get("version", None) + + if version is None or version < 3: + # if min_val and max_val are not initialized, update their shape + # to account for the differences between v2 and v3 + min_val_name, max_val_name = prefix + "min_val", prefix + "max_val" + if min_val_name in state_dict: + if state_dict[min_val_name].shape == torch.Size([0]): + state_dict[min_val_name] = torch.tensor(float("inf")) + if max_val_name in state_dict: + if state_dict[max_val_name].shape == torch.Size([0]): + state_dict[max_val_name] = torch.tensor(float("-inf")) + + local_state = ["min_val", "max_val"] + for name in local_state: + key = prefix + name + if key in state_dict: + val = state_dict[key] + setattr(self, name, val) + elif strict: + missing_keys.append(key) + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def extra_repr(self): + return f"min_val={self.min_val}, max_val={self.max_val}" + + +class FixedQParamsObserver(ObserverBase): + r""" + Observer that simulates quantize and dequantize with fixed + quantization parameters in training time. Only per tensor + quantization is supported. 
+ + Args: + `scale` (float): fixed scale for the observer + `zero_point` (int): fixed zero point for the observer + `dtype`, `qscheme`, `quant_min`, `quant_max` + """ + + scale: torch.Tensor + zero_point: torch.Tensor + + def __init__( + self, + scale, + zero_point, + dtype=torch.quint8, + qscheme=torch.per_tensor_affine, + quant_min=0, + quant_max=255, + is_dynamic=False, + **kwargs, + ): + if is_dynamic: + raise NotImplementedError( + "FixedQParamsObserver doesn't support dynamic quantization" + ) + super().__init__(dtype=dtype, is_dynamic=is_dynamic, **kwargs) + self.quant_min = quant_min + self.quant_max = quant_max + self.register_buffer("scale", torch.tensor([scale], dtype=torch.float)) + self.register_buffer("zero_point", torch.tensor([zero_point], dtype=torch.int)) + self.dtype = dtype + self.qscheme = qscheme + + def forward(self, X): + return X + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + return self.scale, self.zero_point + + +class PlaceholderObserver(ObserverBase): + r""" + Observer that doesn't do anything and just passes its configuration to the + quantized module's ``.from_float()``. + + Can be used for quantization to float16 which doesn't require determining + ranges. + + Args: + dtype: dtype argument to the `quantize` node needed to implement the + reference model spec. + quant_min: minimum value in quantized domain (TODO: align behavior with other observers) + quant_max: maximum value in quantized domain + custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation + (Can be used in Graph Mode Passes for special case ops). + compute_dtype (deprecated): if set, marks the future quantize function to use + dynamic quantization instead of static quantization. + This field is deprecated, use `is_dynamic=True` instead. 
+ is_dynamic: if True, the `quantize` function in the reference model + representation taking stats from this observer instance will + use dynamic quantization. + """ + + def __init__( + self, + dtype=torch.float32, + custom_op_name="", + compute_dtype=None, + quant_min=None, + quant_max=None, + qscheme=None, + eps=None, + is_dynamic=False, + ) -> None: + super().__init__(dtype=dtype, is_dynamic=is_dynamic) + if qscheme is None: + qscheme = torch.per_tensor_affine + if eps is None: + eps = torch.finfo(torch.float32).eps + + # dtype of input of the target operator, e.g. for dynamic quantization + # ops, the dtype will be float32 + self.dtype = dtype + self.qscheme = qscheme + self.quant_min = quant_min + self.quant_max = quant_max + self.eps = eps + self.custom_op = custom_op_name + # used for configuration of computation type for dynamic quantization + if compute_dtype: + is_dynamic = True + warnings.warn( + "Please use `is_dynamic` instead of `compute_dtype`. \ + `compute_dtype` will be deprecated in a future release \ + of PyTorch.", + stacklevel=2, + ) + + def forward(self, x): + return x + + @torch.jit.export + def extra_repr(self): + return f"dtype={self.dtype}, is_dynamic={self.is_dynamic}" + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + raise Exception( # noqa: TRY002 + "calculate_qparams should not be called for PlaceholderObserver" + ) + + +class RecordingObserver(ObserverBase): + r""" + The module is mainly for debug and records the tensor values during runtime. 
+ + Args: + dtype: Quantized data type + qscheme: Quantization scheme to be used + reduce_range: Reduces the range of the quantized data type by 1 bit + """ + + __annotations__ = {"tensor_val": list[torch.Tensor | None]} + + def __init__(self, dtype=torch.quint8): + super().__init__(dtype=dtype, is_dynamic=False) + self.tensor_val = [] + + def forward(self, x): + self.tensor_val.append(x.clone()) + return x + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + raise Exception( # noqa: TRY002 + "calculate_qparams should not be called for RecordingObserver" + ) + + @torch.jit.export + def get_tensor_value(self): + return self.tensor_val + + +class NoopObserver(ObserverBase): + r""" + Observer that doesn't do anything and just passes its configuration to the + quantized module's ``.from_float()``. + + Primarily used for quantization to float16 which doesn't require determining + ranges. + + Args: + dtype: Quantized data type + custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation + (Can be used in Graph Mode Passes for special case ops). + """ + + def __init__(self, dtype=torch.float16, custom_op_name="") -> None: + super().__init__(dtype=dtype, is_dynamic=False) + self.dtype = dtype + self.custom_op = custom_op_name + + def forward(self, x): + return x + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + raise Exception( # noqa: TRY002 + "calculate_qparams should not be called for NoopObserver" + ) + + +class ReuseInputObserver(ObserverBase): + r"""This observer is used when we want to reuse the observer from the operator + that produces the input Tensor, typically used for operators like reshape, e.g. + ``` + x0 = ... + x1 = x0.reshape() + ``` + if we configure x0 to be observed by some observer, let's say MinMaxObserver, + and reshape is configured with ReuseInputObserver, we'll reuse the observer instance + for x0 for x1 (output of reshape). 
If x0 is not observed, we also won't observe x1. + + Note: this is only enabled in FX Graph Mode Quantization + """ + + def __init__(self) -> None: + super().__init__(torch.quint8, is_dynamic=False) + + def forward(self, x): + return x + + @torch.jit.export + def calculate_qparams(self): # type: ignore[override] + raise Exception( # noqa: TRY002 + "calculate_qparams should not be called for ReuseInputObserver" + ) + + +""" +# Experimental Affine Quantization Feature START +We plan to merge the following with torchao repo after we move pt2e flow to torchao +copied from https://github.com/pytorch/ao/blob/main/torchao/quantization/observer.py +""" +from dataclasses import dataclass +from enum import auto, Enum + + +class MappingType(Enum): + """How floating point number is mapped to integer number + + symmetric mapping means floating point range is symmetrically mapped to integer range + let's say we have floating point range (-3.5, 10.2) and integer range (-8, 7) (int4) + we'll use (-10.2, 10.2) as the range for floating point and map that to (-8, 7) + e.g. scale = (10.2 - (-10.2)) / (7 - (-8)) + + SYMMETRIC_NO_CLIPPING_ERR is a variant of symmetric mapping, where the scale is the max of smin + and smax, where smin = min_val_neg / quant_min, and smax = max_val_pos / quant_max. By calculating + smin and smax individually, there can be less round error on negative values, and no out-of-range + of all floating point values. + + asymmetric mapping means we just directly map the floating point range to integer range, + for the above example, we will map (-3.5, 10.2) to (-8, 7) and calculate quantization parameter + based on this mapping + e.g. 
scale = (10.2 - (-3.5)) / (7 - (-8)) + """ + + SYMMETRIC = auto() + SYMMETRIC_NO_CLIPPING_ERR = auto() + ASYMMETRIC = auto() + + +class ZeroPointDomain(Enum): + """Enum that indicate whether zero_point is in integer domain or floating point domain + + integer domain: quantized_val = (float_val / scale) (integer) + zero_point (integer) + float domain: quantized_val = (float_val - (zero_point (float) - scale * mid_point)) / scale + none domain: quantized_val = (float_val / scale) + """ + + INT = auto() + FLOAT = auto() + NONE = auto() + + +class TorchAODType(Enum): + """ + Placeholder for dtypes that do not exist in PyTorch core yet. + """ + + # torch.int1 to torch.int7 will be added to PyTorch 2.6 + # These will remain here for BC with older PyTorch versions + INT1 = auto() + INT2 = auto() + INT3 = auto() + INT4 = auto() + INT5 = auto() + INT6 = auto() + INT7 = auto() + + +@dataclass(frozen=True) +class Granularity: + """ + Base class for representing the granularity of quantization. + + This class serves as a parent for specific granularity types used in + quantization operations, such as per-tensor or per-axis quantization. + """ + + +@dataclass(frozen=True) +class PerBlock(Granularity): + """ + Represents per-block granularity in quantization. See + :func:`~torchao.quantization.quant_primitives.quantize_affine` for docs for + `block_size` + + Attributes: + block_size (Tuple[int, ...]): The size of each quantization group + """ + + block_size: tuple[int, ...] + + +@dataclass(frozen=True) +class PerTensor(Granularity): + """ + Represents per-tensor granularity in quantization. + + This granularity type calculates the quantization parameters + based off the entire tensor. + + """ + + +@dataclass(frozen=True) +class PerAxis(Granularity): + """ + Represents per-axis granularity in quantization. + + This granularity type calculates different quantization parameters + along a specified axis of the tensor. 
+ + For example if the input tensor is shape [8, 16] and axis=0, then + the quantization parameters are calculated for each row of the tensor. + Giving a total of 8 quantization parameters. + + Attributes: + axis (int): The axis along which reduction is performed. + """ + + axis: int + + +@dataclass(frozen=True) +class PerGroup(Granularity): + """ + Represents per-channel group granularity in quantization. + + This granularity type calculates different quantization parameters + for each group of elements. + + For example if the input tensor is shape [8, 16], and the group size is 4, then + the input tensor is reshaped to [64, 4] + quantization parameters are calculated for each group of 4 elements, + giving a total of 64 quantization parameters. + + Attributes: + group_size (int): The size of each quantization group + + """ + + group_size: int + + +class PerRow(Granularity): + """ + Represents row-wise granularity in quantization. + + This is a special case of per-axis quantization and is unique to Float8 matmuls + where the input is quantized with a block_size of (1, ..., input.shape[-1]). And the weight + is quantized with a block_size of (1, weight.shape[1]). + """ + + +class PerToken(Granularity): + """ + Represents per-token granularity in quantization. + + This granularity type calculates a different set of quantization parameters + for each token, which is represented as the last dimension of the tensor. + + For example, if the input tensor has shape [2, 3, 4], then there are 6 tokens + with 4 elements each, and we will calculate 6 sets of quantization parameters, + one for each token. + + If the input tensor has only two dimensions, e.g. [8, 16], then this is + equivalent to `PerAxis(axis=0)`, which yields 8 sets of quantization parameters. + """ + + +def get_block_size( + input_shape: tuple[int, ...], granularity: Granularity +) -> tuple[int, ...]: + """Get the block size based on the input shape and granularity type. 
+ + Args: + input_shape: The input tensor shape possibly more than 2 dimensions + granularity: The granularity type of the quantization + """ + if not isinstance(granularity, Granularity): + raise AssertionError( + "Please provide an instance of Granularity, not subclass of it" + ) + if isinstance(granularity, PerTensor): + return input_shape + elif isinstance(granularity, PerAxis): + block_size = list(input_shape) + block_size[granularity.axis] = 1 + return tuple(block_size) + elif isinstance(granularity, PerRow): + return (1,) * (len(input_shape) - 1) + (input_shape[-1],) + elif isinstance(granularity, PerGroup): + if len(input_shape) != 2: + raise AssertionError( + f"Expecting input shape dim to be 2 for per group quantization, gotinput shape: {input_shape}" + ) + return (1, granularity.group_size) + elif isinstance(granularity, PerToken): + block_size = [1] * len(input_shape) + block_size[-1] = input_shape[-1] + return tuple(block_size) + raise ValueError(f"Unsupported Granularity: {granularity}") + + +class AffineQuantizedObserverBase(ABC, torch.nn.Module): + """Observer module for affine quantization (https://github.com/pytorch/ao/tree/main/torchao/quantization#affine-quantization) + + Args: + `granularity` and `block_size`: The granularity of the quantization, + must specify at least one, if both are specified `block_size` takes precedence + Current supported granularity type are `PerTensor` and `PerAxis` + other args: please see `:class:torchao.dtypes.AffineQuantizedTensor` + """ + + with_args = classmethod(_with_args) + + def __init__( + self, + mapping_type: MappingType, + target_dtype: torch.dtype, + granularity: Granularity, + quant_min: int | None = None, + quant_max: int | None = None, + eps: float | None = None, + scale_dtype: torch.dtype | None = None, + zero_point_dtype: torch.dtype | None = None, + preserve_zero: bool = True, + zero_point_domain: ZeroPointDomain | None = ZeroPointDomain.INT, + # there could be some extra args that's ignored + 
**kwargs, + ): + super().__init__() + if granularity is None: + raise AssertionError("granularity is None") + self.mapping_type = mapping_type + self.target_dtype = target_dtype + self.granularity = granularity + self.quant_min = quant_min + self.quant_max = quant_max + self.eps = eps + self.scale_dtype = scale_dtype + self.zero_point_dtype = zero_point_dtype + self.preserve_zero = preserve_zero + self.zero_point_domain = zero_point_domain + # populatd during forward + self.block_size = None + self.original_dtype = None + + @abstractmethod + def forward(self, input: torch.Tensor) -> torch.Tensor: + """forward function should take the input tensor + and updates internal stats and return the original input Tensor + """ + + @abstractmethod + def calculate_qparams(self) -> tuple[torch.Tensor, torch.Tensor]: + """Calculate quantization parameter based on the stats attached to the observer module + and returns a tuple of scale and zero_point Tensor + """ + + def convert(self, model: torch.fx.GraphModule, observer_node: Node): + """ + Converts the observer node in the graph into its quantized representation + + Args: + model: graph module to convert the observer node in + observer_node: the observer node to convert + """ + from torch.ao.quantization.fx.utils import create_getattr_from_value + + with model.graph.inserting_before(observer_node): + if self.block_size is None: + raise AssertionError("Expecting block_size to be populated") + if self.original_dtype is None: + raise AssertionError("Expecting original_dtype to be populated") + if hasattr(self, "is_dynamic") and self.is_dynamic: + choose_qparams_affine = model.graph.call_function( + torch.ops.pt2e_quant.choose_qparams_affine, + ( + observer_node.args[0], + self.mapping_type.name, + self.block_size, + self.target_dtype, + self.quant_min, + self.quant_max, + self.eps, + self.scale_dtype, + self.zero_point_dtype, + self.preserve_zero, + self.zero_point_domain.name, + ), + ) + scale_node = model.graph.call_function( + 
operator.getitem, (choose_qparams_affine, 0) + ) + zero_point_node = model.graph.call_function( + operator.getitem, (choose_qparams_affine, 1) + ) + else: + scale, zero_point = self.calculate_qparams() + scale_node = create_getattr_from_value( + model, + model.graph, + "_scale", + scale, + scale.device if isinstance(scale, torch.Tensor) else None, + ) + zero_point_node = create_getattr_from_value( + model, + model.graph, + "_zero_point", + zero_point, + zero_point.device if isinstance(zero_point, torch.Tensor) else None, + ) + + q_node = model.graph.call_function( + torch.ops.pt2e_quant.quantize_affine, + ( + observer_node.args[0], + self.block_size, + scale_node, + zero_point_node, + self.target_dtype, + self.quant_min, + self.quant_max, + self.zero_point_domain.name, + ), + {}, + ) + dq_node = model.graph.call_function( + torch.ops.pt2e_quant.dequantize_affine, + ( + q_node, + self.block_size, + scale_node, + zero_point_node, + self.target_dtype, + self.quant_min, + self.quant_max, + self.zero_point_domain.name, + ), + {"output_dtype": self.original_dtype}, + ) + observer_node.replace_all_uses_with(dq_node) + model.graph.erase_node(observer_node) + + +def _is_observer_script_module(mod, obs_type_name): + """Returns true if given mod is an instance of Observer script module.""" + if isinstance(mod, torch.jit.RecursiveScriptModule): + # qualified name looks like '__torch__.torch.ao.quantization.observer.___torch_mangle_2.MinMaxObserver' + suffix = mod._c.qualified_name.split(".", 1)[1] + name = re.sub(r"\.___torch_mangle_\d+", "", suffix) + return obs_type_name in name + return False + + +# Experimental Affine Quantization Feature END + + +def _is_activation_post_process(module): + return isinstance( + module, + ( + torch.ao.quantization.ObserverBase, + torch.ao.quantization.FakeQuantizeBase, + AffineQuantizedObserverBase, + ), + ) or _is_observer_script_module(module, "quantization.observer") + + +def _is_per_channel_script_obs_instance(module): + if 
isinstance(module, torch.jit.RecursiveScriptModule): + return _is_observer_script_module( + module, "quantization.observer.PerChannelMinMaxObserver" + ) or _is_observer_script_module( + module, "quantization.observer.MovingAveragePerChannelMinMaxObserver" + ) + return False + + +def get_observer_state_dict(mod): + r""" + Returns the state dict corresponding to the observer stats. + Traverse the model state_dict and extract out the stats. + """ + od = OrderedDict() + if isinstance(mod, torch.jit.RecursiveScriptModule): + for k, v in mod.state_dict().items(): + if "observer" in k: + od[k] = v + else: + # path for GraphModule and nn.Module (eager mode) + for k, v in mod.state_dict().items(): + if "activation_post_process" in k: + od[k] = v + od._metadata = mod.state_dict()._metadata # type: ignore[attr-defined] + return od + + +def load_observer_state_dict(mod, obs_dict): + r""" + Given input model and a state_dict containing model observer stats, + load the stats back into the model. The observer state_dict can be saved + using torch.ao.quantization.get_observer_state_dict + """ + missing_keys: list[str] = [] + unexpected_keys: list[str] = [] + for name, module in mod.named_modules(): + prefix = name + "." + if _is_activation_post_process(module): + if _is_per_channel_script_obs_instance(module): + # For per-channel observers we need to call a custom load_from_state_dict to resize the tensor. 
+ # However this is not called when the module is scripted and we end up calling the default one in module.py + module._load_from_state_dict_script( + obs_dict, prefix, {}, True, missing_keys, unexpected_keys, [] + ) + else: + module._load_from_state_dict( + obs_dict, prefix, {}, False, missing_keys, unexpected_keys, [] + ) + for k in missing_keys: + if "observer" in k or "activation_post_process" in k: + raise Exception( # noqa: TRY002 + f"Missing keys for observer {k} in state_dict" + ) + for k in unexpected_keys: + if "observer" in k or "activation_post_process" in k: + raise Exception( # noqa: TRY002 + f"Unexpected keys for observer {k} in state_dict" + ) + + +# Restrict activations to be in the range (0,127) +default_observer = MinMaxObserver.with_args(quant_min=0, quant_max=127) +""" +Default observer for static quantization, usually used for debugging. +""" + +default_placeholder_observer = PlaceholderObserver +""" +Default placeholder observer, usually used for quantization to torch.float16. +""" + +default_debug_observer = RecordingObserver +""" +Default debug-only observer. +""" + +default_weight_observer = MinMaxObserver.with_args( + dtype=torch.qint8, qscheme=torch.per_tensor_symmetric +) +""" +Default weight observer. +""" + +weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args( + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, + quant_min=-127, + quant_max=127, + eps=2**-12, +) +""" +Symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128. +""" + +default_histogram_observer = HistogramObserver.with_args(quant_min=0, quant_max=127) +""" +Default histogram observer, usually used for PTQ. +""" + +default_per_channel_weight_observer = PerChannelMinMaxObserver.with_args( + dtype=torch.qint8, qscheme=torch.per_channel_symmetric +) +""" +Default per-channel weight observer, usually used on backends where per-channel +weight quantization is supported, such as `fbgemm`. 
+""" + +per_channel_weight_observer_range_neg_127_to_127 = PerChannelMinMaxObserver.with_args( + dtype=torch.qint8, + qscheme=torch.per_channel_symmetric, + quant_min=-127, + quant_max=127, + eps=2**-12, +) +""" +Per-channel, symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128. +""" + +default_dynamic_quant_observer = PlaceholderObserver.with_args( + dtype=torch.quint8, + quant_min=0, + quant_max=255, + is_dynamic=True, +) +""" +Default observer for dynamic quantization. +""" + +default_float_qparams_observer = PerChannelMinMaxObserver.with_args( + dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0 +) +""" +Default observer for a floating point zero-point. +""" + +default_float_qparams_observer_4bit = PerChannelMinMaxObserver.with_args( + dtype=torch.quint4x2, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0 +) +""" +Default observer for a floating point zero-point and 4 bit activations. +""" + +# TODO(future PR): remove these defaults and enforce activation functions +# to explicitly specify their output range +default_fixed_qparams_range_neg1to1_observer = FixedQParamsObserver.with_args( + scale=2.0 / 256.0, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255 +) +default_fixed_qparams_range_0to1_observer = FixedQParamsObserver.with_args( + scale=1.0 / 256.0, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255 +) +# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases +default_symmetric_fixed_qparams_observer = default_fixed_qparams_range_neg1to1_observer +default_affine_fixed_qparams_observer = default_fixed_qparams_range_0to1_observer + +""" +Default observers for fixed qparams operations. 
+""" + +default_reuse_input_observer = ReuseInputObserver +""" +Default observer for operators like reshape that reuses the observer of input to +the operator +""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b34d3417ce1c5ae055003725be7918a679d03f88 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48238f38746a62273e1001fab81e2f50ca0f803d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8876d439feb41929ca9b64f3f023db499eac007b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/__init__.py @@ -0,0 +1,6 @@ +from .rewrite import reference_representation_rewrite + + +__all__ = [ + "reference_representation_rewrite", +] diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py new file mode 100644 index 0000000000000000000000000000000000000000..52084784f5036a92a909ad7f044d733677e48618 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py @@ -0,0 +1,825 @@ +# mypy: allow-untyped-defs +from collections.abc import Callable +from dataclasses import dataclass +from functools import partial +from typing import Any + +import torch +from torch._export.utils import _disable_aten_to_metadata_assertions +from torch._higher_order_ops.out_dtype import out_dtype +from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa: F401 +from torch.ao.quantization.pt2e.export_utils import _WrapperModule +from torch.ao.quantization.pt2e.utils import ( + _get_aten_graph_module_for_pattern, + _replace_literals_with_existing_placeholders, + _replace_literals_with_new_placeholders, + remove_tensor_overload_for_qdq_ops, +) +from torch.fx import GraphModule +from torch.fx.subgraph_rewriter import replace_pattern + + +__all__ = [ + "reference_representation_rewrite", +] + + +def _qdq_quantized_linear( + x_i8, + x_scale, + x_zero_point, + x_quant_min, + x_quant_max, + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + bias_fp32, + out_scale, + out_zero_point, + out_quant_min, + out_quant_max, +): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8 + ) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + torch.int8, + ) + out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32) + out_i8 = 
torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8 + ) + return out_i8 + + +def _reference_quantized_linear( + x_i8, + x_scale, + x_zero_point, + x_quant_min, + x_quant_max, + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + bias_fp32, + out_scale, + out_zero_point, + out_quant_min, + out_quant_max, +): + # without using quant_min/max in clamp, the traced graph will not have quant_mi/max args. + # This results in failure to match the pattern. + # Therefore, we call a torch.ops.aten.clamp here + x_i8 = torch.ops.aten.clamp(x_i8, x_quant_min, x_quant_max) + weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max) + + x_i16 = x_i8.to(torch.int16) + weight_i16 = weight_i8.to(torch.int16) + # always set bias to None so that the same representation can work for the case + # no matter if bias_scale == x_scale * weight_scale or not + acc_i32 = out_dtype( + torch.ops.aten.linear.default, + torch.int32, + x_i16 - x_zero_point, + weight_i16 - weight_zero_point, + None, + ) + # TODO: change to mul.Scalar + # Note: we are quantizing bias with these scales without signal from user, but it might be OK + bias_scale = x_scale * weight_scale + bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale) + acc_i32 = acc_i32 + bias_i32 + # TODO: change to mul.Scalar when we make x_scale/weight_scale etc. 
Scalar values + acc_i32 = ( + out_dtype( + torch.ops.aten.mul.Tensor, + torch.int32, + acc_i32, + x_scale * weight_scale / out_scale, + ) + + out_zero_point + ) + out_i8 = torch.ops.aten.clamp(acc_i32, out_quant_min, out_quant_max).to(torch.int8) + return out_i8 + + +def _qdq_dynamic_quantized_linear( + x_fp32, + x_quant_min, + x_quant_max, + x_eps, + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + bias_fp32, +): + x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams( + x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8 + ) + x_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + x_fp32, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8 + ) + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8 + ) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + torch.int8, + ) + out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32) + return out_fp32 + + +def _reference_dynamic_quantized_linear( + x_fp32, + x_quant_min, + x_quant_max, + x_eps, + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + bias_fp32, +): + x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams( + x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8 + ) + # decomposed representation for quantize_per_tensor + # TODO: use out_dtype(mul, ...) 
here when the op is ready + x_fp32 = x_fp32 / x_scale # fp32 + # round modes might be different here + # pytorch is rounding to even, which is also common for most of the backends + x_fp32 = torch.round(x_fp32) # fp32 + x_i32 = x_fp32.to(dtype=torch.int32) # int32 + x_i32 = x_i32 + x_zero_point # int32 + # clamp works for fp32, int32 and int8 dtypes + x_i32 = torch.clamp(x_i32, x_quant_min, x_quant_max) # int32 + x_i8 = x_i32.to(dtype=torch.int8) + + weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max) + + x_i16 = x_i8.to(torch.int16) + weight_i16 = weight_i8.to(torch.int16) + # always set bias to None so that the same representation can work for the case + # no matter if bias_scale == x_scale * weight_scale or not + acc_i32 = out_dtype( + torch.ops.aten.linear.default, + torch.int32, + x_i16 - x_zero_point, + weight_i16 - weight_zero_point, + None, + ) + bias_scale = x_scale * weight_scale + bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale) + acc_i32 = acc_i32 + bias_i32 + out_fp32 = acc_i32 * (x_scale * weight_scale) + return out_fp32 + + +def _qdq_quantized_conv2d( + x_i8, + x_scale, + x_zero_point, + x_quant_min, + x_quant_max, + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + bias_fp32, + out_scale, + out_zero_point, + out_quant_min, + out_quant_max, +): + stride = [1, 1] + padding = [0, 0] + dilation = [1, 1] + transposed = False + output_padding = [0, 0] + groups = 1 + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8 + ) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + torch.int8, + ) + out_fp32 = torch.ops.aten.convolution.default( + x_fp32, + weight_fp32, + bias_fp32, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + ) + out_i8 = 
torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8 + ) + return out_i8 + + +def _reference_quantized_conv2d( + x_i8, + x_scale, + x_zero_point, + x_quant_min, + x_quant_max, + weight_i8, + weight_scale, + weight_zero_point, + weight_quant_min, + weight_quant_max, + bias_fp32, + out_scale, + out_zero_point, + out_quant_min, + out_quant_max, +): + stride = [1, 1] + padding = [0, 0] + dilation = [1, 1] + transposed = False + output_padding = [0, 0] + groups = 1 + # without using quant_min/max in clamp, the traced graph will not have quant_mi/max args. + # This results in failure to match the pattern. + # Therefore, we call a torch.ops.aten.clamp here + x_i8 = torch.ops.aten.clamp(x_i8, x_quant_min, x_quant_max) + weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max) + + x_i16 = x_i8.to(torch.int16) + weight_i16 = weight_i8.to(torch.int16) + # always set bias to None so that the same representation can work for the case + # no matter if bias_scale == x_scale * weight_scale or not + acc_i32 = out_dtype( + torch.ops.aten.convolution.default, + torch.int32, + x_i16 - x_zero_point, + weight_i16 - weight_zero_point, + None, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + ) + # Note: we are quantizing bias with these scales without signal from user, but it might be OK + bias_scale = x_scale * weight_scale + # bias quantization to int32 uses bias_scale = x_scale * weight_scale due to: + # Take linear calculation for example + # Out_(i, j)_fp32 = Sum_(over k)[X_(i, k)_fp32 * W_(i, k)_fp32] + bias_(i)_fp32 + # Represent X, W fp32 as their dequant transforms + # A_fp32 = (A_q - A_zero_point)/A_scale + # Out_(i, j)_fp32 = Sum_(over k)[(X_(i, k)_fp32 - X_zp) * X_scale * (W_(i, k)_fp32 - W_zp) * W_scale] + bias_(i)_fp32 + # Factor out X_scale and W_scale + # Out_(i, j)_fp32 = ((X_scale * W_scale) * Sum_(over k)[(X_(i, k)_fp32 - X_zp) * (W_(i, 
k)_fp32 - W_zp)]) + bias_(i)_fp32 + # In order to addition of bias_(i)_fp32 inside, we must do + # Out_(i, j)_fp32 = (X_scale * W_scale) * (Sum_(over k)[(X_(i, k)_fp32 - X_zp) * (W_(i, k)_fp32 - W_zp)] + (1 / (X_scale * W_scale)) * bias_(i)_fp32)W_scale # noqa: B950 + # Note we had to multiply bias_fp32 with X_scale * W_scale = bias_scale + # Thus bias quantization to int32 must be with X_scale * W_scale + + bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale) + # Unsqueeze to match broadcast dims + # Unfortnuately I cannot do bias_i32.unsqueeze(0) due to literal matching nightmare + # in graph pattern replacement + bias_i32 = bias_i32.unsqueeze(-1) + bias_i32 = bias_i32.unsqueeze(-1) + acc_i32 = acc_i32 + bias_i32 + # TODO: change to mul.Scalar when we make x_scale/weight_scale etc. Scalar values + acc_i32 = ( + out_dtype( + torch.ops.aten.mul.Tensor, + torch.int32, + acc_i32, + x_scale * weight_scale / out_scale, + ) + + out_zero_point + ) + out_i8 = torch.ops.aten.clamp(acc_i32, out_quant_min, out_quant_max).to(torch.int8) + return out_i8 + + +def _qdq_quantized_add_relu( + x_i8, + x_scale, + x_zero_point, + y_i8, + y_scale, + y_zero_point, + out_scale, + out_zero_point, + quant_min, + quant_max, +): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, quant_min, quant_max, torch.int8 + ) + y_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + y_i8, y_scale, y_zero_point, quant_min, quant_max, torch.int8 + ) + out_fp32 = x_fp32 + y_fp32 + out_fp32 = torch.ops.aten.relu(out_fp32) + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, quant_min, quant_max, torch.int8 + ) + return out_i8 + + +def _reference_quantized_add_relu( + x_i8, + x_scale, + x_zero_point, + y_i8, + y_scale, + y_zero_point, + out_scale, + out_zero_point, + quant_min, + quant_max, +): + """ + See comments for `_reference_quantized_add` for more information on + how 
to derive the formula for out_i8 based on x_i8 and y_i8 + """ + x_i32 = x_i8.to(torch.int32) + y_i32 = y_i8.to(torch.int32) + # TODO: change this to mul.Scalar? + x_i32 = out_dtype( + torch.ops.aten.mul.Tensor, + torch.int32, + (x_i32 - x_zero_point), + (x_scale / out_scale), + ) + y_i32 = out_dtype( + torch.ops.aten.mul.Tensor, + torch.int32, + (y_i32 - y_zero_point), + (y_scale / out_scale), + ) + out_i32 = x_i32 + y_i32 + out_zero_point + # out_i32 = torch.ops.aten.clamp(out_i32, out_zero_point) + out_i8 = torch.ops.aten.clamp(out_i32, out_zero_point, quant_max).to(torch.int8) + return out_i8 + + +def _qdq_quantized_add( + x_i8, + x_scale, + x_zero_point, + y_i8, + y_scale, + y_zero_point, + out_scale, + out_zero_point, + quant_min, + quant_max, +): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, quant_min, quant_max, torch.int8 + ) + y_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + y_i8, y_scale, y_zero_point, quant_min, quant_max, torch.int8 + ) + out_fp32 = x_fp32 + y_fp32 + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, quant_min, quant_max, torch.int8 + ) + return out_i8 + + +def _reference_quantized_add( + x_i8, + x_scale, + x_zero_point, + y_i8, + y_scale, + y_zero_point, + out_scale, + out_zero_point, + quant_min, + quant_max, +): + """ + # How to Derive the formula for out_i8 based on x_i8 and y_i8 + # (since quantized add takes x_i8, y_i8 and their quantization parameters, and produce an out_i8) + + # out_i8 is quantized output, we can write down the formula for it first: + out_i8 = out_f32 / out_scale + out_zero_point (1) + + # then out_fp32 is computed from x_f32 + y_f32, and the x_fp32 and y_fp32 are the dequantized x_i8 and y_i8 + out_f32 = x_f32 + y_f32 (2) + x_fp32 = (x_i8 - x_zero_point) * x_scale (3) + y_fp32 = (y_i8 - y_zero_point) * y_scale (4) + + # applying the above formula to the out_i8 equation we can get the following: + 
out_i8 = out_fp32 / out_scale + out_zero_point # (1) + = (x_f32 + y_f32) / out_scale + out_zero_point # applying (2) to substitute out_fp32 with x_fp32 + y_fp32 + = ((x_i8 - x_zero_point) * x_scale + (y_i8 - y_zero_point) * y_scale) / out_scale + out_zero_point # apply (3) and (4) + """ + x_i32 = x_i8.to(torch.int32) + y_i32 = y_i8.to(torch.int32) + # TODO: use out_dtype op + x_i32 = torch.round((x_scale / out_scale) * (x_i32 - x_zero_point)).to(torch.int32) + y_i32 = torch.round((y_scale / out_scale) * (y_i32 - y_zero_point)).to(torch.int32) + out_i32 = x_i32 + y_i32 + out_zero_point + quant_min = -128 + quant_max = 127 + out_i8 = torch.ops.aten.clamp(out_i32, quant_min, quant_max).to(torch.int8) + return out_i8 + + +def _qdq_quantized_max_pool2d( + x_i8, + x_scale, + x_zero_point, + x_quant_min, + x_quant_max, + out_scale, + out_zero_point, + out_quant_min, + out_quant_max, +): + kernel_size = 1 + stride = 1 + padding = 0 + dilation = 1 + ceil_mode = False + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8 + ) + out_fp32, _ = torch.ops.aten.max_pool2d_with_indices.default( + x_fp32, kernel_size, stride, padding, dilation, ceil_mode + ) + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8 + ) + return out_i8 + + +def _reference_quantized_max_pool2d( + x_i8, + x_scale, + x_zero_point, + x_quant_min, + x_quant_max, + out_scale, + out_zero_point, + out_quant_min, + out_quant_max, +): + kernel_size = 1 + stride = 1 + padding = 0 + dilation = 1 + ceil_mode = False + # to preserve x_quant_min, x_quant_max in the graph for pattern matching + x_i8 = torch.clamp(x_i8, x_quant_min, x_quant_max) + x_i32 = x_i8.to(torch.int32) + out_i32, _ = torch.ops.aten.max_pool2d_with_indices.default( + x_i32 - x_zero_point, kernel_size, stride, padding, dilation, ceil_mode + ) + out_fp32 = out_i32 * (x_scale / 
out_scale) + out_zero_point + out_fp32 = torch.clamp(out_fp32, out_quant_min, out_quant_max) + out_i8 = out_fp32.to(torch.int8) + return out_i8 + + +def _quantize_per_tensor_int8(x_fp32, scale, zero_point, quant_min, quant_max): + x = torch.ops.quantized_decomposed.quantize_per_tensor( + x_fp32, scale, zero_point, quant_min, quant_max, torch.int8 + ) + return x + + +def _reference_quantize_per_tensor_int8( + x_fp32, scale, zero_point, quant_min, quant_max +): + # TODO: use out_dtype(mul, ...) here when the op is ready + x = x_fp32 / scale # fp32 + # round modes might be different here + # pytorch is rounding to even, which is also common for most of the backends + x = torch.round(x) # fp32 + x = x.to(dtype=torch.int32) # int32 + x = x + zero_point # int32 + # clamp works for fp32, int32 and int8 dtypes + x = torch.clamp(x, quant_min, quant_max) # int32 + x = x.to(dtype=torch.int8) + return x + + +def _dequantize_per_tensor_int8(x_i8, scale, zero_point, quant_min, quant_max): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, scale, zero_point, quant_min, quant_max, torch.int8 + ) + return x_fp32 + + +def _reference_dequantize_per_tensor_int8( + x_i8, scale, zero_point, quant_min, quant_max +): + # without using quant_min/max in clamp, the traced graph will not have quant_mi/max args. + # This results in failure to match the pattern. 
+ # Therefore, we call a torch.ops.aten.clamp here + x_i8 = torch.ops.aten.clamp(x_i8, quant_min, quant_max) + # TODO: use out_dtype op + # note: x_i8.to(torch.int32) does not work here + # TODO: debug the implementation later when torchdynamo time out issue is resolved + return ((x_i8.to(torch.float32) - zero_point) * scale).to(dtype=torch.float32) + + +def _quantize_per_channel_int8( + x_fp32, scales, zero_points, ch_axis, quant_min, quant_max +): + out_i8 = torch.ops.quantized_decomposed.quantize_per_channel( + x_fp32, scales, zero_points, ch_axis, quant_min, quant_max, torch.int8 + ) + return out_i8 + + +def _reference_quantize_per_channel_int8( + x_fp32, scales, zero_points, ch_axis, quant_min, quant_max +): + x_fp32 = torch.transpose(x_fp32, ch_axis, -1) + out_i32 = torch.ops.aten.clamp( + torch.round(x_fp32 / scales).to(torch.int32) + zero_points, quant_min, quant_max + ) + out_i32 = torch.transpose(out_i32, ch_axis, -1) + return out_i32.to(torch.int8) + + +def _dequantize_per_channel_int8( + x_i8, scales, zero_points, ch_axis, quant_min, quant_max +): + # the following will be replaced as placeholders + out_fp32 = torch.ops.quantized_decomposed.dequantize_per_channel( + x_i8, scales, zero_points, ch_axis, quant_min, quant_max, torch.int8 + ) + return out_fp32 + + +def _reference_dequantize_per_channel_int8( + x_i8, scales, zero_points, ch_axis, quant_min, quant_max +): + # the following will be replaced as placeholders + # in order to preserve the quant_min/quant_max args for pattern matching (e.g. 
matching for int4 quantized ops) + # we call a torch.ops.aten.clamp here + x_i8 = torch.ops.aten.clamp(x_i8, quant_min, quant_max) + x_i8 = torch.transpose(x_i8, ch_axis, -1) + x_i32 = x_i8.to(torch.int32) + out_fp32 = (x_i32 - zero_points).to(torch.float) * scales + out_fp32 = torch.transpose(out_fp32, ch_axis, -1) + return out_fp32 + + +def _replace_ph_qdq_per_channel_replacement(gm: torch.fx.GraphModule): + return _replace_literals_with_existing_placeholders( + gm, exclude_literals=[-1], literal_to_ph_idx={1: 3, -128: 4, 127: 5} + ) + + +@dataclass +class _RewriteInfo: + """Data needed for rewrite, this includes example inputs, pattern and replacement functions + and post transformation functions for the exported pattern and replacement GraphModule + """ + + # example inputs used for exporting the pattern into GraphModule + example_inputs: tuple[Any, ...] + pattern: Callable + replacement: Callable + # post transformation on the exported pattern and replacement GraphModule + pattern_post_trans: Callable[[GraphModule], GraphModule] | None = None + replacement_post_trans: Callable[[GraphModule], GraphModule] | None = None + + +def reference_representation_rewrite(model: GraphModule) -> GraphModule: + _QUANTIZED_LINEAR_EXAMPLE_INPUTS = ( + torch.randint(-128, 127, (2, 5), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + torch.randint(-128, 127, (5, 5), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-127], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + torch.randn(1, dtype=torch.float), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + ) + + _DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS = ( + torch.randn((2, 5), dtype=torch.float), + -128, + 127, + 
torch.finfo(torch.float32).eps, + torch.randint(-128, 127, (5, 5), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-127], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + torch.randn(1, dtype=torch.float), + ) + + _QUANTIZED_CONV2d_EXAMPLE_INPUTS = ( + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-127], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + torch.randn(1, dtype=torch.float), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + ) + + _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS = ( + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + ) + + _QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS = ( + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + ) + + _QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS = ( + torch.randn(1, 3, 3, 3, dtype=torch.float), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + 
torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + ) + + _DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS = ( + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(1, dtype=torch.float), + torch.zeros(1, dtype=torch.int), + torch.tensor([-128], dtype=torch.int), + torch.tensor([127], dtype=torch.int), + ) + + _QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS = ( + torch.randn(1, 3, 3, 3, dtype=torch.float), + torch.randn(3, dtype=torch.float), + torch.zeros(3, dtype=torch.int), + 1, + -128, + 127, + ) + + _DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS = ( + torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8), + torch.randn(3, dtype=torch.float), + torch.zeros(3, dtype=torch.int), + 1, + -128, + 127, + ) + + _REWRITE_INFO_LIST = [ + _RewriteInfo( + _DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS, + _WrapperModule(_qdq_dynamic_quantized_linear), + _WrapperModule(_reference_dynamic_quantized_linear), + partial( + _replace_literals_with_existing_placeholders, + literal_to_ph_idx={-128: 1, 127: 2, torch.finfo(torch.float32).eps: 3}, + ), + partial( + _replace_literals_with_existing_placeholders, + literal_to_ph_idx={-128: 1, 127: 2, torch.finfo(torch.float32).eps: 3}, + ), + ), + _RewriteInfo( + _QUANTIZED_LINEAR_EXAMPLE_INPUTS, + _WrapperModule(_qdq_quantized_linear), + _WrapperModule(_reference_quantized_linear), + _replace_literals_with_new_placeholders, + _replace_literals_with_new_placeholders, + ), + _RewriteInfo( + _QUANTIZED_CONV2d_EXAMPLE_INPUTS, + _WrapperModule(_qdq_quantized_conv2d), + _WrapperModule(_reference_quantized_conv2d), + partial(_replace_literals_with_new_placeholders, exclude_literals=[-1]), + partial(_replace_literals_with_new_placeholders, exclude_literals=[-1]), + ), + _RewriteInfo( + _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS, + _WrapperModule(_qdq_quantized_add_relu), + _WrapperModule(_reference_quantized_add_relu), + ), + _RewriteInfo( + _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS, + 
_WrapperModule(_qdq_quantized_add), + _WrapperModule(_reference_quantized_add), + ), + _RewriteInfo( + _QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS, + _WrapperModule(_qdq_quantized_max_pool2d), + _WrapperModule(_reference_quantized_max_pool2d), + _replace_literals_with_new_placeholders, + _replace_literals_with_new_placeholders, + ), + _RewriteInfo( + _QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS, + _WrapperModule(_quantize_per_tensor_int8), + _WrapperModule(_reference_quantize_per_tensor_int8), + ), + _RewriteInfo( + _DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS, + _WrapperModule(_dequantize_per_tensor_int8), + _WrapperModule(_reference_dequantize_per_tensor_int8), + ), + _RewriteInfo( + _QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS, + _WrapperModule(_quantize_per_channel_int8), + _WrapperModule(_reference_quantize_per_channel_int8), + _replace_ph_qdq_per_channel_replacement, + _replace_ph_qdq_per_channel_replacement, + ), + _RewriteInfo( + _DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS, + _WrapperModule(_dequantize_per_channel_int8), + _WrapperModule(_reference_dequantize_per_channel_int8), + _replace_ph_qdq_per_channel_replacement, + _replace_ph_qdq_per_channel_replacement, + ), + ] + + remove_tensor_overload_for_qdq_ops(model) + + with _disable_aten_to_metadata_assertions(): + for rewrite_info in _REWRITE_INFO_LIST: + example_inputs = rewrite_info.example_inputs + pattern = rewrite_info.pattern + replacement = rewrite_info.replacement + pattern_post_trans = rewrite_info.pattern_post_trans + replacement_post_trans = rewrite_info.replacement_post_trans + pattern = _get_aten_graph_module_for_pattern(pattern, example_inputs) # type: ignore[arg-type, assignment] + remove_tensor_overload_for_qdq_ops(pattern) # type: ignore[arg-type] + replacement = _get_aten_graph_module_for_pattern( # type: ignore[assignment] + replacement, + example_inputs, # type: ignore[arg-type] + ) + remove_tensor_overload_for_qdq_ops(replacement) # type: ignore[arg-type] + if pattern_post_trans: + pattern = 
pattern_post_trans(pattern) + if replacement_post_trans: + replacement = replacement_post_trans(replacement) + pattern.recompile() # type: ignore[attr-defined] + replacement.recompile() # type: ignore[attr-defined] + replace_pattern(model, pattern, replacement) + + return model diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5d1f341751a3b0ea4f720978d3c380e26ccc41 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig.py @@ -0,0 +1,715 @@ +# mypy: allow-untyped-defs +import copy +import sys +import warnings +from collections import namedtuple +from typing import Any, Optional, Union +from typing_extensions import deprecated + +import torch +import torch.nn as nn +from torch.ao.quantization.fake_quantize import ( + default_dynamic_fake_quant, + default_embedding_fake_quant, + default_embedding_fake_quant_4bit, + default_fake_quant, + default_fused_act_fake_quant, + default_fused_per_channel_wt_fake_quant, + default_fused_wt_fake_quant, + default_per_channel_weight_fake_quant, + default_weight_fake_quant, + FakeQuantize, + FakeQuantizeBase, + fused_per_channel_wt_fake_quant_range_neg_127_to_127, + fused_wt_fake_quant_range_neg_127_to_127, + FusedMovingAvgObsFakeQuantize, +) + +from .observer import ( + _PartialWrapper, + default_debug_observer, + default_dynamic_quant_observer, + default_float_qparams_observer, + default_float_qparams_observer_4bit, + default_observer, + default_per_channel_weight_observer, + default_placeholder_observer, + default_reuse_input_observer, + default_weight_observer, + HistogramObserver, + MinMaxObserver, + MovingAverageMinMaxObserver, + NoopObserver, + ObserverBase, + per_channel_weight_observer_range_neg_127_to_127, + PlaceholderObserver, + ReuseInputObserver, + 
# pyrefly: ignore [invalid-inheritance]
class QConfig(namedtuple("QConfig", ["activation", "weight"])):
    """
    Quantization settings for a layer or a part of the network: one observer
    constructor for activations and one for weights.

    Both fields must be observer **classes** (like MinMaxObserver) or callables that
    return a new observer each time they are invoked -- not concrete observer
    instances. The quantization preparation function instantiates observers multiple
    times, once for each of the layers.

    Observer classes usually come with reasonable default arguments; use the
    `with_args` method (which behaves like functools.partial) to override them::

      my_qconfig = QConfig(
          activation=MinMaxObserver.with_args(dtype=torch.qint8),
          weight=default_observer.with_args(dtype=torch.qint8),
      )

    Raises:
        ValueError: if ``activation`` or ``weight`` is an ``nn.Module`` instance.
    """

    __slots__ = ()

    def __new__(cls, activation, weight):
        # Common mistake: handing over an already-built observer module
        # instead of the class / factory that builds one.
        got_instance = any(
            isinstance(candidate, nn.Module) for candidate in (activation, weight)
        )
        if got_instance:
            raise ValueError(
                "QConfig received observer instance, please pass observer class instead. "
                + "Use MyObserver.with_args(x=1) to override arguments to constructor if needed"
            )
        return super().__new__(cls, activation, weight)
" + + "Use MyObserver.with_args(x=1) to override arguments to constructor if needed" + ) + return super().__new__(cls, activation, weight) + + +default_qconfig = QConfig(activation=default_observer, weight=default_weight_observer) +""" +Default qconfig configuration. +""" + +default_debug_qconfig = QConfig( + weight=default_weight_observer, activation=default_debug_observer +) +""" +Default qconfig configuration for debugging. +""" + +default_per_channel_qconfig = QConfig( + activation=default_observer, weight=default_per_channel_weight_observer +) +""" +Default qconfig configuration for per channel weight quantization. +""" + +default_dynamic_qconfig = QConfig( + activation=default_dynamic_quant_observer, weight=default_weight_observer +) +""" +Default dynamic qconfig. +""" + +float16_dynamic_qconfig = QConfig( + activation=PlaceholderObserver.with_args(dtype=torch.float16, is_dynamic=True), + weight=PlaceholderObserver.with_args(dtype=torch.float16), +) +""" +Dynamic qconfig with weights quantized to `torch.float16`. +""" + +float16_static_qconfig = QConfig( + activation=PlaceholderObserver.with_args(dtype=torch.float16), + weight=PlaceholderObserver.with_args(dtype=torch.float16), +) +""" +Dynamic qconfig with both activations and weights quantized to `torch.float16`. +""" + +per_channel_dynamic_qconfig = QConfig( + activation=default_dynamic_quant_observer, + weight=default_per_channel_weight_observer, +) +""" +Dynamic qconfig with weights quantized per channel. +""" + +float_qparams_weight_only_qconfig = QConfig( + activation=default_placeholder_observer, weight=default_float_qparams_observer +) +""" +Dynamic qconfig with weights quantized with a floating point zero_point. +""" + +float_qparams_weight_only_qconfig_4bit = QConfig( + activation=default_placeholder_observer, weight=default_float_qparams_observer_4bit +) + +default_qat_qconfig = QConfig( + activation=default_fake_quant, weight=default_weight_fake_quant +) +""" +Default qconfig for QAT. 
def get_default_qconfig(backend="x86", version=0):
    """
    Returns the default PTQ qconfig for the specified backend.

    Args:
      * `backend` (str): a string representing the target backend. Currently supports
        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
      * `version` (int): version of the default qconfig. Only `0` is supported.

    Return:
        qconfig

    Raises:
        AssertionError: if `backend` is not one of the supported backends, or if
            `version` is not 0.
    """
    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
    if backend not in supported_backends:
        raise AssertionError(
            f"backend: {backend!s} not supported. backend must be one of {supported_backends}"
        )
    if version != 0:
        raise AssertionError(
            f"Version number: {version!s} in get_default_qconfig is not supported. "
            "Version number must be 0"
        )

    if backend == "qnnpack":
        # TODO: make this compatible with xnnpack constraints
        return QConfig(
            activation=HistogramObserver.with_args(reduce_range=False),
            weight=default_weight_observer,
        )

    if backend == "onednn":
        if not torch.cpu._is_vnni_supported():
            # stacklevel=2 attributes the warning to the caller of this function.
            warnings.warn(
                "Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues "
                "on CPU without Vector Neural Network Instruction support.",
                stacklevel=2,
            )
        return QConfig(
            activation=HistogramObserver.with_args(reduce_range=False),
            weight=default_per_channel_weight_observer,
        )

    # Only "fbgemm" and "x86" remain after validation; both use the same
    # reduced-range activation observer with per-channel weights.
    return QConfig(
        activation=HistogramObserver.with_args(reduce_range=True),
        weight=default_per_channel_weight_observer,
    )
def get_default_qat_qconfig(backend="x86", version=1):
    """
    Returns the default QAT qconfig for the specified backend.

    Args:
      * `backend` (str): a string representing the target backend. Currently supports
        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
      * `version` (int): kept for backwards compatibility. `0` builds the activation
        fake-quantize from `FakeQuantize`; `1` (default) uses the fused
        `FusedMovingAvgObsFakeQuantize` observer + fake-quant modules.

    Return:
        qconfig

    Raises:
        AssertionError: if `backend` is not one of the supported backends, or if
            `version` is neither 0 nor 1.
    """
    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
    if backend not in supported_backends:
        raise AssertionError(
            f"backend: {backend!s} not supported. backend must be one of {supported_backends}"
        )

    # Histogram observer is too slow for quantization aware training, so all
    # configurations below observe activations with MovingAverageMinMaxObserver.
    if version == 0:
        fake_quant_cls = FakeQuantize
        per_tensor_weight = default_weight_fake_quant
        per_channel_weight = default_per_channel_weight_fake_quant
    elif version == 1:
        # Use the fused observe + fake_quant modules for doing QAT.
        fake_quant_cls = FusedMovingAvgObsFakeQuantize
        per_tensor_weight = default_fused_wt_fake_quant
        per_channel_weight = default_fused_per_channel_wt_fake_quant
    else:
        raise AssertionError(
            f"Version number: {version!s} in get_default_qat_qconfig is not supported. "
            "Version number must be 0 or 1"
        )

    activation_kwargs = {
        "observer": MovingAverageMinMaxObserver,
        "quant_min": 0,
        "quant_max": 255,
    }
    if backend == "qnnpack":
        # TODO: make this compatible with xnnpack constraints
        activation_kwargs["reduce_range"] = False
        weight = per_tensor_weight
    else:
        # "onednn" deliberately passes no reduce_range keyword at all;
        # "fbgemm" and "x86" request the reduced range explicitly.
        if backend != "onednn":
            activation_kwargs["reduce_range"] = True
        weight = per_channel_weight

    return QConfig(
        activation=fake_quant_cls.with_args(**activation_kwargs),
        weight=weight,
    )
def _assert_valid_qconfig(qconfig: QConfig | None, mod: torch.nn.Module) -> None:
    """
    Check that `qconfig` can be used with `mod`.

    The only rule enforced here: a ConvTranspose1d/2d/3d module must not be
    paired with a per channel weight observer. A `None` qconfig, or a qconfig
    whose `weight` is `None`, is accepted as-is.

    Raises:
        AssertionError: if `mod` is a ConvTranspose module and `qconfig.weight()`
            produces a per channel min/max observer.
    """
    if qconfig is None:
        return

    conv_transpose_types = (
        torch.nn.ConvTranspose1d,
        torch.nn.ConvTranspose2d,
        torch.nn.ConvTranspose3d,
    )
    if not isinstance(mod, conv_transpose_types):
        return

    if qconfig.weight is None:
        # for now, we assume that any qconfig for ConvTranspose without a weight is valid
        return

    # Build one observer from the constructor purely to inspect its type.
    sample_observer = qconfig.weight()
    per_channel_types = (
        torch.ao.quantization.PerChannelMinMaxObserver,
        torch.ao.quantization.MovingAveragePerChannelMinMaxObserver,
    )
    if isinstance(sample_observer, per_channel_types):
        raise AssertionError(
            "Per channel weight observer is not supported yet for ConvTranspose{n}d."
        )
+ + Args: + qconfig: QConfig with obs constructors stored in activation and weight + module: module which the qconfig is related to + + Return: + qconfig: configured so that obs constructors set to construct on the same device as module + """ + + if module is None or qconfig is None or qconfig._fields != ("activation", "weight"): + return qconfig + + def get_factory_kwargs_based_on_module_device(): + if not isinstance(module, torch.nn.Module): + raise AssertionError("module must be an instance of torch.nn.Module") + devices = {p.device for p in module.parameters()} | { + p.device for p in module.buffers() + } + device = next(iter(devices)) if len(devices) > 0 else None + return None if device is None else {"device": device} + + def configure_constructor_to_put_obs_on_module_device(original_constructor): + try: + # check if constructor can accept factory_kwargs + check = original_constructor.with_args(factory_kwargs=None) + check() + return original_constructor.with_callable_args( + factory_kwargs=get_factory_kwargs_based_on_module_device + ) + except AttributeError: # qconfig doesn't have activation or weight + return original_constructor + except TypeError: # the class doesn't accept factory_kwargs argument + return original_constructor + + activation = configure_constructor_to_put_obs_on_module_device(qconfig.activation) + weight = configure_constructor_to_put_obs_on_module_device(qconfig.weight) + + return QConfig(activation, weight) + + +_ObserverOrFakeQuantizeConstructor = Union[ + _PartialWrapper, type[ObserverBase], type[FakeQuantizeBase] +] + + +def _obs_or_fq_ctr_equals( + obs_or_fq1: _ObserverOrFakeQuantizeConstructor, + obs_or_fq2: _ObserverOrFakeQuantizeConstructor, +): + if isinstance(obs_or_fq1, _PartialWrapper) and isinstance( + obs_or_fq2, _PartialWrapper + ): + return _partial_wrapper_equals(obs_or_fq1, obs_or_fq2) + return obs_or_fq1 == obs_or_fq2 + + +def _partial_wrapper_equals(obs_or_fq1: _PartialWrapper, obs_or_fq2: _PartialWrapper): + """ + 
def qconfig_equals(q1: QConfigAny, q2: QConfigAny):
    """
    Returns `True` if `q1` equals `q2`, and `False` otherwise.

    Args:
        q1: first qconfig, or `None`.
        q2: second qconfig, or `None`.

    If either argument is `None` the result is the plain `==` comparison, so the
    two are equal only when both are `None`. Otherwise the `activation` and
    `weight` constructors are compared field by field. Objects without those
    attributes fall back to `q1 == q2`.
    """
    if q1 is None or q2 is None:
        return q1 == q2
    try:
        # Qconfig weight and activation can be either a partial wrapper,
        # or an observer class. Partial wrappers need the dedicated comparison
        # helper, which is why the fields are not compared with `==` directly.
        activation_same = _obs_or_fq_ctr_equals(q1.activation, q2.activation)
        weight_same = _obs_or_fq_ctr_equals(q1.weight, q2.weight)
        return activation_same and weight_same
    except AttributeError:
        return q1 == q2
def _is_reuse_input_qconfig(qconfig: QConfig | None):
    """
    Return whether `qconfig` is a "reuse input" configuration.

    That is the case when `qconfig` is not `None`, its `activation` constructor
    produces a `ReuseInputObserver`, and its `weight` constructor produces a
    `NoopObserver`. Both constructors are invoked to inspect what they build;
    the weight constructor is only invoked if the activation check passed.
    """
    if qconfig is None:
        return False
    if not isinstance(qconfig.activation(), ReuseInputObserver):
        return False
    return isinstance(qconfig.weight(), NoopObserver)
+_MODULE_NAME_REGEX_DICT_KEY = "module_name_regex" +_MODULE_NAME_DICT_KEY = "module_name" +_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY = "module_name_object_type_order" + +# TODO: derive this map from the BackendConfig +_FIXED_QPARAMS_OP_TO_OBSERVER: dict[Callable | str, _PartialWrapper] = { + torch.nn.Hardsigmoid: default_fixed_qparams_range_0to1_observer, + torch.nn.functional.hardsigmoid: default_fixed_qparams_range_0to1_observer, + "hardsigmoid": default_fixed_qparams_range_0to1_observer, + "hardsigmoid_": default_fixed_qparams_range_0to1_observer, + torch.nn.Sigmoid: default_fixed_qparams_range_0to1_observer, + torch.sigmoid: default_fixed_qparams_range_0to1_observer, + "sigmoid": default_fixed_qparams_range_0to1_observer, + "sigmoid_": default_fixed_qparams_range_0to1_observer, + torch.nn.Softmax: default_fixed_qparams_range_0to1_observer, + torch.nn.Tanh: default_fixed_qparams_range_neg1to1_observer, + torch.tanh: default_fixed_qparams_range_neg1to1_observer, + "tanh": default_fixed_qparams_range_neg1to1_observer, + "tanh_": default_fixed_qparams_range_neg1to1_observer, +} + + +def _get_default_qconfig_mapping( + is_qat: bool, backend: str, version: int +) -> QConfigMapping: + """ + Return the default QConfigMapping for the given quantization type and backend. + """ + if is_qat: + qconfig = get_default_qat_qconfig(backend, version) + else: + qconfig = get_default_qconfig(backend, version) + default_weight = default_weight_fake_quant if is_qat else default_weight_observer + + # default_per_channel_weight_observer is not currently compatible with fbgemm backend + # so we have to modify the weight observer to default_weight_observer or another + # per tensor supported observer. 
+ # see https://github.com/pytorch/pytorch/issues/47535 + if backend in ("fbgemm", "x86"): + qconfig_transpose = QConfig( + activation=qconfig.activation, weight=default_weight + ) + else: + qconfig_transpose = qconfig + + # currently layernorm only supports float weights + # we have to add this because otherwise there will be a extra quantize-dequantize pair + qconfig_layernorm = QConfig( + activation=qconfig.activation, weight=default_placeholder_observer + ) + + qconfig_mapping = ( + QConfigMapping() + .set_global(qconfig) + .set_object_type("reshape", default_reuse_input_qconfig) + .set_object_type(torch.nn.ConvTranspose1d, qconfig_transpose) + .set_object_type(torch.nn.ConvTranspose2d, qconfig_transpose) + .set_object_type(torch.nn.ConvTranspose3d, qconfig_transpose) + .set_object_type(torch.nn.functional.conv_transpose1d, qconfig_transpose) + .set_object_type(torch.nn.functional.conv_transpose2d, qconfig_transpose) + .set_object_type(torch.nn.functional.conv_transpose3d, qconfig_transpose) + .set_object_type(torch.nn.functional.layer_norm, qconfig_layernorm) + .set_object_type(torch.nn.LayerNorm, qconfig_layernorm) + .set_object_type(torch.nn.PReLU, default_quint8_weight_qconfig) + ) + # Use special observers for ops with fixed qparams + fixed_qparams_observer_to_qconfig: dict[Any, QConfigAny] = {} + for fixed_qparams_op, observer in _FIXED_QPARAMS_OP_TO_OBSERVER.items(): + if observer in fixed_qparams_observer_to_qconfig: + fixed_qparams_qconfig = fixed_qparams_observer_to_qconfig[observer] + else: + if is_qat: + activation = FixedQParamsFakeQuantize.with_args(observer=observer) + else: + activation = observer + fixed_qparams_qconfig = QConfig( + activation=activation, weight=default_weight + ) + fixed_qparams_observer_to_qconfig[observer] = fixed_qparams_qconfig + qconfig_mapping.set_object_type(fixed_qparams_op, fixed_qparams_qconfig) + + # TODO Currently it's required that separate ops in a fused op/module have the same qconfig. 
+ # Need to be able to support fusion of ops with different qconfigs + + return qconfig_mapping + + +def get_default_qconfig_mapping(backend="x86", version=0) -> QConfigMapping: + """ + Return the default QConfigMapping for post training quantization. + + Args: + * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be + one of ["x86" (default), "fbgemm", "qnnpack", "onednn"] + * ``version`` (int) : the version for the default qconfig mapping + """ + # TODO: add assert for backend choices + return _get_default_qconfig_mapping(False, backend, version) + + +def get_default_qat_qconfig_mapping(backend="x86", version=1) -> QConfigMapping: + """ + Return the default QConfigMapping for quantization aware training. + + Args: + * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be + one of ["x86" (default), "fbgemm", "qnnpack", "onednn"] + * ``version`` (int) : the version for the default qconfig mapping + """ + return _get_default_qconfig_mapping(True, backend, version) + + +def _get_symmetric_qnnpack_qconfig_mapping() -> QConfigMapping: + """ + Return a QConfigMapping that uses `torch.ao.quantization.default_symmetric_qnnpack_qconfig` + as the default QConfig. + """ + default_qconfig = default_symmetric_qnnpack_qconfig + return _get_default_qconfig_mapping_with_default_qconfig( + False, "qnnpack", default_qconfig + ) + + +def _get_symmetric_qnnpack_qat_qconfig_mapping() -> QConfigMapping: + """ + Return a QConfigMapping that uses `torch.ao.quantization.default_symmetric_qnnpack_qat_qconfig` + as the default QConfig. + """ + default_qconfig = default_symmetric_qnnpack_qat_qconfig + return _get_default_qconfig_mapping_with_default_qconfig( + True, "qnnpack", default_qconfig + ) + + +def _get_default_qconfig_mapping_with_default_qconfig( + is_qat: bool, + backend: str, + default_qconfig: QConfig, +) -> QConfigMapping: + """ + Return a QConfigMapping that uses the provided qconfig as the default QConfig. 
+ """ + if is_qat: + qconfig_mapping = get_default_qat_qconfig_mapping(backend) + else: + qconfig_mapping = get_default_qconfig_mapping(backend) + qconfig_mapping.set_global(default_qconfig) + for pattern in qconfig_mapping.object_type_qconfigs: + if pattern not in _FIXED_QPARAMS_OP_TO_OBSERVER: + qconfig_mapping.set_object_type(pattern, default_qconfig) + return qconfig_mapping + + +_QCONFIG_STYLE_ORDER: list[str] = [ + "global_qconfig", + "object_type_qconfigs", + "module_name_regex_qconfigs", + "module_name_qconfigs", + "module_name_object_type_order_qconfigs", +] + + +class QConfigMapping: + """ + Mapping from model ops to :class:`torch.ao.quantization.QConfig` s. + + The user can specify QConfigs using the following methods (in increasing match priority): + + ``set_global`` : sets the global (default) QConfig + + ``set_object_type`` : sets the QConfig for a given module type, function, or method name + + ``set_module_name_regex`` : sets the QConfig for modules matching the given regex string + + ``set_module_name`` : sets the QConfig for modules matching the given module name + + ``set_module_name_object_type_order`` : sets the QConfig for modules matching a combination + of the given module name, object type, and the index at which the module appears + + Example usage:: + + qconfig_mapping = QConfigMapping() + .set_global(global_qconfig) + .set_object_type(torch.nn.Linear, qconfig1) + .set_object_type(torch.nn.ReLU, qconfig1) + .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig1) + .set_module_name_regex("foo.*", qconfig2) + .set_module_name("module1", qconfig1) + .set_module_name("module2", qconfig2) + .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, qconfig3) + + """ + + def __init__(self) -> None: + # In increasing match priority: + self.global_qconfig: QConfigAny = None + self.object_type_qconfigs: OrderedDict[Callable | str, QConfigAny] = ( + OrderedDict() + ) + self.module_name_regex_qconfigs: OrderedDict[str, QConfigAny] 
= OrderedDict() + self.module_name_qconfigs: OrderedDict[str, QConfigAny] = OrderedDict() + self.module_name_object_type_order_qconfigs: OrderedDict[ + tuple[str, Callable, int], QConfigAny + ] = OrderedDict() + + def set_global(self, global_qconfig: QConfigAny) -> QConfigMapping: + """ + Set the global (default) QConfig. + """ + self.global_qconfig = global_qconfig + return self + + def set_object_type( + self, object_type: Callable | str, qconfig: QConfigAny + ) -> QConfigMapping: + """ + Set the QConfig for a given module type, function, or method name. + If the QConfig for an existing object type was already set, the new QConfig will override the old one. + """ + self.object_type_qconfigs[object_type] = qconfig + return self + + def set_module_name_regex( + self, module_name_regex: str, qconfig: QConfigAny + ) -> QConfigMapping: + """ + Set the QConfig for modules matching the given regex string. + + Regexes will be matched in the order in which they are registered through this method. + Thus, the caller should register more specific patterns first, e.g.:: + + qconfig_mapping = QConfigMapping() + .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig1) + .set_module_name_regex("foo.*bar.*", qconfig2) + .set_module_name_regex("foo.*", qconfig3) + + In this example, "foo.bar.conv0" would match qconfig1, "foo.bar.linear" would match qconfig2, + and "foo.baz.relu" would match qconfig3. + + If the QConfig for an existing module name regex was already set, the new QConfig will override the + old one while preserving the order in which the regexes were originally registered. + """ + self.module_name_regex_qconfigs[module_name_regex] = qconfig + return self + + def set_module_name(self, module_name: str, qconfig: QConfigAny) -> QConfigMapping: + """ + Set the QConfig for modules matching the given module name. + If the QConfig for an existing module name was already set, the new QConfig will override the old one. 
+ """ + self.module_name_qconfigs[module_name] = qconfig + return self + + def set_module_name_object_type_order( + self, module_name: str, object_type: Callable, index: int, qconfig: QConfigAny + ) -> QConfigMapping: + """ + Set the QConfig for modules matching a combination of the given module name, object type, + and the index at which the module appears. + + If the QConfig for an existing (module name, object type, index) was already set, the new QConfig + will override the old one. + """ + self.module_name_object_type_order_qconfigs[ + (module_name, object_type, index) + ] = qconfig + return self + + def __repr__(self) -> str: + output = self.__class__.__name__ + " (" + for style_name in _QCONFIG_STYLE_ORDER: + output += f"\n {style_name}" + qconfigs = getattr(self, style_name) + if isinstance(qconfigs, OrderedDict) and len(qconfigs) > 0: + for key, qconfig in qconfigs.items(): + output += f"\n {key}: {qconfig}" + else: + output += f"\n {qconfigs}" + return output + "\n)" + + # TODO: remove this + def to_dict(self) -> dict[str, Any]: + """ + Convert this ``QConfigMapping`` to a dictionary with the following keys: + + "" (for global QConfig) + + "object_type" + + "module_name_regex" + + "module_name" + + "module_name_object_type_order" + + The values of this dictionary are lists of tuples. 
+ """ + return { + _GLOBAL_DICT_KEY: self.global_qconfig, + _OBJECT_TYPE_DICT_KEY: list(self.object_type_qconfigs.items()), + _MODULE_NAME_REGEX_DICT_KEY: list(self.module_name_regex_qconfigs.items()), + _MODULE_NAME_DICT_KEY: list(self.module_name_qconfigs.items()), + _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [ + (*k, v) for k, v in self.module_name_object_type_order_qconfigs.items() + ], + } + + # TODO: remove this + @classmethod + def from_dict(cls, qconfig_dict: dict[str, Any]) -> QConfigMapping: + """ + Create a ``QConfigMapping`` from a dictionary with the following keys (all optional): + + "" (for global QConfig) + + "object_type" + + "module_name_regex" + + "module_name" + + "module_name_object_type_order" + + The values of this dictionary are expected to be lists of tuples. + """ + conf = cls() + if _GLOBAL_DICT_KEY in qconfig_dict: + conf.set_global(qconfig_dict[_GLOBAL_DICT_KEY]) + for object_type, qconfig in qconfig_dict.get(_OBJECT_TYPE_DICT_KEY, []): + conf.set_object_type(object_type, qconfig) + for module_name_regex, qconfig in qconfig_dict.get( + _MODULE_NAME_REGEX_DICT_KEY, [] + ): + conf.set_module_name_regex(module_name_regex, qconfig) + for module_name, qconfig in qconfig_dict.get(_MODULE_NAME_DICT_KEY, []): + conf.set_module_name(module_name, qconfig) + for module_name, object_type, index, qconfig in qconfig_dict.get( + _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY, [] + ): + conf.set_module_name_object_type_order( + module_name, object_type, index, qconfig + ) + return conf diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quant_type.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quant_type.py new file mode 100644 index 0000000000000000000000000000000000000000..18488d7f9ccba604ca8f1df7ea0ef4a88546d63e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quant_type.py @@ -0,0 +1,35 @@ +import enum + + +__all__ = [ + 
"QuantType", +] + + +# Quantization type (dynamic quantization, static quantization). +# Should match the c++ enum in quantization_type.h +class QuantType(enum.IntEnum): + DYNAMIC = 0 + STATIC = 1 + QAT = 2 + WEIGHT_ONLY = 3 + + +_quant_type_to_str = { + QuantType.STATIC: "static", + QuantType.DYNAMIC: "dynamic", + QuantType.QAT: "qat", + QuantType.WEIGHT_ONLY: "weight_only", +} + + +# TODO: make this private +def _get_quant_type_to_str(quant_type: QuantType) -> str: + return _quant_type_to_str[quant_type] + + +def _quant_type_from_str(name: str) -> QuantType: + for quant_type, s in _quant_type_to_str.items(): + if name == s: + return quant_type + raise ValueError(f"Unknown QuantType name '{name}'") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantization_mappings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantization_mappings.py new file mode 100644 index 0000000000000000000000000000000000000000..647ed5a4d4f3946626ef360a7a45541719136006 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantization_mappings.py @@ -0,0 +1,369 @@ +import copy +from collections.abc import Callable +from typing import Any + +import torch +import torch.ao.nn as ao_nn +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.intrinsic.qat as nniqat +import torch.ao.nn.intrinsic.quantized as nniq +import torch.ao.nn.intrinsic.quantized.dynamic as nniqd +import torch.ao.nn.qat as nnqat +import torch.ao.nn.qat.dynamic as nnqatd +import torch.ao.nn.quantized as nnq +import torch.ao.nn.quantized.dynamic as nnqd +import torch.ao.nn.quantized.reference as nnqr + +# Because `torch.ao.nn` uses lazy imports, we need to make +# sure we import the contents explicitly here. 
+import torch.ao.nn.sparse +import torch.nn.functional as F +from torch import nn +from torch.ao.quantization.fake_quantize import ( + default_fixed_qparams_range_0to1_fake_quant, + default_fixed_qparams_range_neg1to1_fake_quant, +) +from torch.ao.quantization.stubs import DeQuantStub, QuantStub +from torch.ao.quantization.utils import get_combined_dict +from torch.nn.utils.parametrize import type_before_parametrizations + + +__all__ = [ + "DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS", + "DEFAULT_STATIC_QUANT_MODULE_MAPPINGS", + "DEFAULT_QAT_MODULE_MAPPINGS", + "DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS", + "DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS", + "DEFAULT_MODULE_TO_ACT_POST_PROCESS", + "DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS", + "DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS", + "no_observer_set", + "get_default_static_quant_module_mappings", + "get_default_static_quant_reference_module_mappings", + "get_embedding_static_quant_module_mappings", + "get_default_static_sparse_quant_module_mappings", + "get_static_quant_module_class", + "get_dynamic_quant_module_class", + "get_default_qat_module_mappings", + "get_embedding_qat_module_mappings", + "get_default_dynamic_quant_module_mappings", + "get_default_dynamic_sparse_quant_module_mappings", + "get_default_qconfig_propagation_list", + "get_default_compare_output_module_list", + "get_default_float_to_quantized_operator_mappings", + "get_quantized_operator", +] + +# Default map for swapping float module to reference quantized modules +DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = { + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, + nn.Linear: nnqr.Linear, + nn.Conv1d: nnqr.Conv1d, + nn.Conv2d: nnqr.Conv2d, + nn.Conv3d: nnqr.Conv3d, + nn.ConvTranspose1d: nnqr.ConvTranspose1d, + nn.ConvTranspose2d: nnqr.ConvTranspose2d, + nn.ConvTranspose3d: nnqr.ConvTranspose3d, + nn.Embedding: nnqr.Embedding, + nn.EmbeddingBag: nnqr.EmbeddingBag, + nn.GRUCell: nnqr.GRUCell, + nn.LSTMCell: 
nnqr.LSTMCell, + nn.RNNCell: nnqr.RNNCell, + nn.LSTM: nnqr.LSTM, +} + +# Default map for swapping float module to quantized ones +DEFAULT_STATIC_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = { + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, + nn.BatchNorm2d: nnq.BatchNorm2d, + nn.BatchNorm3d: nnq.BatchNorm3d, + nn.Dropout: nnq.Dropout, + nn.Conv1d: nnq.Conv1d, + nn.Conv2d: nnq.Conv2d, + nn.Conv3d: nnq.Conv3d, + nn.ConvTranspose1d: nnq.ConvTranspose1d, + nn.ConvTranspose2d: nnq.ConvTranspose2d, + nn.ConvTranspose3d: nnq.ConvTranspose3d, + nn.ELU: nnq.ELU, + nn.Embedding: nnq.Embedding, + nn.EmbeddingBag: nnq.EmbeddingBag, + nn.GroupNorm: nnq.GroupNorm, + nn.Hardswish: nnq.Hardswish, + nn.InstanceNorm1d: nnq.InstanceNorm1d, + nn.InstanceNorm2d: nnq.InstanceNorm2d, + nn.InstanceNorm3d: nnq.InstanceNorm3d, + nn.LayerNorm: nnq.LayerNorm, + nn.LeakyReLU: nnq.LeakyReLU, + nn.modules.linear.NonDynamicallyQuantizableLinear: nnq.Linear, + nn.Linear: nnq.Linear, + nn.ReLU6: nnq.ReLU6, + nn.PReLU: nnq.PReLU, + # Wrapper Modules: + nnq.FloatFunctional: nnq.QFunctional, + # Intrinsic modules: + nni.BNReLU2d: nniq.BNReLU2d, + nni.BNReLU3d: nniq.BNReLU3d, + nni.ConvReLU1d: nniq.ConvReLU1d, + nni.ConvReLU2d: nniq.ConvReLU2d, + nni.ConvReLU3d: nniq.ConvReLU3d, + nni.ConvAdd2d: nniq.ConvAdd2d, + nni.ConvAddReLU2d: nniq.ConvAddReLU2d, + nni.LinearReLU: nniq.LinearReLU, + nni.LinearLeakyReLU: nniq.LinearLeakyReLU, + nni.LinearTanh: nniq.LinearTanh, + nniqat.ConvBn1d: nnq.Conv1d, + nniqat.ConvBn2d: nnq.Conv2d, + nniqat.ConvBn3d: nnq.Conv3d, + nniqat.ConvBnReLU1d: nniq.ConvReLU1d, + nniqat.ConvBnReLU2d: nniq.ConvReLU2d, + nniqat.ConvBnReLU3d: nniq.ConvReLU3d, + nniqat.ConvReLU2d: nniq.ConvReLU2d, + nniqat.ConvReLU3d: nniq.ConvReLU3d, + nniqat.LinearReLU: nniq.LinearReLU, + nniqat.LinearBn1d: nnq.Linear, + # QAT modules: + nnqat.Linear: nnq.Linear, + nnqat.Conv2d: nnq.Conv2d, + nnqat.Conv3d: nnq.Conv3d, +} + +# Default map for swapping float module to qat modules 
+DEFAULT_QAT_MODULE_MAPPINGS: dict[Callable, Any] = { + nn.Conv2d: nnqat.Conv2d, + nn.Conv3d: nnqat.Conv3d, + nn.Linear: nnqat.Linear, + nn.modules.linear.NonDynamicallyQuantizableLinear: nnqat.Linear, + # Intrinsic modules: + nni.ConvBn1d: nniqat.ConvBn1d, + nni.ConvBn2d: nniqat.ConvBn2d, + nni.ConvBn3d: nniqat.ConvBn3d, + nni.ConvBnReLU1d: nniqat.ConvBnReLU1d, + nni.ConvBnReLU2d: nniqat.ConvBnReLU2d, + nni.ConvBnReLU3d: nniqat.ConvBnReLU3d, + nni.ConvReLU2d: nniqat.ConvReLU2d, + nni.ConvReLU3d: nniqat.ConvReLU3d, + nni.LinearReLU: nniqat.LinearReLU, + nni.LinearBn1d: nniqat.LinearBn1d, +} + +# Default map for swapping dynamic modules +DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = { + nn.GRUCell: nnqd.GRUCell, + nn.Linear: nnqd.Linear, + nnqatd.Linear: nnqd.Linear, + nn.modules.linear.NonDynamicallyQuantizableLinear: nnqd.Linear, + nn.LSTM: nnqd.LSTM, + nn.GRU: nnqd.GRU, + nn.LSTMCell: nnqd.LSTMCell, + nn.RNNCell: nnqd.RNNCell, + nni.LinearReLU: nniqd.LinearReLU, + nn.EmbeddingBag: nnq.EmbeddingBag, + nn.Embedding: nnq.Embedding, + # Don't want to enable these by default because the numerical + # accuracy is poor compared to other dynamic ops + # nn.Conv1d: nnqd.Conv1d, + # nn.Conv2d: nnqd.Conv2d, + # nn.Conv3d: nnqd.Conv3d, + # nn.ConvTranspose1d: nnqd.ConvTranspose1d, + # nn.ConvTranspose2d: nnqd.ConvTranspose2d, + # nn.ConvTranspose3d: nnqd.ConvTranspose3d, +} + +# Allowlist for propagating the qconfig +_INCLUDE_QCONFIG_PROPAGATE_LIST: set[Callable] = { + nn.Sequential, +} + +# Default mapping from floating point function or torch ops to quantized ops +# TODO: merge with default static mapping +DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS: dict[Callable | str, Callable] = { + F.elu: torch.ops.quantized.elu, + F.hardswish: torch.ops.quantized.hardswish, + F.instance_norm: torch.ops.quantized.instance_norm, + F.layer_norm: torch.ops.quantized.layer_norm, + F.leaky_relu: torch.ops.quantized.leaky_relu, + F.dropout: torch.ops.quantized.dropout, +} + 
+# mapping from module to output activation post process class +DEFAULT_MODULE_TO_ACT_POST_PROCESS: dict[Callable, Callable] = { + nn.Hardsigmoid: default_fixed_qparams_range_0to1_fake_quant, + nn.Sigmoid: default_fixed_qparams_range_0to1_fake_quant, + nn.Softmax: default_fixed_qparams_range_0to1_fake_quant, + nn.Tanh: default_fixed_qparams_range_neg1to1_fake_quant, +} + +# Default map for swapping float module to static sparse quantized ones +DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = { + nn.Linear: ao_nn.sparse.quantized.Linear +} + +# Default map for swapping float module to dynamic sparse quantized ones +DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = { + nn.Linear: ao_nn.sparse.quantized.dynamic.Linear +} + + +def no_observer_set() -> set[Any]: + r"""These modules cannot have observers inserted by default.""" + no_observers = {nn.quantizable.LSTM, nn.quantizable.MultiheadAttention} + return no_observers + + +def get_default_static_quant_module_mappings() -> dict[Callable, Any]: + """Get module mapping for post training static quantization""" + return copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS) + + +def get_default_static_quant_reference_module_mappings() -> dict[Callable, Any]: + """Get reference module mapping for post training static quantization""" + return copy.deepcopy(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS) + + +def get_embedding_static_quant_module_mappings() -> dict[Callable, Any]: + """Get module mapping, including mapping for embedding QAT""" + mapping = copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS) + mapping[nnqat.EmbeddingBag] = nnq.EmbeddingBag + mapping[nnqat.Embedding] = nnq.Embedding + return mapping + + +def get_default_static_sparse_quant_module_mappings() -> dict[Callable, Any]: + """Get module mapping for post training static sparse quantization""" + return copy.deepcopy(DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS) + + +def get_static_quant_module_class( + float_module_class: 
Callable, + additional_static_quant_mapping: dict[Callable, Any] | None = None, + is_reference: bool = False, +) -> Any: + r"""n Get the statically quantized module class corresponding to + the floating point module class + """ + if additional_static_quant_mapping is None: + additional_static_quant_mapping = {} + all_mappings = get_combined_dict( + DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS + if is_reference + else DEFAULT_STATIC_QUANT_MODULE_MAPPINGS, + additional_static_quant_mapping, + ) + static_quant_module_class = all_mappings.get(float_module_class, None) + if static_quant_module_class is None: + raise AssertionError( + f"Floating point module class {str(float_module_class)}" + + " does not have a corresponding quantized module class" + ) + return copy.deepcopy(static_quant_module_class) + + +def get_dynamic_quant_module_class( + float_module_class: Callable, + additional_dynamic_quant_mapping: dict[Callable, Any] | None = None, +) -> Any: + r"""n Get the dynamically quantized module class corresponding to + the floating point module class + """ + if additional_dynamic_quant_mapping is None: + additional_dynamic_quant_mapping = {} + all_mappings = get_combined_dict( + DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, additional_dynamic_quant_mapping + ) + dynamic_quant_module_class = all_mappings.get(float_module_class, None) + if dynamic_quant_module_class is None: + raise AssertionError( + f"Floating point module class {str(float_module_class)}" + + " does not have a corresponding quantized module class" + ) + return copy.deepcopy(dynamic_quant_module_class) + + +def get_default_qat_module_mappings() -> dict[Callable, Any]: + """Get default module mapping for quantization aware training""" + return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS) + + +def get_embedding_qat_module_mappings() -> dict[Callable, Any]: + """Get module mapping for quantization aware training + This is includes default values in addition to + enabling qat for embeddings. 
+ """ + mapping = copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS) + mapping[nn.EmbeddingBag] = nnqat.EmbeddingBag + mapping[nn.Embedding] = nnqat.Embedding + return mapping + + +def get_default_dynamic_quant_module_mappings() -> dict[Callable, Any]: + """Get module mapping for post training dynamic quantization""" + return DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS + + +def get_default_dynamic_sparse_quant_module_mappings() -> dict[Callable, Any]: + """Get module mapping for post training dynamic sparse quantization""" + return DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS + + +def get_default_qconfig_propagation_list() -> set[Callable]: + """Get the default list of module types that we'll attach qconfig + attribute to in prepare + """ + QCONFIG_PROPAGATE_MODULE_CLASS_LIST = ( + set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.keys()) + | set(DEFAULT_QAT_MODULE_MAPPINGS.keys()) + | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys()) + | _INCLUDE_QCONFIG_PROPAGATE_LIST + ) + return copy.deepcopy(QCONFIG_PROPAGATE_MODULE_CLASS_LIST) + + +def get_default_compare_output_module_list() -> set[Callable]: + """Get list of module class types that we will record output + in numeric suite + """ + NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST = ( + set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.values()) + | set(DEFAULT_QAT_MODULE_MAPPINGS.values()) + | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.values()) + | set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.keys()) + | set(DEFAULT_QAT_MODULE_MAPPINGS.keys()) + | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys()) + | _INCLUDE_QCONFIG_PROPAGATE_LIST + ) + return copy.deepcopy(NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST) + + +def get_default_float_to_quantized_operator_mappings() -> dict[ + Callable | str, Callable +]: + return copy.deepcopy(DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS) + + +# TODO: merge with get_static_quant_module_class +def get_quantized_operator(float_op: Callable | str) -> Callable: + """Get the quantized operator corresponding to the 
float operator""" + quantized_op = DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS.get(float_op) + if quantized_op is None: + raise AssertionError( + f"Operator {str(float_op)} does not have corresponding quantized op" + ) + return quantized_op + + +def _get_special_act_post_process(module: torch.nn.Module) -> Callable | None: + r"""Get the special activation post process for `module`, this has + higher priority than the activation post process in `qconfig` + e.g. + input: torch.nn.Sigmoid + output: default_affine_fixed_qparam_fake_quant + """ + return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get( + type_before_parametrizations(module), None + ) + + +def _has_special_act_post_process(module: torch.nn.Module) -> bool: + return module.training and type(module) in DEFAULT_MODULE_TO_ACT_POST_PROCESS diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..e71dd24fda745d7f23f671eedaa1ff43df147a9a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize.py @@ -0,0 +1,829 @@ +# mypy: allow-untyped-defs +import copy +import inspect +import itertools +import typing_extensions +import warnings + +import torch +import torch.ao.nn.quantized as nnq +import torch.nn as nn +from torch.ao.nn.intrinsic import _FusedModule +from torch.ao.quantization.observer import _is_activation_post_process +from torch.ao.quantization.qconfig import ( + _activation_is_memoryless, + _add_module_to_qconfig_obs_ctr, + default_dynamic_qconfig, + float16_dynamic_qconfig, + float_qparams_weight_only_qconfig, + float_qparams_weight_only_qconfig_4bit, +) +from torch.ao.quantization.quantization_mappings import ( + _get_special_act_post_process, + _has_special_act_post_process, + get_default_dynamic_quant_module_mappings, + 
get_default_qat_module_mappings, + get_default_qconfig_propagation_list, + get_default_static_quant_module_mappings, + get_default_static_quant_reference_module_mappings, + no_observer_set, +) +from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper +from torch.nn.utils.parametrize import type_before_parametrizations + +from .utils import ( + DEPRECATION_WARNING, + get_qparam_dict, + has_no_children_ignoring_parametrizations, +) + + +__all__ = [ + "get_default_custom_config_dict", + "propagate_qconfig_", + "add_quant_dequant", + "prepare", + "quantize", + "quantize_dynamic", + "prepare_qat", + "quantize_qat", + "convert", + "swap_module", +] + + +# TODO remove this once BC is no longer required to avoid a SEV +is_activation_post_process = _is_activation_post_process + + +_DEFAULT_CUSTOM_CONFIG_DICT = { + "float_to_observed_custom_module_class": { + nn.LSTM: nn.quantizable.LSTM, + nn.MultiheadAttention: nn.quantizable.MultiheadAttention, + }, + "observed_to_quantized_custom_module_class": { + nn.quantizable.LSTM: nn.quantized.LSTM, + nn.quantizable.MultiheadAttention: nn.quantized.MultiheadAttention, + }, +} + + +def get_default_custom_config_dict(): + r"""Defines the default custom config dict.""" + return _DEFAULT_CUSTOM_CONFIG_DICT + + +def _propagate_qconfig_helper( + module, + qconfig_dict, + qconfig_parent=None, + prefix="", + prepare_custom_config_dict=None, +): + r"""This is a helper function for `propagate_qconfig_` + + Args: + module: input module + qconfig_dict: dictionary that maps from name of submodule to quantization + configuration + qconfig_parent: quantization config of parent module, we will fallback to + this config when there is no specified config for current + module + prefix: corresponding prefix of the current module, used as key in + qconfig_dict + prepare_custom_config_dict: dictionary for custom handling of modules + see docs for :func:`~torch.ao.quantization.prepare_fx` + + Return: + None, module is modified inplace with 
qconfig attached + """ + + module_qconfig = qconfig_dict.get( + type_before_parametrizations(module), qconfig_parent + ) + module_qconfig = qconfig_dict.get(prefix, module_qconfig) + module_qconfig = getattr(module, "qconfig", module_qconfig) + + torch.ao.quantization.qconfig._assert_valid_qconfig(module_qconfig, module) + + qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(module_qconfig, module) + module.qconfig = qconfig_with_device_check + + for name, child in module.named_children(): + module_prefix = prefix + "." + name if prefix else name + # do no not propagate qconfig to child if child is non traceable + if prepare_custom_config_dict is None or not ( + name in prepare_custom_config_dict.get("non_traceable_module_name", []) + or type(child) + in prepare_custom_config_dict.get("non_traceable_module_class", []) + ): + _propagate_qconfig_helper( + child, qconfig_dict, qconfig_with_device_check, module_prefix + ) + + +def propagate_qconfig_(module, qconfig_dict=None, prepare_custom_config_dict=None): + r"""Propagate qconfig through the module hierarchy and assign `qconfig` + attribute on each leaf module + + Args: + module: input module + qconfig_dict: dictionary that maps from name or type of submodule to + quantization configuration, qconfig applies to all submodules of a + given module unless qconfig for the submodules are specified (when + the submodule already has qconfig attribute) + prepare_custom_config_dict: dictionary for custom handling of modules + see docs for :func:`~torch.ao.quantization.prepare_fx` + + Return: + None, module is modified inplace with qconfig attached + """ + if qconfig_dict is None: + qconfig_dict = {} + if prepare_custom_config_dict is None: + prepare_custom_config_dict = {} + _propagate_qconfig_helper( + module, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict + ) + + +def _observer_forward_hook(self, input, output): + r"""Forward hook that calls observer on the output""" + return 
self.activation_post_process(output) + + +def _observer_forward_pre_hook(self, input): + r"""Forward pre hook that calls observer on the output""" + return self.activation_post_process(input[0]) + + +def _register_activation_post_process_hook(module, pre_hook=False): + if not hasattr(module, "activation_post_process"): + raise AssertionError( + "Expect activation_post_process attribute already attached to the module" + ) + if pre_hook: + module.register_forward_pre_hook(_observer_forward_pre_hook, prepend=True) + else: + module.register_forward_hook(_observer_forward_hook, prepend=True) + + +def _add_observer_( + module, + qconfig_propagation_list=None, + non_leaf_module_list=None, + device=None, + custom_module_class_mapping=None, +): + r"""Add observer for the leaf child of the module. + + This function insert observer module to all leaf child module that + has a valid qconfig attribute. + + Args: + module: input module with qconfig attributes for all the leaf modules that we want to quantize + qconfig_propagation_list: a list of quantizable modules that will have observers added to them + if they are leaf nodes + device: parent device, if any + non_leaf_module_list: list of non-leaf modules we want to add observer + + Return: + None, module is modified inplace with added observer modules and forward_hooks + """ + if qconfig_propagation_list is None: + qconfig_propagation_list = get_default_qconfig_propagation_list() + + if custom_module_class_mapping is None: + custom_module_class_mapping = {} + + # respect device affinity when adding observers + if device is None: + devices = _get_unique_devices_(module) + if len(devices) > 1: + raise AssertionError( + f"_add_observer_ only works with cpu or single-device CUDA modules, but got devices {devices}" + ) + device = next(iter(devices)) if len(devices) > 0 else None + + def get_activation_post_process(qconfig, device, special_act_post_process=None): + activation = ( + qconfig.activation() + if special_act_post_process 
is None + else special_act_post_process() + ) + if device is not None: + activation.to(device) + return activation + + def needs_observation(m): + return hasattr(m, "qconfig") and m.qconfig is not None + + def insert_activation_post_process(m, special_act_post_process=None): + """Adds an activation post process module and register + a pre or post hook that calls the module + """ + # We don't insert observer/fake_quantize for DeQuantStub + if needs_observation(m) and not isinstance(m, DeQuantStub): + # observer and hook will be gone after we swap the module + m.add_module( + "activation_post_process", + get_activation_post_process( + m.qconfig, device, special_act_post_process + ), + ) + # Register observer as the first entry in the hook list + # All post forward hooks are preserved and will be executed after the observer before convert + _register_activation_post_process_hook( + m, pre_hook=_activation_is_memoryless(m.qconfig) + ) + + for name, child in module.named_children(): + # TODO remove Dropout special after codebase stable + if type_before_parametrizations(child) is nn.Dropout: + continue + elif issubclass( + type_before_parametrizations(child), (nnq.FloatFunctional, nnq.QFunctional) + ): + if needs_observation(child): + if not hasattr(child, "activation_post_process"): + raise AssertionError( + f"functional class {type_before_parametrizations(child)} has no pre-defined `activation_post_process`" + ) + child.activation_post_process = get_activation_post_process( + child.qconfig, device + ) + elif isinstance(child, _FusedModule): + # activation_post_process are now added directly to nn.Sequential/_FusedModule + if needs_observation(child): + insert_activation_post_process(child) + elif ( + non_leaf_module_list is not None + and type_before_parametrizations(child) in non_leaf_module_list + ): + if needs_observation(child): + insert_activation_post_process(child) + elif _has_special_act_post_process(child): + special_act_post_process = 
_get_special_act_post_process(child) + insert_activation_post_process(child, special_act_post_process) + elif ( + needs_observation(child) + and type_before_parametrizations(child) in custom_module_class_mapping + ): + observed_class = custom_module_class_mapping[ + type_before_parametrizations(child) + ] + observed_child = observed_class.from_float(child) + setattr(module, name, observed_child) + # TODO: These are the modules that cannot be observed + # Once there are more, we should move them to a separate list + if not issubclass(observed_class, tuple(no_observer_set())): + insert_activation_post_process(observed_child) + else: + _add_observer_( + child, + qconfig_propagation_list, + non_leaf_module_list, + device, + custom_module_class_mapping, + ) + + # Insert observers only for leaf nodes, note that this observer is for + # the output of the module, for input QuantStub will observe them + if ( + has_no_children_ignoring_parametrizations(module) + and not isinstance(module, torch.nn.Sequential) + and type_before_parametrizations(module) in qconfig_propagation_list + ): + insert_activation_post_process(module) + # This is a special case for AdaRound eager mode + # AdaRound contains weight_fake_quant to be propagated from API to convert + # leaf node check with a number of children looks naive assumption that blocks + # Adding an exception case for AdaRound + if ( + hasattr(module, "weight_fake_quant") + and not isinstance(module, torch.nn.Sequential) + and type_before_parametrizations(module) in qconfig_propagation_list + ): + insert_activation_post_process(module) + + +def _get_unique_devices_(module): + return {p.device for p in module.parameters() if p.device.type != "meta"} | { + p.device for p in module.buffers() if p.device.type != "meta" + } + + +def add_quant_dequant(module): + r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig + Note that this function will modify the children of module inplace and it + can return a new module 
which wraps the input module as well. + + Args: + module: input module with qconfig attributes for all the leaf modules + that we want to quantize + + Return: + Either the inplace modified module with submodules wrapped in + `QuantWrapper` based on qconfig or a new `QuantWrapper` module which + wraps the input module, the latter case only happens when the input + module is a leaf module and we want to quantize it. + """ + if ( + has_no_children_ignoring_parametrizations(module) + and hasattr(module, "qconfig") + and module.qconfig + ): + return QuantWrapper(module) + + for name, child in module.named_children(): + module._modules[name] = add_quant_dequant(child) + return module + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def prepare( + model, + inplace=False, + allow_list=None, + observer_non_leaf_module_list=None, + prepare_custom_config_dict=None, +): + r"""Prepares a copy of the model for quantization calibration or quantization-aware training. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + The model will be attached with observer or fake quant modules, and qconfig + will be propagated. + + Args: + `model`: input model to be modified in-place + `inplace`: carry out model transformations in-place, the original module is mutated + `allow_list`: list of quantizable modules + `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer + `prepare_custom_config_dict`: customization configuration dictionary for prepare function + + .. 
code-block:: python + + # Example of prepare_custom_config_dict: + prepare_custom_config_dict = { + # user will manually define the corresponding observed + # module class which has a from_float class method that converts + # float custom module to observed custom module + "float_to_observed_custom_module_class": {CustomModule: ObservedCustomModule} + } + + """ + torch._C._log_api_usage_once("quantization_api.quantize.prepare") + if prepare_custom_config_dict is None: + prepare_custom_config_dict = get_default_custom_config_dict() + custom_module_class_mapping = prepare_custom_config_dict.get( + "float_to_observed_custom_module_class", {} + ) + + if not inplace: + model = copy.deepcopy(model) + + # TODO: remove allow_list + qconfig_propagation_list = allow_list + if allow_list is None: + qconfig_propagation_list = get_default_qconfig_propagation_list() + propagate_qconfig_(model, qconfig_dict=None) + + # sanity check common API misusage + if not any(hasattr(m, "qconfig") and m.qconfig for m in model.modules()): + warnings.warn( + "None of the submodule got qconfig applied. 
Make sure you " + "passed correct configuration through `qconfig_dict` or " + "by assigning the `.qconfig` attribute directly on submodules", + stacklevel=2, + ) + + _add_observer_( + model, + qconfig_propagation_list, + observer_non_leaf_module_list, + custom_module_class_mapping=custom_module_class_mapping, + ) + return model + + +def _remove_activation_post_process(module): + # TODO: maybe we should change activation_post_process to _activation_post_process + # to prevent it from being used by user + if hasattr(module, "activation_post_process") and _is_activation_post_process( + module.activation_post_process + ): + delattr(module, "activation_post_process") + + # remove activation_post_process pre and post hooks + def remove_hooks(pre_hook=False): + hook_map = module._forward_pre_hooks if pre_hook else module._forward_hooks + observer_hook = ( + _observer_forward_pre_hook if pre_hook else _observer_forward_hook + ) + handle_ids_to_remove = set() + for handle_id, hook_fn in hook_map.items(): + if hook_fn is observer_hook: + handle_ids_to_remove.add(handle_id) + for handle_id in handle_ids_to_remove: + hook_map.pop(handle_id) + + remove_hooks(pre_hook=True) + remove_hooks(pre_hook=False) + + +# TODO: rename to something more general +def _remove_qconfig(module): + r"""Clean up the qconfig left in the module so that new qconfig can be + propagated. + + Args: + module: module to be cleaned up + """ + for child in module.children(): + _remove_qconfig(child) + + if hasattr(module, "qconfig"): + del module.qconfig + + _remove_activation_post_process(module) + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def quantize(model, run_fn, run_args, mapping=None, inplace=False): + r"""Quantize the input float model with post training static quantization. + + First it will prepare the model for calibration, then it calls + `run_fn` which will run the calibration step, after that we will + convert the model to a quantized model. 
+ + Args: + model: input float model + run_fn: a calibration function for calibrating the prepared model + run_args: positional arguments for `run_fn` + inplace: carry out model transformations in-place, the original module is mutated + mapping: correspondence between original module types and quantized counterparts + + Return: + Quantized model. + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize") + if mapping is None: + mapping = get_default_static_quant_module_mappings() + if not inplace: + model = copy.deepcopy(model) + model.eval() + prepare(model, inplace=True) + run_fn(model, *run_args) + convert(model, mapping, inplace=True) + return model + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def quantize_dynamic( + model, qconfig_spec=None, dtype=torch.qint8, mapping=None, inplace=False +): + r"""Converts a float model to dynamic (i.e. weights-only) quantized model. + + Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. + + For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization + by default is performed for layers with large weights size - i.e. Linear and RNN variants. + + Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`. + If `qconfig` is provided, the `dtype` argument is ignored. + + Args: + model: input model + qconfig_spec: Either: + + - A dictionary that maps from name or type of submodule to quantization + configuration, qconfig applies to all submodules of a given + module unless qconfig for the submodules are specified (when the + submodule already has qconfig attribute). Entries in the dictionary + need to be QConfig instances. 
+ + - A set of types and/or submodule names to apply dynamic quantization to, + in which case the `dtype` argument is used to specify the bit-width + + inplace: carry out model transformations in-place, the original module is mutated + mapping: maps type of a submodule to a type of corresponding dynamically quantized version + with which the submodule needs to be replaced + + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic") + if qconfig_spec is None: + if dtype == torch.qint8: + qconfig_spec = { + nn.Linear: default_dynamic_qconfig, + nn.LSTM: default_dynamic_qconfig, + nn.GRU: default_dynamic_qconfig, + nn.LSTMCell: default_dynamic_qconfig, + nn.RNNCell: default_dynamic_qconfig, + nn.GRUCell: default_dynamic_qconfig, + } + elif dtype == torch.float16: + qconfig_spec = { + nn.Linear: float16_dynamic_qconfig, + nn.LSTM: float16_dynamic_qconfig, + nn.GRU: float16_dynamic_qconfig, + nn.LSTMCell: float16_dynamic_qconfig, + nn.RNNCell: float16_dynamic_qconfig, + nn.GRUCell: float16_dynamic_qconfig, + } + elif dtype == torch.quint8: + qconfig_spec = { + nn.EmbeddingBag: float_qparams_weight_only_qconfig, + nn.Embedding: float_qparams_weight_only_qconfig, + } + elif dtype == torch.quint4x2: + qconfig_spec = { + nn.EmbeddingBag: float_qparams_weight_only_qconfig_4bit, + } + else: + raise ValueError( + f"Don't know how to quantize with default settings for {dtype}. 
Provide full qconfig please" + ) + elif isinstance(qconfig_spec, set): + if dtype is torch.qint8: + default_qconfig = default_dynamic_qconfig + elif dtype is torch.float16: + default_qconfig = float16_dynamic_qconfig + elif dtype is torch.quint8: + default_qconfig = float_qparams_weight_only_qconfig + elif dtype is torch.quint4x2: + default_qconfig = float_qparams_weight_only_qconfig_4bit + else: + raise RuntimeError( + "Unknown dtype specified for quantize_dynamic: ", str(dtype) + ) + qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) + + if mapping is None: + mapping = get_default_dynamic_quant_module_mappings() + + if not inplace: + model = copy.deepcopy(model) + model.eval() + propagate_qconfig_(model, qconfig_spec) + convert(model, mapping, inplace=True) + return model + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def prepare_qat(model, mapping=None, inplace=False): + r""" + Prepares a copy of the model for quantization calibration or + quantization-aware training and converts it to quantized version. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + Args: + model: input model to be modified in-place + mapping: dictionary that maps float modules to quantized modules to be + replaced. 
+ inplace: carry out model transformations in-place, the original module + is mutated + """ + torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat") + if not model.training: + raise AssertionError("prepare_qat only works on models in training mode") + if mapping is None: + mapping = get_default_qat_module_mappings() + + if not inplace: + model = copy.deepcopy(model) + + propagate_qconfig_(model, qconfig_dict=None) + convert(model, mapping=mapping, inplace=True, remove_qconfig=False) + prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True) + return model + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def quantize_qat(model, run_fn, run_args, inplace=False): + r"""Do quantization aware training and output a quantized model + + Args: + model: input model + run_fn: a function for evaluating the prepared model, can be a + function that simply runs the prepared model or a training + loop + run_args: positional arguments for `run_fn` + + Return: + Quantized model. + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat") + if not inplace: + model = copy.deepcopy(model) + model.train() + prepare_qat(model, inplace=True) + run_fn(model, *run_args) + convert(model, inplace=True) + return model + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def convert( + module, + mapping=None, + inplace=False, + remove_qconfig=True, + is_reference=False, + convert_custom_config_dict=None, + use_precomputed_fake_quant=False, +): + r"""Converts submodules in input module to a different module according to `mapping` + by calling `from_float` method on the target module class. And remove qconfig at the + end if remove_qconfig is set to True. 
+ + Args: + `module`: prepared and calibrated module + `mapping`: a dictionary that maps from source module type to target + module type, can be overwritten to allow swapping user defined + Modules + `inplace`: carry out model transformations in-place, the original module + is mutated + `convert_custom_config_dict`: custom configuration dictionary for convert function + `use_precomputed_fake_quant`: a flag to enable use of precomputed fake quant + + .. code-block:: python + + # Example of convert_custom_config_dict: + convert_custom_config_dict = { + # user will manually define the corresponding quantized + # module class which has a from_observed class method that converts + # observed custom module to quantized custom module + "observed_to_quantized_custom_module_class": { + ObservedCustomModule: QuantizedCustomModule + } + } + + """ + torch._C._log_api_usage_once("quantization_api.quantize.convert") + if not inplace: + module = copy.deepcopy(module) + _convert( + module, + mapping, + inplace=True, + is_reference=is_reference, + convert_custom_config_dict=convert_custom_config_dict, + use_precomputed_fake_quant=use_precomputed_fake_quant, + ) + if remove_qconfig: + _remove_qconfig(module) + return module + + +def _convert( + module, + mapping=None, + inplace=False, + is_reference=False, + convert_custom_config_dict=None, + use_precomputed_fake_quant=False, +): + r"""Converts submodules in input module to a different module according to `mapping` + by calling `from_float` method on the target module class + + Args: + module: input module + mapping: a dictionary that maps from source module type to target + module type, can be overwritten to allow swapping user defined + Modules + inplace: carry out model transformations in-place, the original module + is mutated + is_reference: a flag to enable quantized reference module + use_precomputed_fake_quant: a flag to enable use of precomputed fake quant + + """ + if mapping is None: + mapping = ( + 
get_default_static_quant_reference_module_mappings() + if is_reference + else get_default_static_quant_module_mappings() + ) + if convert_custom_config_dict is None: + convert_custom_config_dict = get_default_custom_config_dict() + custom_module_class_mapping = convert_custom_config_dict.get( + "observed_to_quantized_custom_module_class", {} + ) + + if not inplace: + module = copy.deepcopy(module) + reassign = {} + for name, mod in module.named_children(): + # both fused modules and observed custom modules are + # swapped as one unit + if ( + not isinstance(mod, _FusedModule) + and type_before_parametrizations(mod) not in custom_module_class_mapping + ): + _convert( + mod, + mapping, + True, # inplace + is_reference, + convert_custom_config_dict, + use_precomputed_fake_quant=use_precomputed_fake_quant, + ) + reassign[name] = swap_module( + mod, mapping, custom_module_class_mapping, use_precomputed_fake_quant + ) + + for key, value in reassign.items(): + module._modules[key] = value + + return module + + +def swap_module( + mod, mapping, custom_module_class_mapping, use_precomputed_fake_quant=False +): + r"""Swaps the module if it has a quantized counterpart and it has an + `observer` attached. 
+ + Args: + mod: input module + mapping: a dictionary that maps from nn module to nnq module + + Return: + The corresponding quantized module of `mod` + """ + new_mod = mod + if hasattr(mod, "qconfig") and mod.qconfig is not None: + swapped = False + if type_before_parametrizations(mod) in custom_module_class_mapping: + new_mod = custom_module_class_mapping[ + type_before_parametrizations(mod) + ].from_observed(mod) + swapped = True + elif type_before_parametrizations(mod) in mapping: + qmod = mapping[type_before_parametrizations(mod)] + if hasattr(qmod, "_IS_REFERENCE") and qmod._IS_REFERENCE: + if mod.qconfig is None: + raise AssertionError( + "module qconfig must not be None when swapping to reference module" + ) + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + weight_qparams = get_qparam_dict(weight_post_process) + new_mod = qmod.from_float(mod, weight_qparams) + else: + sig = inspect.signature(qmod.from_float) + if "use_precomputed_fake_quant" in sig.parameters: + new_mod = qmod.from_float( + mod, use_precomputed_fake_quant=use_precomputed_fake_quant + ) + else: + new_mod = qmod.from_float(mod) + swapped = True + + if swapped: + # Preserve module's pre forward hooks. 
They'll be called on quantized input + for pre_hook_fn in mod._forward_pre_hooks.values(): + new_mod.register_forward_pre_hook(pre_hook_fn) + # Preserve module's post forward hooks except _observer_forward_hook + # After convert they'll work with quantized output + for hook_fn in mod._forward_hooks.values(): + if hook_fn is not _observer_forward_hook: + new_mod.register_forward_hook(hook_fn) + + # respect device affinity when swapping modules + devices = _get_unique_devices_(mod) + if not ( + len(devices) <= 1 + or (len(devices) == 2 and torch.device("meta") in devices) + ): + raise AssertionError( + f"swap_module only works with cpu or single-device CUDA modules, but got devices {devices}" + ) + device = next(iter(devices)) if len(devices) > 0 else None + if device: + new_mod.to(device) + return new_mod + + +def _get_observer_dict(mod, target_dict, prefix=""): + r"""Traverse the modules and save all observers into dict. + This is mainly used for quantization accuracy debug + Args: + mod: the top module we want to save all observers + prefix: the prefix for the current module + target_dict: the dictionary used to save all the observers + """ + + def get_prefix(prefix): + return prefix if prefix == "" else prefix + "." 
+ + if hasattr(mod, "activation_post_process"): + target_dict[get_prefix(prefix) + "activation_post_process"] = ( + mod.activation_post_process + ) + for name, child in mod.named_children(): + module_prefix = get_prefix(prefix) + name if prefix else name + _get_observer_dict(child, target_dict, module_prefix) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_fx.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_fx.py new file mode 100644 index 0000000000000000000000000000000000000000..ba6ab86aaa048fbd128f9a89cc32d4e438d3fe12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_fx.py @@ -0,0 +1,759 @@ +import copy +import typing_extensions +import warnings +from typing import Any + +import torch +from torch.fx import GraphModule +from torch.fx.graph_module import _USER_PRESERVED_ATTRIBUTES_KEY + +from .backend_config import BackendConfig, get_tensorrt_backend_config # noqa: F401 +from .fx.convert import convert +from .fx.custom_config import ConvertCustomConfig, FuseCustomConfig, PrepareCustomConfig +from .fx.fuse import fuse # noqa: F401 +from .fx.graph_module import ObservedGraphModule # noqa: F401 +from .fx.prepare import prepare # noqa: F401 +from .fx.tracer import QuantizationTracer, Scope, ScopeContextManager # noqa: F401 +from .fx.utils import ( # noqa: F401 + get_custom_module_class_keys, + get_skipped_module_name_and_classes, +) +from .qconfig_mapping import QConfigMapping +from .utils import DEPRECATION_WARNING + + +def attach_preserved_attrs_to_model( + model: GraphModule | torch.nn.Module, + preserved_attrs: dict[str, Any], +) -> None: + """Store preserved attributes to the model.meta so that it can be preserved during deepcopy""" + model.meta[_USER_PRESERVED_ATTRIBUTES_KEY] = copy.copy(preserved_attrs) # type: ignore[operator, index, assignment] + # set the preserved attributes in the model so that 
user can call + # model.attr as they do before calling fx graph mode quantization + for attr_name, attr in model.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items(): # type: ignore[index, union-attr] + setattr(model, attr_name, attr) + + +def _check_is_graph_module(model: torch.nn.Module) -> None: + if not isinstance(model, GraphModule): + raise ValueError( + "input model must be a GraphModule, " + + "Got type:" + + str(type(model)) + + " Please make " + + "sure to follow the tutorials." + ) + + +def _attach_meta_to_node_if_not_exist(model: GraphModule) -> None: + """Attach meta field to all nodes of the graph if it does not exist, + meta field is a field stores some meta information about the node, such + as dtype and shape information for output of the node, this only exists + if the program is captured by make_fx (used in quantize_pt2e flow), if + the program is captured by torch.fx symbolic tracing, this field may not exist, + so we add it here to avoid checking this all over the places + """ + for node in model.graph.nodes: + if not hasattr(node, "meta"): + node.meta = {} + + +def _swap_ff_with_fxff(model: torch.nn.Module) -> None: + r"""Swap FloatFunctional with FXFloatFunctional""" + modules_to_swap = [] + for name, module in model.named_children(): + if isinstance(module, torch.ao.nn.quantized.FloatFunctional): + modules_to_swap.append(name) + else: + _swap_ff_with_fxff(module) + + for name in modules_to_swap: + del model._modules[name] + model._modules[name] = torch.ao.nn.quantized.FXFloatFunctional() + + +def _fuse_fx( + model: GraphModule, + is_qat: bool, + fuse_custom_config: FuseCustomConfig | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""Internal helper function to fuse modules in preparation for quantization + + Args: + model: GraphModule object from symbolic tracing (torch.fx.symbolic_trace) + """ + _check_is_graph_module(model) + return fuse(model, is_qat, fuse_custom_config, 
backend_config) # type: ignore[operator] + + +def _prepare_fx( + model: torch.nn.Module, + qconfig_mapping: QConfigMapping | dict[str, Any], + is_qat: bool, + example_inputs: tuple[Any, ...], + prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None, + _equalization_config: QConfigMapping | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, + is_standalone_module: bool = False, +) -> GraphModule: + r"""Internal helper function for prepare_fx + Args: + `model`, `qconfig_mapping`, `prepare_custom_config`, `_equalization_config`: + see docs for :func:`~torch.ao.quantization.prepare_fx` + `is_standalone_module`: a boolean flag indicates whether we are + quantizing a standalone module or not, a standalone module + is a submodule of the parent module that is not inlined in the + forward graph of the parent module, + the way we quantize standalone module is described in: + :func:`~torch.ao.quantization._prepare_standalone_module_fx` + """ + if prepare_custom_config is None: + prepare_custom_config = PrepareCustomConfig() + if _equalization_config is None: + _equalization_config = QConfigMapping() + + if isinstance(prepare_custom_config, dict): + warnings.warn( + "Passing a prepare_custom_config_dict to prepare is deprecated and will not be supported " + "in a future version. 
Please pass in a PrepareCustomConfig instead.", + FutureWarning, + stacklevel=3, + ) + prepare_custom_config = PrepareCustomConfig.from_dict(prepare_custom_config) + + # swap FloatFunctional with FXFloatFunctional + _swap_ff_with_fxff(model) + + skipped_module_names, skipped_module_classes = get_skipped_module_name_and_classes( + prepare_custom_config, is_standalone_module + ) + preserved_attr_names = prepare_custom_config.preserved_attributes + preserved_attrs = { + attr: getattr(model, attr) + for attr in preserved_attr_names + if hasattr(model, attr) + } + # symbolically trace the model + tracer = QuantizationTracer(skipped_module_names, skipped_module_classes) # type: ignore[arg-type] + graph_module = GraphModule(model, tracer.trace(model)) + _attach_meta_to_node_if_not_exist(graph_module) + + fuse_custom_config = FuseCustomConfig().set_preserved_attributes( + prepare_custom_config.preserved_attributes + ) + graph_module = _fuse_fx(graph_module, is_qat, fuse_custom_config, backend_config) + prepared = prepare( + graph_module, + qconfig_mapping, + is_qat, + tracer.node_name_to_scope, + example_inputs=example_inputs, + prepare_custom_config=prepare_custom_config, + _equalization_config=_equalization_config, + backend_config=backend_config, + is_standalone_module=is_standalone_module, + ) # type: ignore[operator] + + attach_preserved_attrs_to_model(prepared, preserved_attrs) + return prepared + + +def _prepare_standalone_module_fx( + model: torch.nn.Module, + qconfig_mapping: QConfigMapping | dict[str, Any], + is_qat: bool, + example_inputs: tuple[Any, ...], + prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""[Internal use only] Prepare a standalone module, so that it can be used when quantizing the + parent module. + standalone_module means it a submodule that is not inlined in parent module, + and will be quantized separately as one unit. 
+ + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module + + Returns: + + * model(GraphModule): prepared standalone module. It has these attributes in + model.meta: + + * `standalone_module_input_quantized_idxs(List[Int])`: a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + * `standalone_module_output_quantized_idxs(List[Int])`: a list of + indices for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module + + """ + return _prepare_fx( + model, + qconfig_mapping, + is_qat, + example_inputs, + prepare_custom_config, + backend_config=backend_config, + is_standalone_module=True, + ) + + +def fuse_fx( + model: torch.nn.Module, + fuse_custom_config: FuseCustomConfig | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode. + Fusion rules are defined in torch.ao.quantization.fx.fusion_pattern.py + + Args: + + * `model` (torch.nn.Module): a torch.nn.Module model + * `fuse_custom_config` (FuseCustomConfig): custom configurations for fuse_fx. + See :class:`~torch.ao.quantization.fx.custom_config.FuseCustomConfig` for more details + Example:: + + from torch.ao.quantization import fuse_fx + + m = Model().eval() + m = fuse_fx(m) + + """ + if fuse_custom_config is None: + fuse_custom_config = FuseCustomConfig() + + if isinstance(fuse_custom_config, dict): + warnings.warn( + "Passing a fuse_custom_config_dict to fuse is deprecated and will not be supported " + "in a future version. 
Please pass in a FuseCustomConfig instead.", + FutureWarning, + stacklevel=2, + ) + fuse_custom_config = FuseCustomConfig.from_dict(fuse_custom_config) + + torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx") + preserved_attr_names = fuse_custom_config.preserved_attributes + preserved_attrs = { + attr: getattr(model, attr) + for attr in preserved_attr_names + if hasattr(model, attr) + } + + graph_module = torch.fx.symbolic_trace(model) + _attach_meta_to_node_if_not_exist(graph_module) + graph_module = _fuse_fx(graph_module, False, fuse_custom_config, backend_config) + + attach_preserved_attrs_to_model(graph_module, preserved_attrs) + return graph_module + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def prepare_fx( + model: torch.nn.Module, + qconfig_mapping: QConfigMapping | dict[str, Any], + example_inputs: tuple[Any, ...], + prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None, + _equalization_config: QConfigMapping | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r""" Prepare a model for post training quantization + + Args: + * `model` (torch.nn.Module): torch.nn.Module model + + * `qconfig_mapping` (QConfigMapping): QConfigMapping object to configure how a model is + quantized, see :class:`~torch.ao.quantization.qconfig_mapping.QConfigMapping` + for more details + + * `example_inputs` (Tuple[Any, ...]): Example inputs for forward function of the model, + Tuple of positional args (keyword args can be passed as positional args as well) + + * `prepare_custom_config` (PrepareCustomConfig): customization configuration for quantization tool. 
+ See :class:`~torch.ao.quantization.fx.custom_config.PrepareCustomConfig` for more details + + * `_equalization_config`: config for specifying how to perform equalization on the model + + * `backend_config` (BackendConfig): config that specifies how operators are quantized + in a backend, this includes how the operators are observed, + supported fusion patterns, how quantize/dequantize ops are + inserted, supported dtypes etc. See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details + + Return: + A GraphModule with observer (configured by qconfig_mapping), ready for calibration + + Example:: + + import torch + from torch.ao.quantization import get_default_qconfig_mapping + from torch.ao.quantization.quantize_fx import prepare_fx + + class Submodule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 5) + def forward(self, x): + x = self.linear(x) + return x + + class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.sub = Submodule() + + def forward(self, x): + x = self.linear(x) + x = self.sub(x) + x + return x + + # initialize a floating point model + float_model = M().eval() + + # define calibration function + def calibrate(model, data_loader): + model.eval() + with torch.no_grad(): + for image, target in data_loader: + model(image) + + # qconfig is the configuration for how we insert observers for a particular + # operator + # qconfig = get_default_qconfig("fbgemm") + # Example of customizing qconfig: + # qconfig = torch.ao.quantization.QConfig( + # activation=MinMaxObserver.with_args(dtype=torch.qint8), + # weight=MinMaxObserver.with_args(dtype=torch.qint8)) + # `activation` and `weight` are constructors of observer module + + # qconfig_mapping is a collection of quantization configurations, user can + # set the qconfig for each operator (torch op calls, functional calls, module calls) + # in the model through 
qconfig_mapping + # the following call will get the qconfig_mapping that works best for models + # that target "fbgemm" backend + qconfig_mapping = get_default_qconfig_mapping("fbgemm") + + # We can customize qconfig_mapping in different ways. + # e.g. set the global qconfig, which means we will use the same qconfig for + # all operators in the model, this can be overwritten by other settings + # qconfig_mapping = QConfigMapping().set_global(qconfig) + # e.g. quantize the linear submodule with a specific qconfig + # qconfig_mapping = QConfigMapping().set_module_name("linear", qconfig) + # e.g. quantize all nn.Linear modules with a specific qconfig + # qconfig_mapping = QConfigMapping().set_object_type(torch.nn.Linear, qconfig) + # for a more complete list, please see the docstring for :class:`torch.ao.quantization.QConfigMapping` + # argument + + # example_inputs is a tuple of inputs, that is used to infer the type of the + # outputs in the model + # currently it's not used, but please make sure model(*example_inputs) runs + example_inputs = (torch.randn(1, 3, 224, 224),) + + # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack + # e.g. backend_config = get_default_backend_config("fbgemm") + # `prepare_fx` inserts observers in the model based on qconfig_mapping and + # backend_config. 
If the configuration for an operator in qconfig_mapping + # is supported in the backend_config (meaning it's supported by the target + # hardware), we'll insert observer modules according to the qconfig_mapping + # otherwise the configuration in qconfig_mapping will be ignored + # + # Example: + # in qconfig_mapping, user sets linear module to be quantized with quint8 for + # activation and qint8 for weight: + # qconfig = torch.ao.quantization.QConfig( + # observer=MinMaxObserver.with_args(dtype=torch.quint8), + # weight=MinMaxObserver.with-args(dtype=torch.qint8)) + # Note: current qconfig api does not support setting output observer, but + # we may extend this to support these more fine grained control in the + # future + # + # qconfig_mapping = QConfigMapping().set_object_type(torch.nn.Linear, qconfig) + # in backend config, linear module also supports in this configuration: + # weighted_int8_dtype_config = DTypeConfig( + # input_dtype=torch.quint8, + # output_dtype=torch.quint8, + # weight_dtype=torch.qint8, + # bias_type=torch.float) + + # linear_pattern_config = BackendPatternConfig(torch.nn.Linear) \ + # .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \ + # .add_dtype_config(weighted_int8_dtype_config) \ + # ... 
+ + # backend_config = BackendConfig().set_backend_pattern_config(linear_pattern_config) + # `prepare_fx` will check that the setting requested by suer in qconfig_mapping + # is supported by the backend_config and insert observers and fake quant modules + # in the model + prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) + # Run calibration + calibrate(prepared_model, sample_inference_data) + """ + torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx") + return _prepare_fx( + model, + qconfig_mapping, + False, # is_qat + example_inputs, + prepare_custom_config, + _equalization_config, + backend_config, + ) + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def prepare_qat_fx( + model: torch.nn.Module, + qconfig_mapping: QConfigMapping | dict[str, Any], + example_inputs: tuple[Any, ...], + prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""Prepare a model for quantization aware training + + Args: + * `model` (torch.nn.Module): torch.nn.Module model + * `qconfig_mapping` (QConfigMapping): see :func:`~torch.ao.quantization.prepare_fx` + * `example_inputs` (Tuple[Any, ...]): see :func:`~torch.ao.quantization.prepare_fx` + * `prepare_custom_config` (PrepareCustomConfig): see :func:`~torch.ao.quantization.prepare_fx` + * `backend_config` (BackendConfig): see :func:`~torch.ao.quantization.prepare_fx` + + Return: + A GraphModule with fake quant modules (configured by qconfig_mapping and backend_config), ready for + quantization aware training + + Example:: + + import torch + from torch.ao.quantization import get_default_qat_qconfig_mapping + from torch.ao.quantization.quantize_fx import prepare_qat_fx + + + class Submodule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 5) + + def forward(self, x): + x = self.linear(x) + return x + + + class M(torch.nn.Module): 
+ def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.sub = Submodule() + + def forward(self, x): + x = self.linear(x) + x = self.sub(x) + x + return x + + + # initialize a floating point model + float_model = M().train() + # (optional, but preferred) load the weights from pretrained model + # float_model.load_weights(...) + + + # define the training loop for quantization aware training + def train_loop(model, train_data): + model.train() + for image, target in data_loader: + ... + + + # qconfig is the configuration for how we insert observers for a particular + # operator + # qconfig = get_default_qconfig("fbgemm") + # Example of customizing qconfig: + # qconfig = torch.ao.quantization.QConfig( + # activation=FakeQuantize.with_args(observer=MinMaxObserver.with_args(dtype=torch.qint8)), + # weight=FakeQuantize.with_args(observer=MinMaxObserver.with_args(dtype=torch.qint8))) + # `activation` and `weight` are constructors of observer module + + # qconfig_mapping is a collection of quantization configurations, user can + # set the qconfig for each operator (torch op calls, functional calls, module calls) + # in the model through qconfig_mapping + # the following call will get the qconfig_mapping that works best for models + # that target "fbgemm" backend + qconfig_mapping = get_default_qat_qconfig_mapping("fbgemm") + + # We can customize qconfig_mapping in different ways, please take a look at + # the docstring for :func:`~torch.ao.quantization.prepare_fx` for different ways + # to configure this + + # example_inputs is a tuple of inputs, that is used to infer the type of the + # outputs in the model + # currently it's not used, but please make sure model(*example_inputs) runs + example_inputs = (torch.randn(1, 3, 224, 224),) + + # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack + # e.g. 
backend_config = get_default_backend_config("fbgemm") + # `prepare_qat_fx` inserts observers in the model based on qconfig_mapping and + # backend_config, if the configuration for an operator in qconfig_mapping + # is supported in the backend_config (meaning it's supported by the target + # hardware), we'll insert fake_quantize modules according to the qconfig_mapping + # otherwise the configuration in qconfig_mapping will be ignored + # see :func:`~torch.ao.quantization.prepare_fx` for a detailed explanation of + # how qconfig_mapping interacts with backend_config + prepared_model = prepare_qat_fx(float_model, qconfig_mapping, example_inputs) + # Run training + train_loop(prepared_model, train_loop) + + """ + torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx") + return _prepare_fx( + model, + qconfig_mapping, + True, # is_qat + example_inputs, + prepare_custom_config, + backend_config=backend_config, + ) + + +def _convert_fx( + graph_module: GraphModule, + is_reference: bool, + convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None, + is_standalone_module: bool = False, + _remove_qconfig: bool = True, + qconfig_mapping: QConfigMapping | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, + is_decomposed: bool = False, + keep_original_weights: bool = False, +) -> GraphModule: + """`is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx`""" + if convert_custom_config is None: + convert_custom_config = ConvertCustomConfig() + + if isinstance(convert_custom_config, dict): + warnings.warn( + "Passing a convert_custom_config_dict to convert is deprecated and will not be supported " + "in a future version. 
Please pass in a ConvertCustomConfig instead.", + FutureWarning, + stacklevel=3, + ) + convert_custom_config = ConvertCustomConfig.from_dict(convert_custom_config) + + _check_is_graph_module(graph_module) + preserved_attr_names = convert_custom_config.preserved_attributes + preserved_attrs = { + attr: getattr(graph_module, attr) + for attr in preserved_attr_names + if hasattr(graph_module, attr) + } + + quantized = convert( + graph_module, + is_reference, + convert_custom_config, + is_standalone_module, + _remove_qconfig_flag=_remove_qconfig, + qconfig_mapping=qconfig_mapping, + backend_config=backend_config, + is_decomposed=is_decomposed, + keep_original_weights=keep_original_weights, + ) + + attach_preserved_attrs_to_model(quantized, preserved_attrs) + return quantized + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def convert_fx( + graph_module: GraphModule, + convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None, + _remove_qconfig: bool = True, + qconfig_mapping: QConfigMapping | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, + keep_original_weights: bool = False, +) -> GraphModule: + r"""Convert a calibrated or trained model to a quantized model + + Args: + * `graph_module` (torch.fx.GraphModule): A prepared and calibrated/trained model (GraphModule) + + * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function. + See :class:`~torch.ao.quantization.fx.custom_config.ConvertCustomConfig` for more details + + * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert. + + * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization. + + The keys must include the ones in the qconfig_mapping passed to `prepare_fx` or `prepare_qat_fx`, + with the same values or `None`. Additional keys can be specified with values set to `None`. 
+ + For each entry whose value is set to None, we skip quantizing that entry in the model:: + + qconfig_mapping = QConfigMapping + .set_global(qconfig_from_prepare) + .set_object_type(torch.nn.functional.add, None) # skip quantizing torch.nn.functional.add + .set_object_type(torch.nn.functional.linear, qconfig_from_prepare) + .set_module_name("foo.bar", None) # skip quantizing module "foo.bar" + + * `backend_config` (BackendConfig): A configuration for the backend which describes how + operators should be quantized in the backend, this includes quantization + mode support (static/dynamic/weight_only), dtype support (quint8/qint8 etc.), + observer placement for each operators and fused operators. + See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details + + Return: + A quantized model (torch.nn.Module) + + Example:: + + # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training + # convert_fx converts a calibrated/trained model to a quantized model for the + # target hardware, this includes converting the model first to a reference + # quantized model, and then lower the reference quantized model to a backend + # Currently, the supported backends are fbgemm (onednn), qnnpack (xnnpack) and + # they share the same set of quantized operators, so we are using the same + # lowering procedure + # + # backend_config defines the corresponding reference quantized module for + # the weighted modules in the model, e.g. nn.Linear + # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack + # e.g. 
backend_config = get_default_backend_config("fbgemm") + quantized_model = convert_fx(prepared_model) + + """ + torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_fx") + return _convert_fx( + graph_module, + is_reference=False, + convert_custom_config=convert_custom_config, + _remove_qconfig=_remove_qconfig, + qconfig_mapping=qconfig_mapping, + backend_config=backend_config, + keep_original_weights=keep_original_weights, + ) + + +def convert_to_reference_fx( + graph_module: GraphModule, + convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None, + _remove_qconfig: bool = True, + qconfig_mapping: QConfigMapping | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""Convert a calibrated or trained model to a reference quantized model, + see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details, + reference quantized model is a standard representation of a quantized model provided + by FX Graph Mode Quantization, it can be further lowered to run on the target + hardware, like accelerators + + Args: + * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule) + + * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function. + See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert. + + * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization. + See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + * `backend_config` (BackendConfig): A configuration for the backend which describes how + operators should be quantized in the backend. See + :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. 
+ + Return: + A reference quantized model (GraphModule) + + Example:: + + # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training + # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack + # e.g. backend_config = get_default_backend_config("fbgemm") + reference_quantized_model = convert_to_reference_fx(prepared_model) + + """ + torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_to_reference_fx") + return _convert_fx( + graph_module, + is_reference=True, + convert_custom_config=convert_custom_config, + _remove_qconfig=_remove_qconfig, + qconfig_mapping=qconfig_mapping, + backend_config=backend_config, + ) + + +def _convert_to_reference_decomposed_fx( + graph_module: GraphModule, + convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None, + qconfig_mapping: QConfigMapping | dict[str, Any] | None = None, + backend_config: BackendConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""Convert a calibrated or trained model to a reference quantized model, with + decomposed representation for quantized Tensor + see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details, + reference quantized model is a standard representation of a quantized model provided + by FX Graph Mode Quantization, it can be further lowered to run on the target + hardware, like accelerators + + Note: this is not public API + + Args: + * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule) + + * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function. + See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert. + + * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization. 
+ See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + * `backend_config` (BackendConfig): A configuration for the backend which describes how + operators should be quantized in the backend. See + :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + Return: + A reference quantized model (GraphModule) with operators working with decomposed quantized Tensor + + Example:: + + # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training + # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack + # e.g. backend_config = get_default_backend_config("fbgemm") + reference_quantized_model = _convert_to_reference_decomposed_fx(prepared_model) + + """ + torch._C._log_api_usage_once( + "quantization_api.quantize_fx._convert_to_reference_decomposed_fx" + ) + return _convert_fx( + graph_module, + is_reference=True, + convert_custom_config=convert_custom_config, + _remove_qconfig=False, + qconfig_mapping=qconfig_mapping, + backend_config=backend_config, + is_decomposed=True, + ) + + +def _convert_standalone_module_fx( + graph_module: GraphModule, + is_reference: bool = False, + convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None, +) -> GraphModule: + r"""[Internal use only] Convert a model produced by :func:`~torch.ao.quantization.prepare_standalone_module_fx` + and convert it to a quantized model + + Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details + """ + return _convert_fx( + graph_module, + is_reference, + convert_custom_config, + is_standalone_module=True, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_jit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_jit.py new file mode 100644 index 
0000000000000000000000000000000000000000..ec4caab1edcd010a66032cab51cae77ad8e4ed62 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_jit.py @@ -0,0 +1,423 @@ +# mypy: allow-untyped-defs + +import torch +from torch.ao.quantization.qconfig import QConfig +from torch.ao.quantization.quant_type import QuantType +from torch.jit._recursive import wrap_cpp_module + + +__all__ = [ + "script_qconfig", + "script_qconfig_dict", + "fuse_conv_bn_jit", + "prepare_jit", + "prepare_dynamic_jit", + "convert_jit", + "convert_dynamic_jit", + "quantize_jit", + "quantize_dynamic_jit", +] + + +def _check_is_script_module(model): + if not isinstance(model, torch.jit.ScriptModule): + raise ValueError("input must be a script module, got: " + str(type(model))) + + +def _check_forward_method(model): + if not model._c._has_method("forward"): + raise ValueError("input script module does not have forward method") + + +def script_qconfig(qconfig): + r"""Instantiate the activation and weight observer modules and script + them, these observer module instances will be deepcopied during + prepare_jit step. + """ + return QConfig( + activation=torch.jit.script(qconfig.activation())._c, + weight=torch.jit.script(qconfig.weight())._c, + ) + + +def script_qconfig_dict(qconfig_dict): + r"""Helper function used by `prepare_jit`. + Apply `script_qconfig` for all entries in `qconfig_dict` that is + not None. + """ + return {k: script_qconfig(v) if v else None for k, v in qconfig_dict.items()} + + +def fuse_conv_bn_jit(model, inplace=False): + r"""Fuse conv - bn module + Works for eval model only. 
+ + Args: + model: TorchScript model from scripting or tracing + """ + torch._C._log_api_usage_once("quantization_api.quantize_jit.fuse_conv_bn_jit") + model_c = model._c + model_c = torch._C._jit_pass_fold_convbn(model_c) + if inplace: + model._reconstruct(model_c) + else: + model = wrap_cpp_module(model_c) + return model + + +def _prepare_jit(model, qconfig_dict, inplace=False, quant_type=QuantType.STATIC): + _check_is_script_module(model) + _check_forward_method(model) + if not all(isinstance(x, str) for x in qconfig_dict): + raise ValueError("qconfig_dict should only contain names(str) as keys.") + scripted_qconfig_dict = script_qconfig_dict(qconfig_dict) + model = fuse_conv_bn_jit(model, inplace) + model_c = torch._C._jit_pass_insert_observers( + model._c, "forward", scripted_qconfig_dict, inplace, quant_type + ) + if inplace: + model._reconstruct(model_c) + else: + model = wrap_cpp_module(model_c) + return model + + +def _prepare_ondevice_jit( + model, + qconfig_dict, + method_name="forward", + inplace=False, + quant_type=QuantType.STATIC, +): + _check_is_script_module(model) + if not all(isinstance(x, str) for x in qconfig_dict): + raise ValueError("qconfig_dict should only contain names(str) as keys.") + scripted_qconfig_dict = script_qconfig_dict(qconfig_dict) + method_graph = model._c._get_method(method_name).graph + torch._C._jit_pass_inline(method_graph) + model = fuse_conv_bn_jit(model, inplace) + model_c = torch._C._jit_pass_insert_observer_method_for_ondevice_ptq( + model._c, method_name, scripted_qconfig_dict, inplace, quant_type + ) + if inplace: + model._reconstruct(model_c) + else: + model = wrap_cpp_module(model_c) + return model + + +def prepare_jit(model, qconfig_dict, inplace=False): + torch._C._log_api_usage_once("quantization_api.quantize_jit.prepare_jit") + return _prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.STATIC) + + +def prepare_dynamic_jit(model, qconfig_dict, inplace=False): + 
torch._C._log_api_usage_once("quantization_api.quantize_jit.prepare_dynamic_jit") + return _prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.DYNAMIC) + + +def _prepare_ondevice_dynamic_jit( + model, qconfig_dict, method_name="forward", inplace=False +): + return _prepare_ondevice_jit( + model, qconfig_dict, method_name, inplace, quant_type=QuantType.DYNAMIC + ) + + +def _convert_jit( + model, inplace=False, debug=False, quant_type=QuantType.STATIC, preserved_attrs=None +): + _check_is_script_module(model) + model.eval() + model_c = model._c + model_c = torch._C._jit_pass_insert_quant_dequant( + model_c, "forward", inplace, debug, quant_type + ) + if not debug: + is_xpu = all(p.device.type == "xpu" for p in model.parameters()) + if not is_xpu: + # Moving model parameters to CPU since quantized operators + # are only supported on CPU and XPU right now + model.cpu() + if preserved_attrs is None: + preserved_attrs = [] + model_c = torch._C._jit_pass_quant_finalize( + model_c, quant_type, preserved_attrs + ) + if inplace: + model._reconstruct(model_c) + else: + model = wrap_cpp_module(model_c) + torch._C._jit_pass_constant_propagation(model.graph) + torch._C._jit_pass_dce(model.graph) + return model + + +def _convert_ondevice_jit( + model, method_name, inplace=False, debug=False, quant_type=QuantType.STATIC +): + _check_is_script_module(model) + if quant_type != QuantType.DYNAMIC: + raise AssertionError( + "This API, while should work for static quant, is only tested for dynamic quant." + ) + if method_name.startswith("observe_"): + raise AssertionError("Pass in valid method to be quantized, e.g. 
forward") + observe_method_name = "observe_" + method_name + quantize_method_name = "quantize_" + method_name + model_c = model._c + model_c = torch._C._jit_pass_insert_quant_dequant_for_ondevice_ptq( + model._c, observe_method_name, inplace, debug, QuantType.DYNAMIC + ) + model_c = torch._C._jit_pass_quant_finalize_for_ondevice_ptq( + model_c, QuantType.DYNAMIC, quantize_method_name + ) + if inplace: + model._reconstruct(model_c) + else: + model = wrap_cpp_module(model_c) + return model + + +def convert_jit(model, inplace=False, debug=False, preserved_attrs=None): + torch._C._log_api_usage_once("quantization_api.quantize_jit.convert_jit") + return _convert_jit( + model, + inplace, + debug, + quant_type=QuantType.STATIC, + preserved_attrs=preserved_attrs, + ) + + +def convert_dynamic_jit(model, inplace=False, debug=False, preserved_attrs=None): + torch._C._log_api_usage_once("quantization_api.quantize_jit.convert_dynamic_jit") + return _convert_jit( + model, + inplace, + debug, + quant_type=QuantType.DYNAMIC, + preserved_attrs=preserved_attrs, + ) + + +def _convert_ondevice_dynamic_jit(model, method_name, inplace=False, debug=False): + return _convert_ondevice_jit( + model, method_name, inplace, debug, quant_type=QuantType.DYNAMIC + ) + + +def _quantize_ondevice_dynamic_jit_impl( + model, qconfig_dict, method_name, inplace=False +): + model = _prepare_ondevice_dynamic_jit(model, qconfig_dict, method_name, inplace) + model = _convert_ondevice_dynamic_jit(model, method_name, inplace) + return model + + +def _quantize_jit( + model, + qconfig_dict, + run_fn=None, + run_args=None, + inplace=False, + debug=False, + quant_type=QuantType.STATIC, +): + # Always do inplace convert because the Tensor is already + # copied in prepare_jit when inplace is False + if quant_type == QuantType.DYNAMIC: + model = prepare_dynamic_jit(model, qconfig_dict, inplace) + model = convert_dynamic_jit(model, True, debug) + else: + if not run_fn: + raise AssertionError( + "Must provide 
calibration function for post training static quantization" + ) + if not run_args: + raise AssertionError( + "Must provide calibration dataset for post training static quantization" + ) + model = prepare_jit(model, qconfig_dict, inplace) + run_fn(model, *run_args) + model = convert_jit(model, True, debug) + + torch._C._jit_pass_constant_propagation(model.graph) + torch._C._jit_pass_dce(model.graph) + return model + + +def quantize_jit(model, qconfig_dict, run_fn, run_args, inplace=False, debug=False): + r"""Quantize the input float TorchScript model with + post training static quantization. + + First it will prepare the model for calibration, then it calls + `run_fn` which will run the calibration step, after that we will + convert the model to a quantized model. + + Args: + `model`: input float TorchScript model + `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and + qconfig for that module as value, empty key means the qconfig will be applied + to whole model unless it's overwritten by more specific configurations, the + qconfig for each module is either found in the dictionary or fallback to + the qconfig of parent module. + + Right now qconfig_dict is the only way to configure how the model is quantized, + and it is done in the granularity of module, that is, we only support one type + of qconfig for each torch.nn.Module, and the qconfig for sub module will + override the qconfig for parent module, empty string means global configuration. + `run_fn`: a calibration function for calibrating the prepared model + `run_args`: positional arguments for `run_fn` + `inplace`: carry out model transformations in-place, the original module is + mutated + `debug`: flag for producing a debug friendly model (preserve weight attribute) + + Return: + Quantized TorchSciprt model. 
+ + Example: + ```python + import torch + from torch.ao.quantization import get_default_qconfig + from torch.ao.quantization import quantize_jit + + ts_model = torch.jit.script( + float_model.eval() + ) # or torch.jit.trace(float_model, input) + qconfig = get_default_qconfig("fbgemm") + + + def calibrate(model, data_loader): + model.eval() + with torch.no_grad(): + for image, target in data_loader: + model(image) + + + quantized_model = quantize_jit( + ts_model, {"": qconfig}, calibrate, [data_loader_test] + ) + ``` + """ + torch._C._log_api_usage_once("quantization_api.quantize_jit.quantize_jit") + return _quantize_jit( + model, + qconfig_dict, + run_fn, + run_args, + inplace, + debug, + quant_type=QuantType.STATIC, + ) + + +def quantize_dynamic_jit(model, qconfig_dict, inplace=False, debug=False): + r"""Quantize the input float TorchScript model with + post training dynamic quantization. + Currently only qint8 quantization of torch.nn.Linear is supported. + + Args: + `model`: input float TorchScript model + `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and + qconfig for that module as value, please see detailed + descriptions in :func:`~torch.ao.quantization.quantize_jit` + `inplace`: carry out model transformations in-place, the original module is + mutated + `debug`: flag for producing a debug friendly model (preserve weight attribute) + + Return: + Quantized TorchSciprt model. 
+ + Example: + ```python + import torch + from torch.ao.quantization import per_channel_dynamic_qconfig + from torch.ao.quantization import quantize_dynamic_jit + + ts_model = torch.jit.script( + float_model.eval() + ) # or torch.jit.trace(float_model, input) + qconfig = get_default_qconfig("fbgemm") + + + def calibrate(model, data_loader): + model.eval() + with torch.no_grad(): + for image, target in data_loader: + model(image) + + + quantized_model = quantize_dynamic_jit( + ts_model, {"": qconfig}, calibrate, [data_loader_test] + ) + ``` + """ + torch._C._log_api_usage_once("quantization_api.quantize_jit.quantize_dynamic_jit") + return _quantize_jit( + model, qconfig_dict, inplace=inplace, debug=debug, quant_type=QuantType.DYNAMIC + ) + + +def _quantize_ondevice_dynamic_jit( + model, qconfig_dict, method_name="forward", inplace=False +): + r"""Prepares the input float TorchScript model with + *on-device* post training dynamic quantization. + Currently only qint8 quantization of torch.nn.Linear is supported. + + Args: + `model`: input float TorchScript model + `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and + qconfig for that module as value, please see detailed + `method_name`: Name of the method within the model, to be prepared for quantization + descriptions in :func:`~torch.ao.quantization.quantize_jit` + `inplace`: carry out model transformations in-place, the original module is + mutated + + Return: + TorchScript model that is ready for on device quantization. + This means that the returned + model has: + - Method is inlined. + - Model has observer modules inserted in the model. + - Model has packed params inserted in the model. However they are empty as in they dont + contain valid quantized weights. + - observe_ is added that observe the values to be quantized. + - reset_observers_ to reset observers. + - quantize_ is added to the model. + - This method extract scale, zero points. + - Quantizes observed weights. 
+ - Creates packed params from it and update the attribute of the model with the new values + for the packed params. + - Reset the original fp32 weights with empty tensor using SetAttr. + - quantized_ is added to the model. + - This method uses quantized weights and quantized linear ops instead of fp32 op. + - This method should be used for inference post PTQ. + - Note that all method's signatures should be the same as method_name. + + Later on device: + - Run reset_observers_ + - Run observe_ + - Run quantize_ + - Now model can be saved and loaded later. + - Run model with quantized_ + + Example: + ```python + import torch + from torch.ao.quantization import per_channel_dynamic_qconfig + from torch.ao.quantization.quantize_jit import _quantize_ondevice_dynamic_jit + + ts_model = torch.jit.script( + float_model.eval() + ) # or torch.jit.trace(float_model, input) + qconfig = get_default_qconfig("fbgemm") + quant_ready_model = _quantize_ondevice_dynamic_jit( + ts_model, {"": qconfig}, "forward", True + ) + ``` + """ + return _quantize_ondevice_dynamic_jit_impl( + model, qconfig_dict, method_name, inplace=inplace + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_pt2e.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_pt2e.py new file mode 100644 index 0000000000000000000000000000000000000000..169e2905ddbdcc2ec86d92d1b858abe7e91af298 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_pt2e.py @@ -0,0 +1,262 @@ +import typing_extensions + +import torch +from torch._export.passes.constant_folding import constant_fold +from torch.ao.quantization.pt2e.duplicate_dq_pass import DuplicateDQPass +from torch.ao.quantization.pt2e.port_metadata_pass import PortNodeMetaForQDQ +from torch.ao.quantization.quantizer import ( # noqa: F401 + DerivedQuantizationSpec, + FixedQParamsQuantizationSpec, + QuantizationAnnotation, + 
QuantizationSpec, + QuantizationSpecBase, + Quantizer, + SharedQuantizationSpec, +) +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_manager import PassManager + +from .pt2e.prepare import prepare +from .pt2e.qat_utils import _fold_conv_bn_qat, _fuse_conv_bn_qat +from .pt2e.representation import reference_representation_rewrite +from .pt2e.utils import _disallow_eval_train, _fuse_conv_bn_, _get_node_name_to_scope +from .quantize_fx import _convert_to_reference_decomposed_fx +from .utils import DEPRECATION_WARNING + + +__all__ = [ + "prepare_pt2e", + "prepare_qat_pt2e", + "convert_pt2e", +] + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def prepare_pt2e( + model: GraphModule, + quantizer: Quantizer, +) -> GraphModule: + """Prepare a model for post training quantization + + Args: + * `model` (torch.fx.GraphModule): a model captured by `torch.export.export_for_training` API. + * `quantizer`: A backend specific quantizer that conveys how user want the + model to be quantized. Tutorial for how to write a quantizer can be found here: + https://pytorch.org/tutorials/prototype/pt2e_quantizer.html + + Return: + A GraphModule with observer (based on quantizer annotation), ready for calibration + + Example:: + + import torch + from torch.ao.quantization.quantize_pt2e import prepare_pt2e + from torch.ao.quantization.quantizer import ( + XNNPACKQuantizer, + get_symmetric_quantization_config, + ) + + class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x): + return self.linear(x) + + # initialize a floating point model + float_model = M().eval() + + # define calibration function + def calibrate(model, data_loader): + model.eval() + with torch.no_grad(): + for image, target in data_loader: + model(image) + + # Step 1. 
program capture + # NOTE: this API will be updated to torch.export API in the future, but the captured + # result should mostly stay the same + m = torch.export.export_for_training(m, *example_inputs).module() + # we get a model with aten ops + + # Step 2. quantization + # backend developer will write their own Quantizer and expose methods to allow + # users to express how they + # want the model to be quantized + quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config()) + m = prepare_pt2e(m, quantizer) + + # run calibration + # calibrate(m, sample_inference_data) + """ + torch._C._log_api_usage_once("quantization_api.quantize_pt2e.prepare_pt2e") + original_graph_meta = model.meta + node_name_to_scope = _get_node_name_to_scope(model) + # TODO: check qconfig_mapping to make sure conv and bn are both configured + # to be quantized before fusion + # TODO: (maybe) rewrite this with subgraph_rewriter + _fuse_conv_bn_(model) + model = quantizer.transform_for_annotation(model) + quantizer.annotate(model) + quantizer.validate(model) + model = prepare( + model, + node_name_to_scope, + is_qat=False, + obs_or_fq_callback=quantizer.prepare_obs_or_fq_callback, + ) + model.meta.update(original_graph_meta) + model = _disallow_eval_train(model) + return model + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def prepare_qat_pt2e( + model: GraphModule, + quantizer: Quantizer, +) -> GraphModule: + """Prepare a model for quantization aware training + + Args: + * `model` (torch.fx.GraphModule): see :func:`~torch.ao.quantization.quantize_pt2e.prepare_pt2e` + * `quantizer`: see :func:`~torch.ao.quantization.quantize_pt2e.prepare_pt2e` + + Return: + A GraphModule with fake quant modules (based on quantizer annotation), ready for + quantization aware training + + Example:: + import torch + from torch.ao.quantization.quantize_pt2e import prepare_qat_pt2e + from torch.ao.quantization.quantizer import ( + XNNPACKQuantizer, + get_symmetric_quantization_config, + ) 
+ + class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x): + return self.linear(x) + + # initialize a floating point model + float_model = M().eval() + + # define the training loop for quantization aware training + def train_loop(model, train_data): + model.train() + for image, target in data_loader: + ... + + # Step 1. program capture + # NOTE: this API will be updated to torch.export API in the future, but the captured + # result should mostly stay the same + m = torch.export.export_for_training(m, *example_inputs).module() + # we get a model with aten ops + + # Step 2. quantization + # backend developer will write their own Quantizer and expose methods to allow + # users to express how they + # want the model to be quantized + quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config()) + m = prepare_qat_pt2e(m, quantizer) + + # run quantization aware training + train_loop(prepared_model, train_loop) + + """ + torch._C._log_api_usage_once("quantization_api.quantize_pt2e.prepare_qat_pt2e") + original_graph_meta = model.meta + node_name_to_scope = _get_node_name_to_scope(model) + model = quantizer.transform_for_annotation(model) + quantizer.annotate(model) + quantizer.validate(model) + # Perform fusion after annotate to avoid quantizing ops in the new + # subgraph that don't need to be quantized + # TODO: only fuse if conv and bn are both configured to be quantized + _fuse_conv_bn_qat(model) + model = prepare( + model, + node_name_to_scope, + is_qat=True, + obs_or_fq_callback=quantizer.prepare_obs_or_fq_callback, + ) + model.meta.update(original_graph_meta) + model = _disallow_eval_train(model) + return model + + +_QUANT_OPS = [ + torch.ops.quantized_decomposed.quantize_per_tensor.default, + torch.ops.quantized_decomposed.quantize_per_tensor.tensor, + torch.ops.quantized_decomposed.quantize_per_channel.default, + torch.ops.pt2e_quant.quantize_affine, +] + + 
+def _quant_node_constraint(n: Node) -> bool: + """If there is any pure ops between get_attr and quantize op they will be const propagated + e.g. get_attr(weight) -> transpose -> quantize -> dequantize* + (Note: dequantize op is not going to be constant propagated) + + This filter is added because we don't want to constant fold the things that are not + related to quantization + """ + return n.op == "call_function" and n.target in _QUANT_OPS + + +@typing_extensions.deprecated(DEPRECATION_WARNING) +def convert_pt2e( + model: GraphModule, + use_reference_representation: bool = False, + fold_quantize: bool = True, +) -> GraphModule: + """Convert a calibrated/trained model to a quantized model + + Args: + * `model` (torch.fx.GraphModule): calibrated/trained model + * `use_reference_representation` (bool): boolean flag to indicate whether to produce reference representation or not + * `fold_quantize` (bool): boolean flag for whether fold the quantize op or not + + Returns: + quantized model, either in q/dq representation or reference representation + + Example:: + + # prepared_model: the model produced by `prepare_pt2e`/`prepare_qat_pt2e` and calibration/training + # `convert_pt2e` produces a quantized model that represents quantized computation with + # quantize dequantize ops and fp32 ops by default. 
+ # Please refer to + # https://pytorch.org/tutorials/prototype/pt2e_quant_ptq_static.html#convert-the-calibrated-model-to-a-quantized-model + # for detailed explanation of output quantized model + quantized_model = convert_pt2e(prepared_model) + + """ + torch._C._log_api_usage_once("quantization_api.quantize_pt2e.convert_pt2e") + if not isinstance(use_reference_representation, bool): + raise ValueError( + "Unexpected argument type for `use_reference_representation`, " + f"please make sure you intend to pass argument {use_reference_representation} to convert_pt2e" + ) + original_graph_meta = model.meta + model = _convert_to_reference_decomposed_fx(model) + model = _fold_conv_bn_qat(model) + + pm = PassManager([DuplicateDQPass()]) + model = pm(model).graph_module + + pm = PassManager([PortNodeMetaForQDQ()]) + model = pm(model).graph_module + + if fold_quantize: + constant_fold(model, _quant_node_constraint) + + if use_reference_representation: + model = reference_representation_rewrite(model) + + model.meta.update(original_graph_meta) + model = _disallow_eval_train(model) + return model diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5cd5e8696d39781004960f47e6f44d3b1987ff4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/__init__.py @@ -0,0 +1,22 @@ +from .quantizer import ( + DerivedQuantizationSpec, + EdgeOrNode, + FixedQParamsQuantizationSpec, + QuantizationAnnotation, + QuantizationSpec, + QuantizationSpecBase, + Quantizer, + SharedQuantizationSpec, +) + + +__all__ = [ + "EdgeOrNode", + "Quantizer", + "QuantizationSpecBase", + "QuantizationSpec", + "FixedQParamsQuantizationSpec", + "SharedQuantizationSpec", + "DerivedQuantizationSpec", + "QuantizationAnnotation", 
+] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..15404cc560117713bf8c952f594c051b1c13e3a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .quantizer import QuantizationAnnotation, Quantizer + + +if TYPE_CHECKING: + import torch + from torch.fx import Node + +__all__ = [ + "ComposableQuantizer", +] + + +class ComposableQuantizer(Quantizer): + """ + ComposableQuantizer allows users to combine more than one quantizer into a single quantizer. + This allows users to quantize a model with multiple quantizers. E.g., embedding quantization + maybe supported by one quantizer while linear layers and other ops might be supported by another + quantizer. + + ComposableQuantizer is initialized with a list of `Quantizer` instances. + The order of the composition matters since that is the order in which the quantizers will be + applies. 
+ Example: + ``` + embedding_quantizer = EmbeddingQuantizer() + linear_quantizer = MyLinearQuantizer() + xnnpack_quantizer = ( + XNNPackQuantizer() + ) # to handle ops not quantized by previous two quantizers + composed_quantizer = ComposableQuantizer( + [embedding_quantizer, linear_quantizer, xnnpack_quantizer] + ) + prepared_m = prepare_pt2e(model, composed_quantizer) + ``` + """ + + def __init__(self, quantizers: list[Quantizer]): + super().__init__() + self.quantizers = quantizers + self._graph_annotations: dict[Node, QuantizationAnnotation] = {} + + def _record_and_validate_annotations( + self, gm: torch.fx.GraphModule, quantizer: Quantizer + ) -> None: + for n in gm.graph.nodes: + if "quantization_annotation" in n.meta: + # check if the annotation has been changed by + # comparing QuantizationAnnotation object id + if n in self._graph_annotations and ( + id(self._graph_annotations[n]) + != id(n.meta["quantization_annotation"]) + ): + raise RuntimeError( + f"Quantizer {quantizer.__class__.__name__} has changed annotations on node {n}" + ) + else: + self._graph_annotations[n] = n.meta["quantization_annotation"] + else: + if n in self._graph_annotations: + raise RuntimeError( + f"Quantizer {quantizer.__class__.__name__} has removed annotations on node {n}" + ) + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + """just handling global spec for now""" + for quantizer in self.quantizers: + quantizer.annotate(model) + self._record_and_validate_annotations(model, quantizer) + return model + + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + for quantizer in self.quantizers: + model = quantizer.transform_for_annotation(model) + return model + + def validate(self, model: torch.fx.GraphModule) -> None: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..3b8ef1030bfdcdeb88b58179f4f2ea83c895aad2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py @@ -0,0 +1,94 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import copy + +import torch +import torch.nn.functional as F +from torch.ao.quantization.observer import PerChannelMinMaxObserver +from torch.ao.quantization.quantizer.quantizer import ( + QuantizationAnnotation, + QuantizationSpec, + Quantizer, +) +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( + OperatorConfig, + OperatorPatternType, + QuantizationConfig, +) + + +__all__ = [ + "get_embedding_operators_config", + "EmbeddingQuantizer", +] + + +def get_embedding_operators_config() -> OperatorConfig: + weight_quantization_spec = QuantizationSpec( + dtype=torch.uint8, + qscheme=torch.per_channel_affine_float_qparams, + ch_axis=0, + observer_or_fake_quant_ctr=PerChannelMinMaxObserver.with_args(eps=2**-12), + ) + quantization_config = QuantizationConfig(None, None, weight_quantization_spec, None) + ops: list[OperatorPatternType] = [[torch.nn.Embedding]] + ops.append([F.embedding]) + supported_config_and_operators = OperatorConfig( + config=quantization_config, operators=ops + ) + return copy.deepcopy(supported_config_and_operators) + + +class EmbeddingQuantizer(Quantizer): + @classmethod + def get_supported_quantization_configs(cls) -> list[QuantizationConfig]: + op_configs: set[QuantizationConfig] = { + spec for spec, _ in cls.get_supported_operators() + } + return list(op_configs) + + @classmethod + def get_supported_operator_for_quantization_config( + cls, quantization_config: QuantizationConfig + ) -> list[OperatorPatternType]: + for config, ops in cls.get_supported_operators(): + # note: this assumes each 
entry in cls.supported_spec_and_operators + # corresponds to one spec, e.g. we don't have + # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)] + # where the first and second entry have the same spec but did not + # merge the op list + if config == quantization_config: + return ops + return [] + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + """just handling global spec for now""" + self._annotate_embedding_ops(model.graph) + return model + + def _annotate_embedding_ops(self, graph: torch.fx.Graph) -> None: + embedding_config: OperatorConfig = get_embedding_operators_config() + for node in graph.nodes: + # Keep node parsing based annotations instead of module partitioners + # just as an example of alternate ways of annotating + if ( + node.op == "call_function" + and node.target is torch.ops.aten.embedding.default + ): + if embedding_config.config.weight is None: + raise ValueError( + "Embedding config must have a valid weight quantization spec." + ) + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + node.args[0]: embedding_config.config.weight, + } + ) + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + @classmethod + def get_supported_operators(cls) -> list[OperatorConfig]: + return [get_embedding_operators_config()] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d6e10526b4cc4ca58d099523d32ebd57a393a1dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/quantizer.py @@ -0,0 +1,182 @@ +# mypy: allow-untyped-defs +from abc import ABC, abstractmethod +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import Annotated + +import torch 
+from torch import Tensor +from torch.ao.quantization import ObserverOrFakeQuantize +from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor +from torch.fx import Node + + +__all__ = [ + "Quantizer", + "QuantizationSpecBase", + "QuantizationSpec", + "FixedQParamsQuantizationSpec", + "EdgeOrNode", + "SharedQuantizationSpec", + "DerivedQuantizationSpec", + "QuantizationAnnotation", +] + + +class QuantizationSpecBase(ABC): # noqa: B024 + """Base class for different types of quantization specs that allows users to + specify how to quantize a Tensor (input/output of a Node) in the model + """ + + +@dataclass(eq=True, frozen=True) +class QuantizationSpec(QuantizationSpecBase): + """Quantization spec for common operators that allows user to specify how to + quantize a Tensor, this includes dtype, quant_min, quant_max etc. + """ + + dtype: torch.dtype + # observer or fake_quantize constructor such as + # MinMaxObserver, PerChannelHistogramObserver etc. + # or we can attach some custom args to them + # e.g. MinMaxObserver.with_args(eps=eps) + observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor + quant_min: int | None = None + quant_max: int | None = None + qscheme: torch.qscheme | None = None + ch_axis: int | None = None + is_dynamic: bool = False + + def __post_init__(self): + # TODO: add init for quant_min/quant_max + # quant_min must be less than quant_max + if ( + self.quant_min is not None + and self.quant_max is not None + and self.quant_min > self.quant_max + ): + raise ValueError( + f"quant_min {self.quant_min} must be <= quant_max {self.quant_max}." + ) + + # ch_axis must be less than the number of channels + # but no way to check here. Just check that it is not < 0. 
+ if self.ch_axis is not None and self.ch_axis < 0: + raise ValueError("Ch_axis is < 0.") + + +@dataclass(eq=True, frozen=True) +class FixedQParamsQuantizationSpec(QuantizationSpecBase): + dtype: torch.dtype + scale: float + zero_point: int + quant_min: int | None = None + quant_max: int | None = None + qscheme: torch.qscheme | None = None + is_dynamic: bool = False + + +""" +The way we refer to other points of quantization in the graph will be either +an input edge or an output value +input edge is the connection between input node and the node consuming the input, so it's a Tuple[Node, Node] +output value is an fx Node +""" +EdgeOrNode = Annotated[tuple[Node, Node] | Node, None] +EdgeOrNode.__module__ = "torch.ao.quantization.quantizer.quantizer" + + +@dataclass(eq=True, frozen=True) +class SharedQuantizationSpec(QuantizationSpecBase): + """ + Quantization spec for the Tensors whose quantization parameters are shared with other Tensors + """ + + # the edge or node to share observer or fake quant instances with + edge_or_node: EdgeOrNode + + +@dataclass(eq=True, frozen=True) +class DerivedQuantizationSpec(QuantizationSpecBase): + """Quantization spec for the Tensors whose quantization parameters are derived from other Tensors""" + + derived_from: list[EdgeOrNode] + derive_qparams_fn: Callable[[list[ObserverOrFakeQuantize]], tuple[Tensor, Tensor]] + dtype: torch.dtype + quant_min: int | None = None + quant_max: int | None = None + qscheme: torch.qscheme | None = None + ch_axis: int | None = None + is_dynamic: bool = False + + +@dataclass +class QuantizationAnnotation: + """How are input argument or output should be quantized, + expressed as QuantizationSpec, this corresponds to how a Tensor in the + operator Graph is observed (PTQ) or fake quantized (QAT) + """ + + # a map from torch.fx.Node to a type of QuantizationSpecBase + input_qspec_map: dict[Node, QuantizationSpecBase | None] = field( + default_factory=dict + ) + + # How the output of this node is quantized, 
expressed as QuantizationSpec + # TODO: change the value to QuantizationSpec in a separate PR + output_qspec: QuantizationSpecBase | None = None + + # For a Node: node1 and edge: (node1, node2), since they are observing the same + # Tensor, we may want to implicitly share observers, this flag allows people to + # turn off this behavior for the output of the node + allow_implicit_sharing: bool = True + + # whether the node is annotated or not + _annotated: bool = False + + +class Quantizer(ABC): + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + """Allows for user defined transforms to run before annotating the graph. + This allows quantizer to allow quantizing part of the model that are otherwise not quantizable. + For example quantizer can + a) decompose a compound operator like scaled dot product attention, + into bmm and softmax if quantizer knows how to quantize bmm/softmax but not sdpa + or b) transform scalars to tensor to allow quantizing scalares. + + Note: this is an optional method + """ + return model + + # annotate nodes in the graph with observer or fake quant constructors + # to convey the desired way of quantization + @abstractmethod + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + pass + + # validate the annotated graph is supported by the backend + @abstractmethod + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + def prepare_obs_or_fq_callback( + self, + model: torch.fx.GraphModule, + edge_or_node_to_obs_or_fq: dict[EdgeOrNode, ObserverOrFakeQuantize], + ) -> None: + """A callback that will be called after the observers or fake quants are created + for each sharing group, but before they are inserted into the graph. The + callback can be used to make final quantization adjustments, such as enforcing + specific scale and zero point on model input or output. + + Args: + * `model`: the graph module being prepared. 
+ * `edge_or_node_to_obs_or_fq`: a dictionary mapping each annotated edge and + node to the corresponding observer or fake quant object. Note that multiple + edges and/or nodes can map to the same observer / fake quant instance if + they were annotated with SharedQuantizationSpec. This dictionary can be + modified by the callback. + """ + return diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..06463ae0f2f3adb815d34b0f539fb6cde423e1ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/utils.py @@ -0,0 +1,90 @@ +from collections.abc import Callable + +from torch.ao.quantization.pt2e.utils import _is_sym_size_node +from torch.ao.quantization.quantizer.quantizer import ( + QuantizationAnnotation, + QuantizationSpecBase, +) +from torch.fx import Node + + +__all__: list[str] = [] + + +def _annotate_input_qspec_map( + node: Node, input_node: Node, qspec: QuantizationSpecBase | None +) -> None: + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + quantization_annotation.input_qspec_map[input_node] = qspec + node.meta["quantization_annotation"] = quantization_annotation + + +def _annotate_output_qspec(node: Node, qspec: QuantizationSpecBase | None) -> None: + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + quantization_annotation.output_qspec = qspec + node.meta["quantization_annotation"] = quantization_annotation + + +def _node_only_used_for_sym_size(node: Node, partition_nodes: list[Node]) -> bool: + """ + This utility is used to handle cases when dynami_shape=True tracing leads + 
to symint nodes in the pattern of linear module. In those cases, we need to + distinguish between the nodes that are in input for just extracting value of + some dimensions (and symint nodes) vs. the one that is activation. + For example: + graph(x, y, weight): + size_0 = torch.ops.aten.sym_size([x], [0]) + size_1 = torch.ops.aten.sym_size([y], [1]) + view_size = size_0 * size_1 + size_3 = torch.ops.aten.sym_size([x], [2]) + vie_out = torch.ops.aten.view(x, [view_size, size_3]) + return mm(view_out, weight) + In the example above y node is not actual input. It exist only to extract size_1 + """ + if _is_sym_size_node(node): + return True + + return all( + ((user not in partition_nodes) or _is_sym_size_node(user)) + for user in node.users + ) + + +def _get_module_name_filter(module_name: str) -> Callable[[Node], bool]: + """Get the module_name_filter function for a given module name, the filter accepts + a node and checks if the node comes from a module that has certain module name + + For example: + node: linear_op = call_function[...](...) # comes from a module with name blocks.sub.linear1 + + + >> module_name_filter = _get_module_name_filter("blocks.sub") + >> print(module_name_filter(node)) + True # the node is from "blocks.sub" based on the fully qualified name "blocks.sub.linear1" + """ + + def module_name_filter(n: Node) -> bool: + # example: { + # 'L__self___sub': ("L['self'].sub", ), + # 'L__self___sub_linear': ("L['self'].sub.linear", ) + # } + # get_attr nodes doesn't have nn_module_stack? + nn_module_stack = n.meta.get("nn_module_stack", {}) + + def _normalize_path(n: str) -> str: + prefix = 0 + # TODO This is non standard behavior and should be removed when we migrate off capture_pre_autograd_graph. 
+ if n.startswith("L['self']."): + prefix = len("L['self'].") + return n[prefix:] + + names = [_normalize_path(n) for n, _ in nn_module_stack.values()] + return module_name in names + + return module_name_filter diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e9cde0e2d12a6d00abfef6c2564b679286d99262 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py @@ -0,0 +1,1605 @@ +# mypy: allow-untyped-defs +import functools +import itertools +import operator +import warnings +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from typing import Any, Optional, TYPE_CHECKING, TypeAlias + +import torch +import torch.nn.functional as F +from torch.ao.quantization.fake_quantize import ( + FakeQuantize, + FusedMovingAvgObsFakeQuantize, +) +from torch.ao.quantization.observer import ( + HistogramObserver, + MovingAverageMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, + PerChannelMinMaxObserver, + PlaceholderObserver, +) +from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions +from torch.ao.quantization.quantizer.quantizer import ( + QuantizationAnnotation, + QuantizationSpec, + Quantizer, + SharedQuantizationSpec, +) +from torch.ao.quantization.quantizer.utils import _get_module_name_filter +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( + get_bias_qspec, + get_input_act_qspec, + get_output_act_qspec, + get_weight_qspec, + QuantizationConfig, +) +from torch.fx import Node +from torch.fx.passes.utils.source_matcher_utils import ( + get_source_partitions, + SourcePartition, +) + + +FilterFn: TypeAlias = Callable[[list[Node]], bool] + + +if 
TYPE_CHECKING: + from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor + +__all__ = [ + "X86InductorQuantizer", + "get_default_x86_inductor_quantization_config", + "get_x86_inductor_linear_dynamic_fp16_config", +] + + +@dataclass +class _X86InductorQuantizationAnnotation(QuantizationAnnotation): + # _is_output_of_quantized_pattern: + # * Node as output node of a fusion pattern. + # * The fusion pattern supports int8 data type. + # * The fusion pattern has inputs annotated to insert observer. + # * The quantization_config is not `None`. + _is_output_of_quantized_pattern: bool = False + + +# Operators that: +# 1. Operators are optimized to run with int8 when int8 input provided. +# 2. Operators do not support int8 input and produce fp32 output. +int8_in_int8_out_ops: set = { + torch.ops.aten.max_pool2d.default, + torch.ops.aten.cat.default, + torch.ops.aten.avg_pool2d.default, + torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.flatten.using_ints, +} + +# Operators that support the int8 data type for quantization config propagation. +# A superset of int8_in_int8_out_ops incorporating additional operators. +propagation_quantizable_ops = int8_in_int8_out_ops + +# Operators support the int8 data type +# and recipe is configured by default in X86InductorQuantizer. +default_quantizable_ops = propagation_quantizable_ops | { + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.linear.default, +} + +# A superset of default_quantizable_ops includes operators support the int8 data type +# but not enabled by default recipe of X86InductorQuantizer. 
+quantizable_ops = default_quantizable_ops | { + torch.ops.aten.matmul.default, +} + +QUANT_ANNOTATION_KEY = "quantization_annotation" + + +def _skip_annotate(nodes: list[Node], filter_fn: FilterFn | None = None) -> bool: + """Determine whether to skip annotation for a list of nodes.""" + + # 1) Skip annotate if any node is already annotated + if _is_any_annotated(nodes): + return True + + # 2) Proceed annotate if a) a filter function is provided + # and b) the given nodes list passes the filter function check. + if filter_fn and filter_fn(nodes): + return False + + return True + + +def _create_module_name_filter(module_name: str) -> FilterFn: + """Create a filter function for a given module name. + + The filter function takes a list of nodes (as determined by the annotate function) + and return True if *all* nodes come from the specified module name, False otherwise. + + For example: + linear_1: "f32[3, 10]" = torch.ops.aten.linear.default(...) # comes from a module with name `sub.linear1` + relu: "f32[3, 10]" = torch.ops.aten.relu.default(linear_1); # comes from a module with name `sub.relu1` + + >> module_name_filter = _create_module_name_filter_inner("sub") + >> print(module_name_filter([relu, linear_1])) + # True # These two nodes are determined by `_annotate_linear_unary` function and from "sub". + """ + + filter_fn = _get_module_name_filter(module_name) + + def check_all_nodes_from_module(nodes: list[Node]) -> bool: + all_nodes_from_module_name: bool = all(filter_fn(n) for n in nodes) + return all_nodes_from_module_name + + return check_all_nodes_from_module + + +def _create_operator_type_filter( + operator_type: Callable, +) -> FilterFn: + """Create a filter function for a given operator type. + + The filter function takes a list of nodes and returns True if it contains + exactly one node with the specified operator type, False otherwise. + + For example: + linear_1: "f32[3, 10]" = torch.ops.aten.linear.default(...) 
# comes from a module with name `sub.linear1` + relu: "f32[3, 10]" = torch.ops.aten.relu.default(linear_1); # comes from a module with name `sub.relu1` + + >> operator_type_filter = _create_operator_type_filter(torch.ops.aten.linear.default) + >> print(operator_type_filter([relu, linear_1])) + # True # These two nodes are determined by `_annotate_linear_unary` function and the second node is `linear`. + """ + + def operator_type_filter(nodes: list[Node]): + num_nodes_with_operator_type = sum( + node.target == operator_type for node in nodes + ) + if num_nodes_with_operator_type > 1: + raise NotImplementedError( + f"Several nodes within a single pattern are {operator_type}." + ) + return num_nodes_with_operator_type == 1 + + return operator_type_filter + + +def _global_config_filter(nodes: list[Node]) -> bool: + """Filter function for global configuration. + + This filter function takes a list of nodes and returns True if there is exactly one node + in the list that is a default quantizable operation, False otherwise. + """ + num_nodes_in_default_quantizable_ops = sum( + node.target in default_quantizable_ops for node in nodes + ) + if num_nodes_in_default_quantizable_ops > 1: + raise NotImplementedError( + "Several nodes within a single pattern are default quantizable operations." 
+ ) + return num_nodes_in_default_quantizable_ops == 1 + + +def _map_module_function_to_aten_operator_type(): + module_function_to_aten_operator: dict[Callable, torch._ops.OpOverloadPacket] = {} + map_list = ( + ([torch.nn.Conv2d, F.conv1d], torch.ops.aten.conv1d.default), + ([torch.nn.Conv2d, F.conv2d], torch.ops.aten.conv2d.default), + ([torch.nn.Linear, F.linear], torch.ops.aten.linear.default), + ([torch.nn.MaxPool2d, F.max_pool2d], torch.ops.aten.max_pool2d.default), + ( + [ + torch.cat, + ], + torch.ops.aten.cat.default, + ), + ([torch.nn.AvgPool2d, F.avg_pool2d], torch.ops.aten.avg_pool2d.default), + ( + [torch.nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d], + torch.ops.aten.adaptive_avg_pool2d.default, + ), + ( + [ + torch.flatten, + ], + torch.ops.aten.flatten.using_ints, + ), + ( + [ + torch.matmul, + ], + torch.ops.aten.matmul.default, + ), + ) + for map_item in map_list: + module_function_to_aten_operator.update(dict.fromkeys(map_item[0], map_item[1])) # type: ignore[arg-type, call-overload] + return module_function_to_aten_operator + + +def _mark_nodes_as_annotated(nodes: list[Node]): + for node in nodes: + if node is not None: + if QUANT_ANNOTATION_KEY not in node.meta: + node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation() + node.meta[QUANT_ANNOTATION_KEY]._annotated = True + + +def _is_node_annotated(_node): + """ + return True if the node is annotated, otherwise return False + """ + return ( + QUANT_ANNOTATION_KEY in _node.meta + and _node.meta[QUANT_ANNOTATION_KEY]._annotated + ) + + +def _is_any_annotated(nodes: list[Node]): + """ + Given a list of nodes (that represents an operator pattern), + check if any of the node is annotated, return True if any of the node + is annotated, otherwise return False. 
+ """ + return any(_is_node_annotated(node) for node in nodes) + + +def _is_all_annotated(nodes: list[Node]): + """ + Given a list of nodes (that represents an operator pattern), + return True if all of the node is annotated, otherwise return False. + """ + return all(_is_node_annotated(node) for node in nodes) + + +def _is_quantized_op_pt2e(node: torch.fx.Node): + """ + Used for pt2e flow to check if the node is a quantized node: + Case1: the node has been annotated as output node of a fusion pattern. + Case2: the node has been annotated as single quantized node. + """ + if not _is_any_annotated([node]): + # The node has not been annotated, directly return False + return False + quantization_annotation = node.meta.get(QUANT_ANNOTATION_KEY, None) + if not isinstance(quantization_annotation, _X86InductorQuantizationAnnotation): + raise AssertionError( + "quantization_annotation must be an _X86InductorQuantizationAnnotation" + ) + return quantization_annotation._is_output_of_quantized_pattern + + +@functools.lru_cache +def get_default_x86_inductor_quantization_config( + is_qat: bool = False, + is_dynamic: bool = False, + reduce_range: bool = False, +): + """ + reduce_range is False by default. Set it to True on earlier CPUs without VNNI to avoid accuracy issue. 
+ """ + extra_args: dict[str, Any] = {"eps": 2**-12} + if is_qat: + if is_dynamic: + act_observer_or_fake_quant_ctr = FakeQuantize + dynamic_quant_observer = MovingAverageMinMaxObserver.with_args( + averaging_constant=1 + ) + extra_args["observer"] = dynamic_quant_observer + else: + act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize # type: ignore[assignment] + else: + if is_dynamic: + act_observer_or_fake_quant_ctr = PlaceholderObserver # type: ignore[assignment] + else: + act_observer_or_fake_quant_ctr = HistogramObserver # type: ignore[assignment] + + # Copy from x86 default qconfig from torch/ao/quantization/qconfig.py + act_quantization_spec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=127 if reduce_range else 255, + qscheme=torch.per_tensor_affine, + is_dynamic=is_dynamic, + observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + + weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( + FusedMovingAvgObsFakeQuantize if is_qat else PerChannelMinMaxObserver + ) + + if is_qat: + # Only support per channel quant for now + extra_args["observer"] = MovingAveragePerChannelMinMaxObserver # type: ignore[dict-item] + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_channel_symmetric, + ch_axis=0, # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv + is_dynamic=False, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + bias_quantization_spec = None # will use placeholder observer by default + quantization_config = QuantizationConfig( + act_quantization_spec, + act_quantization_spec, + weight_quantization_spec, + bias_quantization_spec, + is_qat, + ) + return quantization_config + + +@functools.lru_cache +def get_x86_inductor_linear_dynamic_fp16_config(): + """ + For linear_dynamic_fp16. The name may be confusing. 
+ The op's behavior is fp32_input * (fp16_weight -> to_fp32) -> fp32_output. + """ + weight_quantization_spec = QuantizationSpec( + dtype=torch.float16, + observer_or_fake_quant_ctr=PlaceholderObserver, + ) + quantization_config = QuantizationConfig( + None, # input_quantization_spec + None, # output_quantization_spec + weight_quantization_spec, + None, # bias_quantization_spec + ) + return quantization_config + + +def _annotate_nodes_not_quantize(nodes: Node | list[Node]) -> None: + """Annotate nodes to exclude them from quantization (their `quantization_config` is `None`).""" + if not isinstance(nodes, list): + nodes = [nodes] + for node in nodes: + node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + _annotated=True + ) + + +def _config_checker(method: Callable) -> Callable: + @functools.wraps(method) + def wrapper( + quantizer: "X86InductorQuantizer", + name: Any, + quantization_config: Optional["QuantizationConfig"], + ) -> "X86InductorQuantizer": + if quantizer._need_skip_config(quantization_config): + warnings.warn( + f"Skip the quantization config for {name}.", + stacklevel=2, + ) + return quantizer + return method(quantizer, name, quantization_config) + + return wrapper + + +@dataclass +class _CurrentQuantizationMode: + r"""Configuration defining the current quantization mode for the quantizer. 
+ + All possible current quantization modes are listed below: + ---------------------------------------------------------------------------------------------------------- + | dynamic_state + qat_state |--------------------------------------------------------------------------------------------- + | None | True | False + ---------------------------------------------------------------------------------------------------------- + None | quantizer does not receive a non-None `quantization_config` | \ | \ + False | quantizer will not do QAT | dynamic | static + True | quantizer will do QAT | QAT + dynamic | QAT + static + """ + + qat_state: bool | None + dynamic_state: bool | None + + +class X86InductorQuantizer(Quantizer): + module_function_to_aten_operator_type = _map_module_function_to_aten_operator_type() + + def __init__(self) -> None: + super().__init__() + self.global_config: QuantizationConfig | None = None + self.operator_type_qconfig: dict[ + torch._ops.OpOverloadPacket, QuantizationConfig | None + ] = {} + self.module_name_qconfig: dict[str, QuantizationConfig | None] = {} + + def _get_current_quantization_mode(self) -> _CurrentQuantizationMode: + """Retrieves the current quantization mode based on all configurations.""" + qat_state = None + dynamic_state = None + + # As we use `_need_skip_config` to skip all invalid configurations, + # we can safely assume that the all existing non-None configurations + # have the same quantization mode. + # pyrefly: ignore [bad-assignment] + for qconfig in ( + list(self.module_name_qconfig.values()) + + list(self.operator_type_qconfig.values()) + + [self.global_config] + ): + if qconfig is not None: + # Query the `is_qat` state + if qat_state is None: + qat_state = qconfig.is_qat + else: + if qat_state != qconfig.is_qat: + raise AssertionError( + f"All non-None quantization configs should have the same `is_qat`," + f"but got {qat_state} and {qconfig.is_qat}." 
+ ) + # Query the `is_dynamic` state + input_activation_spec = qconfig.input_activation + if input_activation_spec is not None: + if dynamic_state is None: + dynamic_state = input_activation_spec.is_dynamic + else: + if dynamic_state != input_activation_spec.is_dynamic: + raise AssertionError( + f"All non-None `input_activation_spec` should have the same `is_dynamic`," + f"but got {dynamic_state} and {input_activation_spec.is_dynamic}." + ) + return _CurrentQuantizationMode( + qat_state=qat_state, dynamic_state=dynamic_state + ) + + def _need_skip_config(self, quantization_config: QuantizationConfig | None) -> bool: + """Check if the provided quantization config is valid for X86InductorQuantizer. + + Mixed static/dynamic configurations or mixed QAT/non-QAT configurations are not supported. + To avoid such a mix, we compare the incoming configuration with current configuration status. + Refer the `_CurrentQuantizationMode` definition for all possible modes. + """ + if quantization_config is None: + return False + + need_skip = False + current_mode = self._get_current_quantization_mode() + if ( + current_mode.qat_state is not None + and current_mode.qat_state != quantization_config.is_qat + ): + warnings.warn( + "Mixed QAT and Non-QAT quantization config is not supported.", + stacklevel=2, + ) + need_skip = True + if current_mode.dynamic_state is not None: + input_activation_spec = quantization_config.input_activation + if ( + input_activation_spec is not None + and current_mode.dynamic_state != input_activation_spec.is_dynamic + ): + warnings.warn( + "Mixed dynamic and static quantization config is not supported.", + stacklevel=2, + ) + need_skip = True + return need_skip + + def set_global(self, quantization_config: QuantizationConfig): + if self._need_skip_config(quantization_config): + warnings.warn("Skip the global quantization config.", stacklevel=2) + return self + self.global_config = quantization_config + return self + + def 
get_global_quantization_config(self): + if not isinstance(self.global_config, QuantizationConfig): + warnings.warn( + "The global_config for X86InductorQuantizer is currently invalid. \ + Please ensure that you use set_global to establish the global quantization configuration.", + stacklevel=2, + ) + return self.global_config + + @_config_checker + def set_function_type_qconfig( + self, + function_type: Callable, + quantization_config: QuantizationConfig | None, + ) -> "X86InductorQuantizer": + if function_type in X86InductorQuantizer.module_function_to_aten_operator_type: + self._set_aten_operator_qconfig( + X86InductorQuantizer.module_function_to_aten_operator_type[ + function_type + ], + quantization_config, + ) + else: + warnings.warn( + f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer.", + stacklevel=2, + ) + return self + + @_config_checker + def set_module_type_qconfig( + self, + module_type: torch.nn.Module, + quantization_config: QuantizationConfig | None, + ) -> "X86InductorQuantizer": + if module_type in X86InductorQuantizer.module_function_to_aten_operator_type: + self._set_aten_operator_qconfig( + X86InductorQuantizer.module_function_to_aten_operator_type[module_type], + quantization_config, + ) + else: + warnings.warn( + f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer.", + stacklevel=2, + ) + return self + + @_config_checker + def set_module_name_qconfig( + self, module_name: str, quantization_config: QuantizationConfig | None + ): + """Set quantization_config for a submodule with name: `module_name`, for example: + quantizer.set_module_name_qconfig("blocks.sub"), it will quantize all supported operator/operator + patterns in the submodule with this module name with the given `quantization_config` + + The supported operators include `quantizable_ops` and `propagation_quantizable_ops`. 
+ """ + self.module_name_qconfig[module_name] = quantization_config + return self + + def _set_aten_operator_qconfig( + self, + operator_type: torch._ops.OpOverloadPacket, + quantization_config: QuantizationConfig | None, + ) -> "X86InductorQuantizer": + if operator_type in quantizable_ops: + self.operator_type_qconfig[operator_type] = quantization_config + else: + warnings.warn( + f"operator: Unable to quantize {operator} by X86InductorQuantizer.", + stacklevel=2, + ) + return self + + def _annotate_conv_node_helper( + self, + conv_node: torch.fx.Node, + annotate_output: bool, + quantization_config: QuantizationConfig | None, + ) -> None: + """Helper function to annotate the conv node""" + if quantization_config is None: + _annotate_nodes_not_quantize(conv_node) + return + input_qspec_map = {} + input_node = conv_node.args[0] + if not isinstance(input_node, Node): + raise AssertionError("input_node must be a FX Node") + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + weight_node = conv_node.args[1] + if not isinstance(weight_node, Node): + raise AssertionError("weight_node must be a FX Node") + input_qspec_map[weight_node] = get_weight_qspec(quantization_config) + bias_node = None if len(conv_node.args) == 2 else conv_node.args[2] + if isinstance(bias_node, Node): + input_qspec_map[bias_node] = get_bias_qspec(quantization_config) + if annotate_output: + conv_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + else: + conv_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + + def _annotate_linear_node_helper( + self, + linear_node: torch.fx.Node, + annotate_output: bool, + quantization_config: QuantizationConfig | None, + ) -> None: + """Helper function to annotate the linear node""" + if quantization_config is None: + 
_annotate_nodes_not_quantize(linear_node) + return + input_qspec_map = {} + if linear_node.target is not torch.ops.aten.linear.default: + raise AssertionError( + "linear_node.target must be torch.ops.aten.linear.default" + ) + has_bias = len(linear_node.args) == 3 + input_index = 0 + weight_index = 1 + bias_index = 2 + + input_node = linear_node.args[input_index] + if not isinstance(input_node, Node): + raise AssertionError("input_node must be a FX Node") + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + + weight_node = linear_node.args[weight_index] + if not isinstance(weight_node, Node): + raise AssertionError("weight_node must be a FX Node") + input_qspec_map[weight_node] = get_weight_qspec(quantization_config) + + bias_node = linear_node.args[bias_index] if has_bias else None + if isinstance(bias_node, Node): + input_qspec_map[bias_node] = get_bias_qspec(quantization_config) + + if annotate_output: + linear_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + else: + linear_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, _annotated=True + ) + + def _get_output_nodes_of_partitions( + self, + partition_list: list[SourcePartition], + ) -> list[torch.fx.Node]: + """Helper function to get the output node list from partition list""" + output_node_list = [] + for partition in partition_list: + if len(partition.output_nodes) > 1: + raise ValueError("Input partition has more than one output node") + output_node = partition.output_nodes[0] + if not isinstance(output_node, Node): + raise AssertionError("output_node must be a FX Node") + output_node_list.append(output_node) + if len(output_node_list) != len(partition_list): + raise ValueError( + "length of output_node_list should equal to length of partition_list" + ) + return output_node_list + + def 
_get_input_idx_for_binary_node( + self, + conv_gemm_node: torch.fx.Node, + binary_node: torch.fx.Node, + ): + """Helper function to check conv_gemm and extra input node index + for binary node fused with conv_gemm. + """ + conv_gemm_node_idx = None + extra_input_node_idx = None + if (binary_node.args[0].op == "call_function") and ( # type: ignore[union-attr] + binary_node.args[0] == conv_gemm_node + ): + conv_gemm_node_idx = 0 + extra_input_node_idx = 1 + elif (binary_node.args[1].op == "call_function") and ( # type: ignore[union-attr] + binary_node.args[1] == conv_gemm_node + ): + conv_gemm_node_idx = 1 + extra_input_node_idx = 0 + extra_input_node = binary_node.args[extra_input_node_idx] # type: ignore[index] + if not isinstance(extra_input_node, Node): + raise AssertionError("extra_input_node must be a FX Node") + return conv_gemm_node_idx, extra_input_node_idx + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + """Annotate the given model with quantization configurations. + + Annotation contracts: + 1. Annotate each node according to the user's qconfig in the following order: + `module_name_qconfig`, `operator_type_qconfig`, and `global_config`. + 2. Avoid re-annotating nodes already annotated in prior stages. For example, + if `linear1` has been annotated by `module_name_qconfig`, it won't be annotated again + during the processing of the 'operator_type_qconfig' or 'global_config'. + 3. For config is `None`, the node will be annotated with `_X86InductorQuantizationAnnotation(_annotated=True)`. + + For each pair of (module_name_or_operator_type_or_global, qconfig), a filter function is created. + This filter function checks if the node is marked by current stage and not annotated by the previous stage. 
+ """ + for module_name, quantization_config in self.module_name_qconfig.items(): + self._annotate_with_config( + model, quantization_config, _create_module_name_filter(module_name) + ) + + for operator_type, quantization_config in self.operator_type_qconfig.items(): + self._annotate_with_config( + model, quantization_config, _create_operator_type_filter(operator_type) + ) + + if self.global_config: + self._annotate_with_config( + model, + self.global_config, + _global_config_filter, + ) + + # Once we've annotated the model with quantization configurations, we also need to annotate + # the output of quantizable operations. For example, if we annotated `maxpool2d` to quantize its inputs, + # we will quantize its output accordingly. This enables us to fuse the dq-operator-q into a quantized op. + # Refer to + # https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487 # noqa: B950 + + self._annotate_output_for_int8_in_int8_out_pattern_entry(model) + + return model + + def _annotate_with_config( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn, + ) -> None: + """Annotate the model with the given quantization configuration. + + High-level description of quantization recipe for X86 Inductor Backend: + Step 1: Apply quantization recipe for fusion patterns of conv/linear to enable int8 data type actively. + Step 2: Propagate quantization annotation for patterns besides conv/linear. Go through the pattern in model + from start to the end. If a pattern supports computation with int8 data type and inputs connected to + quantized patterns, annotate its inputs as quantized pattern. + """ + + # Step1: Recipe of fusion patterns like conv/linear. 
+ self._annotate_conv2d_fusion_pattern(model, quantization_config, filter_fn) + self._annotate_linear_fusion_pattern(model, quantization_config, filter_fn) + self._annotate_matmul(model, quantization_config, filter_fn) + + # Step2: Recipe to propagate annotation for patterns beside conv/linear. + # Go through all the nodes from start to end. + # Recipe refer to + # https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538 # noqa: B950 + + self._annotate_propagation_quantizable_pattern_entry( + model, quantization_config, filter_fn + ) + + def _annotate_qat_conv2d_fusion_pattern( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ): + # Annotate QAT Specific patterns + self._annotate_qat_conv2d_bn_binary_unary(model, quantization_config, filter_fn) + self._annotate_qat_conv2d_bn_binary(model, quantization_config, filter_fn) + self._annotate_qat_conv2d_bn_unary(model, quantization_config, filter_fn) + self._annotate_qat_conv2d_bn(model, quantization_config, filter_fn) + + def _annotate_qat_conv2d_bn_binary_unary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + fused_partitions = find_sequential_partitions( + gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d, operator.add, torch.nn.ReLU] + ) + for fused_partition in fused_partitions: + ( + conv_partition, + bn_partition, + binary_partition, + unary_partition, + ) = fused_partition + + ( + conv_node, + bn_output_node, + binary_node, + unary_node, + ) = self._get_output_nodes_of_partitions( + [conv_partition, bn_partition, binary_partition, unary_partition] + ) + if len(bn_output_node.users) != 1: + # Conv BN pattern should only has 1 user. 
+ continue + ( + bn_output_node_idx, + extra_input_node_idx, + ) = self._get_input_idx_for_binary_node(bn_output_node, binary_node) + if (bn_output_node_idx is None) or (extra_input_node_idx is None): + continue + if bn_output_node != binary_node.args[bn_output_node_idx]: + raise ValueError(f"{bn_output_node} doesn't match input of binary node") + extra_input_node = binary_node.args[extra_input_node_idx] + + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + continue + + if _skip_annotate( + [unary_node, binary_node, bn_output_node, conv_node], filter_fn + ): + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + + if quantization_config is not None: + binary_node_input_qspec_map = {} + binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec( + quantization_config + ) + binary_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + # pyrefly: ignore [bad-argument-type] + input_qspec_map=binary_node_input_qspec_map, + _annotated=True, + ) + ) + unary_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + # TODO Remove the annotate of output in QAT when qat util support pattern matcher. 
+ output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + ) + else: + _annotate_nodes_not_quantize([binary_node, unary_node]) + nodes_to_mark_annotated = list(conv_partition.nodes) + nodes_to_mark_annotated.extend(list(bn_partition.nodes)) + nodes_to_mark_annotated.extend(list(binary_partition.nodes)) + nodes_to_mark_annotated.extend(list(unary_partition.nodes)) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + def _annotate_qat_conv2d_bn_binary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + fused_partitions = find_sequential_partitions( + gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d, operator.add] + ) + for fused_partition in fused_partitions: + conv_partition, bn_partition, binary_partition = fused_partition + ( + conv_node, + bn_output_node, + binary_node, + ) = self._get_output_nodes_of_partitions( + [conv_partition, bn_partition, binary_partition] + ) + if len(bn_output_node.users) != 1: + # Conv BN pattern should only has 1 user. 
+ continue + ( + bn_output_node_idx, + extra_input_node_idx, + ) = self._get_input_idx_for_binary_node(bn_output_node, binary_node) + if (bn_output_node_idx is None) or (extra_input_node_idx is None): + continue + if bn_output_node != binary_node.args[bn_output_node_idx]: + raise ValueError(f"{bn_output_node} doesn't match input of binary node") + + extra_input_node = binary_node.args[extra_input_node_idx] + + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + continue + + if _skip_annotate([binary_node, bn_output_node, conv_node], filter_fn): + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + + if quantization_config is not None: + binary_node_input_qspec_map = {} + binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec( + quantization_config + ) + binary_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + # pyrefly: ignore [bad-argument-type] + input_qspec_map=binary_node_input_qspec_map, + # TODO Remove the annotate of output in QAT when qat util support pattern matcher. 
+ output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + ) + else: + _annotate_nodes_not_quantize(binary_node) + nodes_to_mark_annotated = list(conv_partition.nodes) + nodes_to_mark_annotated.extend(list(bn_partition.nodes)) + nodes_to_mark_annotated.extend(list(binary_partition.nodes)) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + def _annotate_qat_conv2d_bn_unary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + fused_partitions = [] + unary_patterns = [ + [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU], + [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.Hardtanh], + [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.Hardswish], + [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU6], + [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.SiLU], + ] + for unary_pattern in unary_patterns: + partitions = find_sequential_partitions(gm, unary_pattern) + if partitions: + # Extend the fused_partitions if partitions is not empty + fused_partitions.extend(partitions) + + for fused_partition in fused_partitions: + conv_partition, bn_partition, unary_partition = fused_partition + ( + conv_node, + bn_output_node, + unary_node, + ) = self._get_output_nodes_of_partitions( + [conv_partition, bn_partition, unary_partition] + ) + + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + continue + + if _skip_annotate([unary_node, bn_output_node, conv_node], filter_fn): + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + if quantization_config is not None: + unary_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + # TODO Remove the annotate of output in QAT when qat util support pattern matcher. 
+ output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + ) + else: + _annotate_nodes_not_quantize(unary_node) + nodes_to_mark_annotated = list(conv_partition.nodes) + nodes_to_mark_annotated.extend(list(bn_partition.nodes)) + nodes_to_mark_annotated.extend(list(unary_partition.nodes)) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + def _annotate_qat_conv2d_bn( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + fused_partitions = find_sequential_partitions( + gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d] + ) + for fused_partition in fused_partitions: + conv_partition, bn_partition = fused_partition + conv_node, bn_output_node = self._get_output_nodes_of_partitions( + [conv_partition, bn_partition] + ) + + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + continue + + if _skip_annotate([bn_output_node, conv_node], filter_fn): + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + if quantization_config is not None: + bn_output_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + # TODO Remove the annotate of output in QAT when qat util support pattern matcher. 
+ output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + ) + else: + _annotate_nodes_not_quantize(bn_output_node) + nodes_to_mark_annotated = list(conv_partition.nodes) + nodes_to_mark_annotated.extend(list(bn_partition.nodes)) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + def _annotate_conv2d_fusion_pattern( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ): + if (quantization_config is None) or (quantization_config.is_qat): + # Annotate QAT specific pattern: mainly due to BN not folded in prepare_qat + self._annotate_qat_conv2d_fusion_pattern( + model, quantization_config, filter_fn + ) + self._annotate_conv2d_binary_unary(model, quantization_config, filter_fn) + self._annotate_conv2d_binary(model, quantization_config, filter_fn) + self._annotate_conv2d_unary(model, quantization_config, filter_fn) + self._annotate_conv2d(model, quantization_config, filter_fn) + + def _annotate_linear_fusion_pattern( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ): + self._annotate_linear_binary_unary(model, quantization_config, filter_fn) + self._annotate_linear_unary(model, quantization_config, filter_fn) + self._annotate_linear(model, quantization_config, filter_fn) + + def _annotate_matmul( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ): + for node in model.graph.nodes: + if node.target != torch.ops.aten.matmul.default: + continue + if _skip_annotate([node], filter_fn): + continue + + if quantization_config is None: + _annotate_nodes_not_quantize(node) + continue + + input_qspec_map = {} + matmul_node = node + for input_node in matmul_node.args: + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + 
matmul_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_conv2d_binary_unary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + # Conv2d + add + unary op + fused_partitions = find_sequential_partitions( + gm, [torch.nn.Conv2d, operator.add, torch.nn.ReLU] + ) + for fused_partition in fused_partitions: + conv_partition, binary_partition, unary_partition = fused_partition + conv_node, binary_node, unary_node = self._get_output_nodes_of_partitions( + [conv_partition, binary_partition, unary_partition] + ) + if len(conv_node.users) != 1: + # Conv Node should only has 1 user node + continue + conv_node_idx, extra_input_node_idx = self._get_input_idx_for_binary_node( + conv_node, binary_node + ) + if (conv_node_idx is None) or (extra_input_node_idx is None): + continue + if conv_node != binary_node.args[conv_node_idx]: + raise ValueError(f"{conv_node} doesn't match input of binary node") + extra_input_node = binary_node.args[extra_input_node_idx] + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + # No conv node found to be fused with add + continue + if _skip_annotate([unary_node, binary_node, conv_node], filter_fn): + continue + + if quantization_config is None: + _annotate_nodes_not_quantize([conv_node, binary_node, unary_node]) + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + binary_node_input_qspec_map = {} + binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec( + quantization_config + ) + binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + # pyrefly: ignore [bad-argument-type] + input_qspec_map=binary_node_input_qspec_map, + _annotated=True, + ) + unary_node.meta[QUANT_ANNOTATION_KEY] = 
_X86InductorQuantizationAnnotation( + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_conv2d_binary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + # Conv2d + add + fused_partitions = find_sequential_partitions( + gm, [torch.nn.Conv2d, operator.add] + ) + for fused_partition in fused_partitions: + conv_partition, binary_partition = fused_partition + conv_node, binary_node = self._get_output_nodes_of_partitions( + [conv_partition, binary_partition] + ) + if len(conv_node.users) != 1: + # Conv Node should only has 1 user node + continue + conv_node_idx, extra_input_node_idx = self._get_input_idx_for_binary_node( + conv_node, binary_node + ) + if (conv_node_idx is None) or (extra_input_node_idx is None): + continue + if conv_node != binary_node.args[conv_node_idx]: + raise ValueError(f"{conv_node} doesn't match input of binary node") + extra_input_node = binary_node.args[extra_input_node_idx] + if not isinstance(conv_node, Node): + raise AssertionError("conv_node must be a FX Node") + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + # No conv node found to be fused with add + continue + if _skip_annotate([binary_node, conv_node], filter_fn): + continue + + if quantization_config is None: + _annotate_nodes_not_quantize([conv_node, binary_node]) + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + binary_node_input_qspec_map = {} + binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec( + quantization_config + ) + binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + # pyrefly: ignore [bad-argument-type] + input_qspec_map=binary_node_input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_conv2d_unary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, 
+ filter_fn: FilterFn | None = None, + ) -> None: + fused_partitions = [] + unary_patterns = [ + [torch.nn.Conv2d, torch.nn.ReLU], + [torch.nn.Conv2d, torch.nn.Hardtanh], + [torch.nn.Conv2d, torch.nn.Hardswish], + [torch.nn.Conv2d, torch.nn.ReLU6], + [torch.nn.Conv2d, torch.nn.SiLU], + [torch.nn.Conv1d, torch.nn.ReLU], + ] + for unary_pattern in unary_patterns: + partitions = find_sequential_partitions(gm, unary_pattern) + if partitions: + # Extend the fused_partitions if partitions is not empty + fused_partitions.extend(partitions) + + for fused_partition in fused_partitions: + conv_partition, unary_partition = fused_partition + conv_node, unary_node = self._get_output_nodes_of_partitions( + [conv_partition, unary_partition] + ) + if conv_node.op != "call_function" or conv_node.target not in ( + torch.ops.aten.conv2d.default, + torch.ops.aten.conv1d.default, + ): + continue + if _skip_annotate([unary_node, conv_node], filter_fn): + continue + + if quantization_config is None: + _annotate_nodes_not_quantize([conv_node, unary_node]) + continue + + self._annotate_conv_node_helper(conv_node, False, quantization_config) + unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_conv2d( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + conv_partitions = get_source_partitions( + gm.graph, [torch.nn.Conv2d, torch.nn.functional.conv2d] + ) + conv_partitions = list(itertools.chain.from_iterable(conv_partitions.values())) + for conv_partition in conv_partitions: + if len(conv_partition.output_nodes) > 1: + raise ValueError("conv partition has more than one output node") + conv_node = conv_partition.output_nodes[0] + if ( + conv_node.op != "call_function" + or conv_node.target != torch.ops.aten.conv2d.default + ): + raise ValueError(f"{conv_node} is not an aten conv2d operator") + # skip 
annotation if it is already annotated + if _skip_annotate([conv_node], filter_fn): + continue + self._annotate_conv_node_helper(conv_node, True, quantization_config) + + def _annotate_maxpool2d( + self, + node: Node, + quantization_config: QuantizationConfig | None, + ) -> None: + if node.target is not torch.ops.aten.max_pool2d.default: + return + if quantization_config is None: + _annotate_nodes_not_quantize(node) + return + + maxpool_node = node + if _is_any_annotated( + [ + maxpool_node, + ] + ): + return + + input_node = maxpool_node.args[0] + if not isinstance(input_node, Node): + raise AssertionError("input_node must be a FX Node") + input_qspec_map = {} + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + maxpool_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_cat( + self, node: Node, quantization_config: QuantizationConfig + ) -> None: + if quantization_config is None: + _annotate_nodes_not_quantize(node) + return + cat_node = node + input_nodes = cat_node.args[0] + if not isinstance(input_nodes, Sequence): + raise AssertionError("input_nodes must be a Sequence of FX Nodes") + first_input_node = input_nodes[0] + input_qspec_map = {} + if not isinstance(first_input_node, Node): + raise AssertionError("first_input_node must be a FX Node") + if not isinstance(cat_node, Node): + raise AssertionError("cat_node must be a FX Node") + input_qspec_map[first_input_node] = get_input_act_qspec(quantization_config) + share_qparams_with_input_act0_qspec = SharedQuantizationSpec( + (first_input_node, cat_node) + ) + + for input_node in input_nodes[1:]: + if input_node not in input_qspec_map: + # There has the case of cat same nodes: torch.cat([input0, input0], 1) + if not isinstance(input_node, Node): + raise AssertionError("input_node must be a FX Node") + input_qspec_map[input_node] = 
share_qparams_with_input_act0_qspec + + cat_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_propagation_quantizable_pattern_entry( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ): + for node in gm.graph.nodes: + self._annotate_propagation_quantizable_pattern( + node, quantization_config, filter_fn + ) + + def _annotate_propagation_quantizable_pattern( + self, node: Node, quantization_config, filter_fn + ) -> None: + # Propagate annotation to quantizable patterns. + if ( + (node.target in propagation_quantizable_ops) + and (not _is_any_annotated([node])) + and (node.op == "call_function") + ): + + def is_all_inputs_connected_to_quantized_op(input_nodes): + # Ensure all the inputs connect to fusion pattern or quantized node + for input_node in input_nodes: + if not _is_quantized_op_pt2e(input_node): + return False + return True + + if _skip_annotate([node], filter_fn): + return + + if quantization_config is None: + _annotate_nodes_not_quantize(node) + return + + if node.target is torch.ops.aten.max_pool2d.default: + # Recipe of maxpool2d: check input arg[0] of maxpool2d is quantized or not + input_nodes_to_check = [node.all_input_nodes[0]] + if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check): + if quantization_config is not None: + warnings.warn( + f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}.", + stacklevel=2, + ) + return + + self._annotate_maxpool2d(node, quantization_config) + return + elif node.target is torch.ops.aten.cat.default: + input_nodes_to_check = node.all_input_nodes + if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check): + return + self._annotate_cat(node, quantization_config) + elif ( + node.target is torch.ops.aten.flatten.using_ints + and 
len(node.users) > 0 + and not any(user.target in quantizable_ops for user in node.users) + ): + # Recipe of flatten: check if any users of flatten node are quantizable ops or not + return + else: + input_node = node.all_input_nodes[0] + if not is_all_inputs_connected_to_quantized_op( + [ + input_node, + ] + ): + return + input_qspec_map = {} + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + return + + def _annotate_output_share_observer_as_input( + self, input_node: Node, source_node: Node + ): + source_node_quantization_annotation = source_node.meta.get(QUANT_ANNOTATION_KEY) + if ( + source_node_quantization_annotation + and source_node_quantization_annotation._is_output_of_quantized_pattern + ): + edge_or_node = (input_node, source_node) + source_node_quantization_annotation.output_qspec = SharedQuantizationSpec( + edge_or_node + ) + return + + def _annotate_output_for_int8_in_int8_out_pattern_entry( + self, + model: torch.fx.GraphModule, + ): + for node in model.graph.nodes: + self._annotate_output_for_int8_in_int8_out_pattern(node) + + def _annotate_output_for_int8_in_int8_out_pattern( + self, + node: Node, + ) -> None: + r""" + Check and insert observer at output of node in int8_in_int8_out_ops if needed. 
+ Recipe refers to + https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495 + """ # noqa: B950 + edge_or_node: tuple[Node, Node] + if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])): + if node.target is torch.ops.aten.max_pool2d.default: + maxpool_node = node + if not _is_all_annotated( + [ + maxpool_node, + ] + ): + return + + # Get the quantization_annotation from getitem_node + maxpool_node_quantization_annotation = maxpool_node.meta.get( + QUANT_ANNOTATION_KEY + ) + if ( + maxpool_node_quantization_annotation + and maxpool_node_quantization_annotation._is_output_of_quantized_pattern + ): + # Annotate the output_qspec of getitem_node + input_act = maxpool_node.args[0] + if not isinstance(input_act, Node): + raise AssertionError("input_act must be a FX Node") + if not isinstance(maxpool_node, Node): + raise AssertionError("maxpool_node must be a FX Node") + edge_or_node = (input_act, maxpool_node) + maxpool_node_quantization_annotation.output_qspec = ( + SharedQuantizationSpec(edge_or_node) + ) + else: + input_node = node.all_input_nodes[0] + self._annotate_output_share_observer_as_input(input_node, node) + return + + def _annotate_linear( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + linear_partitions = get_source_partitions( + gm.graph, [torch.nn.Linear, torch.nn.functional.linear] + ) + linear_partitions = list( + itertools.chain.from_iterable(linear_partitions.values()) + ) + for partition in linear_partitions: + if len(partition.output_nodes) > 1: + raise ValueError( + "Linear partition cannot have more than one output node" + ) + linear_node = partition.output_nodes[0] + if ( + linear_node.op != "call_function" + or linear_node.target != torch.ops.aten.linear.default + ): + raise ValueError(f"{linear_node} is not an aten linear operator") + # skip 
annotation if it is already annotated + if _skip_annotate([linear_node], filter_fn): + continue + self._annotate_linear_node_helper(linear_node, True, quantization_config) + + def _annotate_linear_unary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + postop_list = [ + torch.nn.ReLU, + torch.nn.LeakyReLU, + torch.nn.Tanh, + torch.nn.GELU, + ] + fused_partitions: list[tuple] = [] + for postop in postop_list: + fused_partitions = fused_partitions + find_sequential_partitions( + gm, [torch.nn.Linear, postop] + ) + for fused_partition in fused_partitions: + linear_partition, unary_partition = fused_partition + linear_node, unary_node = self._get_output_nodes_of_partitions( + [linear_partition, unary_partition] + ) + if ( + linear_node.op != "call_function" + or linear_node.target != torch.ops.aten.linear.default + ): + continue + if _skip_annotate([unary_node, linear_node], filter_fn): + continue + + if quantization_config is None: + _annotate_nodes_not_quantize([linear_node, unary_node]) + continue + + self._annotate_linear_node_helper(linear_node, False, quantization_config) + unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation( + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + + def _annotate_linear_binary_unary( + self, + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ) -> None: + # linear + binary_op + (optional) unary op + binary_op_list = [operator.add] + unary_op_list = [torch.nn.ReLU, None] + combinations = itertools.product(binary_op_list, unary_op_list) + for binary_op, unary_op in combinations: + has_unary = unary_op is not None + seq_partition = [torch.nn.Linear, binary_op] + if has_unary: + # pyrefly: ignore [bad-argument-type] + seq_partition.append(unary_op) + fused_partitions = find_sequential_partitions(gm, seq_partition) + for fused_partition in 
fused_partitions: + unary_partition, unary_node = None, None + if has_unary: + ( + linear_partition, + binary_partition, + unary_partition, + ) = fused_partition + ( + linear_node, + binary_node, + unary_node, + ) = self._get_output_nodes_of_partitions( + [linear_partition, binary_partition, unary_partition] + ) + else: + linear_partition, binary_partition = fused_partition + linear_node, binary_node = self._get_output_nodes_of_partitions( + [linear_partition, binary_partition] + ) + if len(linear_node.users) != 1: + # Linear Node should only has 1 user node + continue + ( + linear_node_idx, + extra_input_node_idx, + ) = self._get_input_idx_for_binary_node(linear_node, binary_node) + if (linear_node_idx is None) or (extra_input_node_idx is None): + continue + if linear_node != binary_node.args[linear_node_idx]: + raise ValueError( + f"{linear_node} doesn't match input of binary node" + ) + if not isinstance(linear_node, Node): + raise AssertionError("linear_node must be a FX Node") + if ( + linear_node.op != "call_function" + or linear_node.target != torch.ops.aten.linear.default + ): + # No linear node found to be fused with add + continue + node_list = ( + [binary_node, linear_node] + if unary_node is None + else [unary_node, binary_node, linear_node] + ) + if _skip_annotate(node_list, filter_fn): + continue + + if quantization_config is None: + _annotate_nodes_not_quantize(node_list) + continue + + self._annotate_linear_node_helper( + linear_node, False, quantization_config + ) + # We don't insert q-dq before the binary input node due to accuracy issues + binary_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + input_qspec_map={}, + _annotated=True, + _is_output_of_quantized_pattern=(not has_unary), + ) + ) + if unary_node is not None: + unary_node.meta[QUANT_ANNOTATION_KEY] = ( + _X86InductorQuantizationAnnotation( + _annotated=True, + _is_output_of_quantized_pattern=True, + ) + ) + + def validate(self, model: torch.fx.GraphModule) -> 
None: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d3a2234fdff3f137170d2810ef82fe8b7c706c0c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py @@ -0,0 +1,451 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import copy +import functools +import typing_extensions +from typing import Any, TYPE_CHECKING + +import torch +import torch._dynamo as torchdynamo +import torch.nn.functional as F +from torch.ao.quantization.fake_quantize import ( + FakeQuantize, + FusedMovingAvgObsFakeQuantize, +) +from torch.ao.quantization.observer import ( + HistogramObserver, + MinMaxObserver, + MovingAverageMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, + PerChannelMinMaxObserver, + PlaceholderObserver, +) +from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer +from torch.ao.quantization.quantizer.utils import _get_module_name_filter +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( + _convert_scalars_to_attrs, + OP_TO_ANNOTATOR, + OperatorConfig, + OperatorPatternType, + propagate_annotation, + QuantizationConfig, +) +from torch.fx._compatibility import compatibility + + +if TYPE_CHECKING: + from collections.abc import Callable + + from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor + from torch.fx import Node + + +__all__ = [ + "XNNPACKQuantizer", + "get_symmetric_quantization_config", +] + + +def _get_dynamo_graph(function: Callable, inputs) -> torch.fx.Graph: + gm, _ = torchdynamo.export(function, aten_graph=True)(*inputs) + gm.graph.eliminate_dead_code() + return gm.graph + + +def _get_linear_patterns(input_size: list[int]): + in_channels = 
input_size[-1] + out_channels = 8 # hard coding but this should not matter + weight = torch.ones((out_channels, in_channels)) + bias = torch.ones((out_channels,)) + act = torch.ones(input_size) + + def linear_op(act, weight, bias=None): + return F.linear(act, weight, bias) + + pattern_w_bias = _get_dynamo_graph(linear_op, (act, weight, bias)) + pattern_wo_bias = _get_dynamo_graph(linear_op, (act, weight)) + return [pattern_w_bias, pattern_wo_bias] + + +def _supported_symmetric_quantized_operators() -> dict[str, list[OperatorPatternType]]: + supported_operators: dict[str, list[OperatorPatternType]] = { + # Both conv and linear should be able to handle relu + hardtanh fusion since + # those are clamp ops + "conv2d": [ + [torch.nn.Conv2d, torch.nn.ReLU], + [torch.nn.Conv2d, F.relu], + [F.conv2d, torch.nn.ReLU], + [F.conv2d, F.relu], + ], + "linear": [[torch.nn.Linear], [F.linear]], + "add": [[torch.add]], + "adaptive_avg_pool2d": [ + [torch.nn.AdaptiveAvgPool2d], + [F.adaptive_avg_pool2d], + ], + } + return copy.deepcopy(supported_operators) + + +def _get_supported_symmetric_config_and_operators() -> list[OperatorConfig]: + supported_config_and_operators: list[OperatorConfig] = [] + for quantization_config in [ + get_symmetric_quantization_config(), + get_symmetric_quantization_config(is_qat=True), + get_symmetric_quantization_config(is_per_channel=True), + get_symmetric_quantization_config(is_per_channel=True, is_qat=True), + ]: + ops = _supported_symmetric_quantized_operators() + supported_config_and_operators.extend( + OperatorConfig(quantization_config, pattern_list) + for pattern_list in ops.values() + ) + return copy.deepcopy(supported_config_and_operators) + + +@functools.lru_cache +def get_symmetric_quantization_config( + is_per_channel: bool = False, + is_qat: bool = False, + is_dynamic: bool = False, + act_qmin: int = -128, + act_qmax: int = 127, + weight_qmin: int = -127, + weight_qmax: int = 127, +): + extra_args: dict[str, Any] = {"eps": 2**-12} + if 
is_qat: + if is_dynamic: + act_observer_or_fake_quant_ctr = FakeQuantize + dynamic_quant_observer = MovingAverageMinMaxObserver.with_args( + averaging_constant=1 + ) + extra_args["observer"] = dynamic_quant_observer + else: + act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize # type: ignore[assignment] + else: + if is_dynamic: + act_observer_or_fake_quant_ctr = PlaceholderObserver # type: ignore[assignment] + else: + act_observer_or_fake_quant_ctr = HistogramObserver # type: ignore[assignment] + + act_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=act_qmin, + quant_max=act_qmax, + qscheme=torch.per_tensor_affine, + is_dynamic=is_dynamic, + observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args( + **extra_args, + ), + ) + weight_qscheme = ( + torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric + ) + weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( + MinMaxObserver + ) + if is_qat: + # TODO: qat + per channel? 
+ weight_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize + elif is_per_channel: + weight_observer_or_fake_quant_ctr = PerChannelMinMaxObserver + + extra_args: dict[str, Any] = {"eps": 2**-12} + if is_qat: + if weight_qscheme == torch.per_tensor_symmetric: + extra_args["observer"] = MovingAverageMinMaxObserver + else: + extra_args["observer"] = MovingAveragePerChannelMinMaxObserver # type: ignore[dict-item] + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=weight_qmin, + quant_max=weight_qmax, + qscheme=weight_qscheme, + ch_axis=0, + is_dynamic=False, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + + bias_quantization_spec = None + if is_dynamic: + quantization_config = QuantizationConfig( + act_quantization_spec, + None, + weight_quantization_spec, + bias_quantization_spec, + is_qat, + ) + else: + quantization_config = QuantizationConfig( + act_quantization_spec, + act_quantization_spec, + weight_quantization_spec, + bias_quantization_spec, + is_qat, + ) + return quantization_config + + +def _get_supported_config_and_operators() -> list[OperatorConfig]: + return _get_supported_symmetric_config_and_operators() + + +def _get_module_type_filter(tp: Callable): + """Get the module_type_filter function for a given module type, the filter accepts + a node and checks if the node comes from a module that has certain module type + + For example: + node: linear_op = call_function[...](...) # comes from a module with type Block -> Sub -> Linear + + + >> module_type_filter = _get_module_type_filter(Sub) # submodule with type `Sub`, under the `Block` submodule + >> print(module_type_filter(node)) + True # the node is from the submodule `Sub` (same for `Block` and `Linear` as well) + """ + + tp_str = tp.__module__ + "." 
+ tp.__qualname__ + + def module_type_filter(n: Node) -> bool: + # example: { + # 'L__self___sub': ("L['self'].sub", ), + # 'L__self___sub_linear': ("L['self'].sub.linear", ) + # } + nn_module_stack = n.meta.get("nn_module_stack", {}) + types = [] + for _, t in nn_module_stack.values(): + # export() returns str, but older APIs (e.g. capture_pre_autograd_graph) + # return type. Handle both cases. + if isinstance(t, type): + t = t.__module__ + "." + t.__qualname__ + types.append(t) + return tp_str in types + + return module_type_filter + + +def _get_not_module_type_or_name_filter( + tp_list: list[Callable], module_name_list: list[str] +) -> Callable[[Node], bool]: + module_type_filters = [_get_module_type_filter(tp) for tp in tp_list] + module_name_list_filters = [_get_module_name_filter(m) for m in module_name_list] + + def not_module_type_or_name_filter(n: Node) -> bool: + return not any(f(n) for f in module_type_filters + module_name_list_filters) + + return not_module_type_or_name_filter + + +@compatibility(is_backward_compatible=False) +@typing_extensions.deprecated( + "XNNPACKQuantizer is deprecated! Please use xnnpack quantizer in " + "ExecuTorch (https://github.com/pytorch/executorch/tree/main/backends/xnnpack/quantizer) instead." +) +class XNNPACKQuantizer(Quantizer): + """ + !!! DEPRECATED !!! + XNNPACKQuantizer is a marked as deprecated. It will be removed in the future. + It has been moved to executorch.backends.xnnpack.quantizer.xnnpack_quantizer.XNNPACKQuantizer. + Please use the new quantizer instead. 
+ """ + + supported_config_and_operators = _get_supported_config_and_operators() + STATIC_QAT_ONLY_OPS = [ + "conv_bn_relu", + "conv_bn", + "conv_transpose_bn_relu", + "conv_transpose_bn", + ] + + # static quantization ops (both PTQ and QAT) + # Preserve the order that fusions come before singular ops + STATIC_OPS = [ + "linear_relu", + "linear", + "conv_relu", + "conv", + "conv_transpose_relu", + "adaptive_avg_pool2d", + # TODO: move this to BoltNNQuantizer? + "gru_io_only", + "add_relu", + "add", + "mul_relu", + "mul", + "cat", + ] + + DYNAMIC_OPS = [ + "linear", + ] + + def __init__(self) -> None: + super().__init__() + self.global_config: QuantizationConfig | None = None + self.operator_type_config: dict[ + torch._ops.OpOverloadPacket, QuantizationConfig | None + ] = {} + self.module_type_config: dict[Callable, QuantizationConfig | None] = {} + self.module_name_config: dict[str, QuantizationConfig | None] = {} + + @classmethod + def get_supported_quantization_configs(cls) -> list[QuantizationConfig]: + op_configs: set[QuantizationConfig] = { + spec for spec, _ in cls.supported_config_and_operators + } + return list(op_configs) + + @classmethod + def get_supported_operator_for_quantization_config( + cls, quantization_config: QuantizationConfig | None + ) -> list[OperatorPatternType]: + if quantization_config is None: + all_ops = [] + for _, ops in cls.supported_config_and_operators: + all_ops.extend(ops) + return all_ops + + for config, ops in cls.supported_config_and_operators: + # note: this assumes each entry in cls.supported_spec_and_operators + # corresponds to one spec, e.g. 
we don't have + # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)] + # where the first and second entry have the same spec but did not + # merge the op list + if config == quantization_config: + return ops + return [] + + def set_global(self, quantization_config: QuantizationConfig) -> XNNPACKQuantizer: + self.global_config = quantization_config + return self + + def set_operator_type( + self, + operator_type: torch._ops.OpOverloadPacket, + quantization_config: QuantizationConfig, + ) -> XNNPACKQuantizer: + self.operator_type_config[operator_type] = quantization_config + return self + + def set_module_type( + self, module_type: Callable, quantization_config: QuantizationConfig + ): + """Set quantization_config for a submodule with type: `module_type`, for example: + quantizer.set_module_name(Sub) or quantizer.set_module_name(nn.Linear), it will quantize all supported operator/operator + patterns in the submodule with this module type with the given `quantization_config` + """ + self.module_type_config[module_type] = quantization_config + return self + + def set_module_name( + self, module_name: str, quantization_config: QuantizationConfig | None + ): + """Set quantization_config for a submodule with name: `module_name`, for example: + quantizer.set_module_name("blocks.sub"), it will quantize all supported operator/operator + patterns in the submodule with this module name with the given `quantization_config` + """ + if quantization_config is None: + raise AssertionError("quantization_config == None is not supported yet") + self.module_name_config[module_name] = quantization_config + return self + + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + """Transforms scalar values to tensor attributes""" + return _convert_scalars_to_attrs(model) + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + """just handling global spec for now""" + # hacked for handling dynamic linear quant. 
will fix later. + if self.global_config and self.global_config.input_activation.is_dynamic: # type: ignore[union-attr] + model = self._annotate_for_dynamic_quantization_config(model) + else: + model = self._annotate_for_static_quantization_config(model) + propagate_annotation(model) + return model + + def _annotate_all_static_patterns( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, + ) -> torch.fx.GraphModule: + # TODO: implement the support for None to be canceling out previous annotations + if quantization_config is None: + return model + + if quantization_config.is_qat: + for op in self.STATIC_QAT_ONLY_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + for op in self.STATIC_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + return model + + def _annotate_all_dynamic_patterns( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, + ) -> torch.fx.GraphModule: + # TODO: implement the support for None to be canceling out previous annotations + if quantization_config is None: + return model + + for op in self.DYNAMIC_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + return model + + def _annotate_for_static_quantization_config( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + module_name_list = list(self.module_name_config.keys()) + for module_name, config in self.module_name_config.items(): + self._annotate_all_static_patterns( + model, config, _get_module_name_filter(module_name) + ) + + tp_list = list(self.module_type_config.keys()) + for module_type, config in self.module_type_config.items(): + self._annotate_all_static_patterns( + model, config, _get_module_type_filter(module_type) + ) + + self._annotate_all_static_patterns( + model, + self.global_config, + _get_not_module_type_or_name_filter(tp_list, 
module_name_list), + ) + return model + + def _annotate_for_dynamic_quantization_config( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + module_name_list = list(self.module_name_config.keys()) + for module_name, config in self.module_name_config.items(): + self._annotate_all_dynamic_patterns( + model, config, _get_module_name_filter(module_name) + ) + + tp_list = list(self.module_type_config.keys()) + for module_type, config in self.module_type_config.items(): + self._annotate_all_dynamic_patterns( + model, config, _get_module_type_filter(module_type) + ) + + self._annotate_all_dynamic_patterns( + model, + self.global_config, + _get_not_module_type_or_name_filter(tp_list, module_name_list), + ) + return model + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + @classmethod + def get_supported_operators(cls) -> list[OperatorConfig]: + return cls.supported_config_and_operators diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..22282d3d071a899e31cd4607027aa3abec249c7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py @@ -0,0 +1,1152 @@ +# mypy: allow-untyped-defs +import itertools +import typing +from collections.abc import Callable +from dataclasses import dataclass +from typing import NamedTuple + +import torch +import torch.nn.functional as F +from torch._subclasses import FakeTensor +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix +from torch.ao.quantization.pt2e.export_utils import _WrapperModule +from torch.ao.quantization.pt2e.utils import ( + _get_aten_graph_module_for_pattern, + _is_conv_node, + _is_conv_transpose_node, +) +from 
torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + QuantizationSpec, + SharedQuantizationSpec, +) +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node +from torch.fx.passes.utils.matcher_with_name_node_map_utils import ( + SubgraphMatcherWithNameNodeMap, +) +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +__all__ = [ + "OperatorConfig", + "OperatorPatternType", + "QuantizationConfig", + "get_input_act_qspec", + "get_output_act_qspec", + "get_weight_qspec", + "get_bias_qspec", + "OP_TO_ANNOTATOR", + "propagate_annotation", +] + + +# In the absence of better name, just winging it with QuantizationConfig +@dataclass(eq=True, frozen=True) +class QuantizationConfig: + input_activation: QuantizationSpec | None + output_activation: QuantizationSpec | None + weight: QuantizationSpec | None + bias: QuantizationSpec | None + # TODO: remove, since we can use observer_or_fake_quant_ctr to express this + is_qat: bool = False + + +# Use Annotated because list[Callable].__module__ is read-only. +OperatorPatternType = typing.Annotated[list[Callable], None] +OperatorPatternType.__module__ = ( + "torch.ao.quantization.quantizer.xnnpack_quantizer_utils" +) + +AnnotatorType = Callable[ + [ + torch.fx.GraphModule, + QuantizationConfig | None, + Callable[[Node], bool] | None, + ], + list[list[Node]] | None, +] +OP_TO_ANNOTATOR: dict[str, AnnotatorType] = {} + + +def register_annotator(op: str) -> Callable[[AnnotatorType], None]: + def decorator(annotator: AnnotatorType) -> None: + OP_TO_ANNOTATOR[op] = annotator + + return decorator + + +class OperatorConfig(NamedTuple): + # fix List[str] with List[List[Union[nn.Module, FunctionType, BuiltinFunctionType]]] + # Basically we are mapping a quantization config to some list of patterns. + # a pattern is defined as a list of nn module, function or builtin function names + # e.g. 
[nn.Conv2d, torch.relu, torch.add] + # We have not resolved whether fusion can be considered internal details of the + # quantizer hence it does not need communication to user. + # Note this pattern is not really informative since it does not really + # tell us the graph structure resulting from the list of ops. + config: QuantizationConfig + operators: list[OperatorPatternType] + + +def _is_annotated(nodes: list[Node]): + """ + Given a list of nodes (that represents an operator pattern), + check if any of the node is annotated, return True if any of the node + is annotated, otherwise return False + """ + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def _mark_nodes_as_annotated(nodes: list[Node]): + for node in nodes: + if node is not None: + if "quantization_annotation" not in node.meta: + node.meta["quantization_annotation"] = QuantizationAnnotation() + node.meta["quantization_annotation"]._annotated = True + + +def get_input_act_qspec(quantization_config: QuantizationConfig | None): + if quantization_config is None: + return None + if quantization_config.input_activation is None: + return None + quantization_spec: QuantizationSpec = quantization_config.input_activation + if quantization_spec.qscheme not in [ + torch.per_tensor_affine, + torch.per_tensor_symmetric, + ]: + raise AssertionError( + f"Unsupported activation qscheme: {quantization_spec.qscheme}" + ) + return quantization_spec + + +def get_output_act_qspec(quantization_config: QuantizationConfig | None): + if quantization_config is None: + return None + if quantization_config.output_activation is None: + return None + quantization_spec: QuantizationSpec = quantization_config.output_activation + if quantization_spec.qscheme not in [ + torch.per_tensor_affine, + torch.per_tensor_symmetric, + ]: + raise AssertionError( + f"Unsupported activation qscheme: 
{quantization_spec.qscheme}" + ) + return quantization_spec + + +def get_weight_qspec(quantization_config: QuantizationConfig | None): + if quantization_config is None: + return None + if quantization_config is None: + raise AssertionError("quantization_config must not be None") + if quantization_config.weight is None: + return None + quantization_spec: QuantizationSpec = quantization_config.weight + if quantization_spec.qscheme not in [ + torch.per_tensor_symmetric, + torch.per_channel_symmetric, + None, + ]: + raise ValueError( + f"Unsupported quantization_spec {quantization_spec} for weight" + ) + return quantization_spec + + +def get_bias_qspec(quantization_config: QuantizationConfig | None): + if quantization_config is None: + return None + if quantization_config is None: + raise AssertionError("quantization_config must not be None") + if quantization_config.bias is None: + return None + quantization_spec: QuantizationSpec = quantization_config.bias + if quantization_spec.dtype != torch.float: + raise AssertionError( + "Only float dtype for bias is supported for bias right now" + ) + return quantization_spec + + +@register_annotator("linear") +def _annotate_linear( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + weight_qspec = get_weight_qspec(quantization_config) + bias_qspec = get_bias_qspec(quantization_config) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target != torch.ops.aten.linear.default: + continue + if filter_fn and not filter_fn(node): + continue + act_node = node.args[0] + weight_node = node.args[1] + bias_node = None + if len(node.args) > 2: + bias_node = node.args[2] + + if _is_annotated([node]) is False: # type: ignore[list-item] + _annotate_input_qspec_map( 
+ node, + act_node, + input_act_qspec, + ) + _annotate_input_qspec_map( + node, + weight_node, + weight_qspec, + ) + nodes_to_mark_annotated = [node, weight_node] + if bias_node: + _annotate_input_qspec_map( + node, + bias_node, + bias_qspec, + ) + nodes_to_mark_annotated.append(bias_node) + _annotate_output_qspec(node, output_act_qspec) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + annotated_partitions.append(nodes_to_mark_annotated) + + return annotated_partitions + + +@register_annotator("linear_relu") +def _annotate_linear_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + weight_qspec = get_weight_qspec(quantization_config) + bias_qspec = get_bias_qspec(quantization_config) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = node + maybe_linear_node = node.args[0] + if ( + not isinstance(maybe_linear_node, Node) + or maybe_linear_node.op != "call_function" + or maybe_linear_node.target != torch.ops.aten.linear.default + ): + continue + + linear_node = maybe_linear_node + if len(linear_node.users) > 1: + # if linear node has multiple users, then it can't be fused with relu + continue + + input_qspec_map = {} + input_act = linear_node.args[0] + if not isinstance(input_act, Node): + raise AssertionError("input activation must be a FX Node") + input_qspec_map[input_act] = input_act_qspec + + weight = linear_node.args[1] + if not isinstance(weight, Node): + raise AssertionError("weight must be a FX Node") + input_qspec_map[weight] = weight_qspec + + # adding weight node to the partition as well + partition = [relu_node, linear_node, weight] + bias = linear_node.args[2] 
if len(linear_node.args) > 2 else None + if isinstance(bias, Node): + input_qspec_map[bias] = bias_qspec + partition.append(bias) + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + linear_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=output_act_qspec, + _annotated=True, + ) + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("conv") +def _annotate_conv( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + for n in gm.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + ]: + continue + conv_node = n + + input_qspec_map = {} + input_act = conv_node.args[0] + if not isinstance(input_act, Node): + raise AssertionError("input activation must be a FX Node") + input_qspec_map[input_act] = get_input_act_qspec(quantization_config) + + weight = conv_node.args[1] + if not isinstance(weight, Node): + raise AssertionError("weight must be a FX Node") + input_qspec_map[weight] = get_weight_qspec(quantization_config) + + # adding weight node to the partition as well + partition = [conv_node, conv_node.args[1]] + + bias = conv_node.args[2] if len(conv_node.args) > 2 else None + if isinstance(bias, Node): + input_qspec_map[bias] = get_bias_qspec(quantization_config) + partition.append(bias) + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + conv_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + 
output_qspec=get_output_act_qspec(quantization_config), + _annotated=True, + ) + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +def _do_annotate_conv_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, + is_conv_transpose: bool = False, +): + annotated_partitions = [] + for n in gm.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = n + maybe_conv_node = n.args[0] + + is_conv_node = _is_conv_transpose_node if is_conv_transpose else _is_conv_node + if not isinstance(maybe_conv_node, Node) or not is_conv_node(maybe_conv_node): + continue + conv_node = maybe_conv_node + + if len(conv_node.users) > 1: + # relu shouldn't be fuseable to conv if there are other users + # of convolution + continue + + input_qspec_map = {} + input_act = conv_node.args[0] + if not isinstance(input_act, Node): + raise AssertionError("input activation must be a FX Node") + input_qspec_map[input_act] = get_input_act_qspec(quantization_config) + + weight = conv_node.args[1] + if not isinstance(weight, Node): + raise AssertionError("weight must be a FX Node") + input_qspec_map[weight] = get_weight_qspec(quantization_config) + + # adding weight node to the partition as well + partition = [relu_node, conv_node, conv_node.args[1]] + bias = conv_node.args[2] if len(conv_node.args) > 2 else None + if isinstance(bias, Node): + input_qspec_map[bias] = get_bias_qspec(quantization_config) + partition.append(bias) + + # pyrefly: ignore [bad-argument-type] + if _is_annotated(partition): + continue + + # pyrefly: ignore [bad-argument-type] + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + conv_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, _annotated=True + ) + 
relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + ) + # pyrefly: ignore [bad-argument-type] + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("conv_relu") +def _annotate_conv_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + return _do_annotate_conv_relu( + gm, quantization_config, filter_fn, is_conv_transpose=False + ) + + +@register_annotator("conv_transpose_relu") +def _annotate_conv_transpose_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + return _do_annotate_conv_relu( + gm, quantization_config, filter_fn, is_conv_transpose=True + ) + + +@register_annotator("conv_bn") +def _annotate_conv_bn( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + """ + Find conv + batchnorm partitions + Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv. + """ + return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=False) + + +@register_annotator("conv_bn_relu") +def _annotate_conv_bn_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + """ + Find conv + batchnorm + relu partitions + Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv. 
+ """ + return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=True) + + +@register_annotator("conv_transpose_bn") +def _annotate_conv_transpose_bn( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + """ + Find conv_transpose + batchnorm partitions + Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv. + """ + return _do_annotate_conv_bn( + gm, quantization_config, filter_fn, has_relu=False, is_conv_transpose=True + ) + + +@register_annotator("conv_transpose_bn_relu") +def _annotate_conv_transpose_bn_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + """ + Find conv_transpose + batchnorm + relu partitions + Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv. + """ + return _do_annotate_conv_bn( + gm, quantization_config, filter_fn, has_relu=True, is_conv_transpose=True + ) + + +def _do_annotate_conv_bn( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None, + has_relu: bool, + is_conv_transpose: bool = False, +) -> list[list[Node]]: + """ + Given a function that takes in a `conv_fn` and returns a conv-bn[-relu] pattern, + return a list of annotated partitions. + + The output of the pattern must include a dictionary from string name to node + for the following names: "input", "conv", "weight", "bias", and "output". 
+ """ + + # Example inputs for conv-bn1d patterns + _conv1d_bn_example_inputs = ( + torch.randn(1, 1, 3), # x + torch.randn(1, 1, 1), # conv_weight + torch.randn(1), # conv_bias + torch.randn(1), # bn_weight + torch.randn(1), # bn_bias + torch.randn(1), # bn_running_mean + torch.randn(1), # bn_running_var + ) + + # Example inputs for conv-bn2d patterns + _conv2d_bn_example_inputs = ( + torch.randn(1, 1, 3, 3), # x + torch.randn(1, 1, 1, 1), # conv_weight + torch.randn(1), # conv_bias + torch.randn(1), # bn_weight + torch.randn(1), # bn_bias + torch.randn(1), # bn_running_mean + torch.randn(1), # bn_running_var + ) + + def get_pattern(conv_fn: Callable, relu_is_inplace: bool): + def _conv_bn(x, conv_weight, conv_bias, bn_weight, bn_bias, bn_rm, bn_rv): + conv = conv_fn(x, conv_weight, conv_bias) + bn = F.batch_norm(conv, bn_rm, bn_rv, bn_weight, bn_bias, training=True) + if has_relu: + output = F.relu_(bn) if relu_is_inplace else F.relu(bn) + else: + output = bn + return output, { + "input": x, + "conv": conv, + "weight": conv_weight, + "bias": conv_bias, + "output": output, + } + + return _WrapperModule(_conv_bn) + + # Needed for matching, otherwise the matches gets filtered out due to unused + # nodes returned by batch norm + gm.graph.eliminate_dead_code() + gm.recompile() + + matches = [] + if is_conv_transpose: + combinations = [ + (F.conv_transpose1d, _conv1d_bn_example_inputs), + (F.conv_transpose2d, _conv2d_bn_example_inputs), + ] + else: + combinations = [ + (F.conv1d, _conv1d_bn_example_inputs), # type: ignore[list-item] + (F.conv2d, _conv2d_bn_example_inputs), # type: ignore[list-item] + ] + + # Add `is_cuda` and `relu_is_inplace` dimensions + combinations = itertools.product( # type: ignore[assignment] + combinations, + [True, False] if torch.cuda.is_available() else [False], # is_cuda + [True, False] if has_relu else [False], # relu_is_inplace + ) + + # Match against all conv dimensions and cuda variants + for (conv_fn, example_inputs), is_cuda, 
relu_is_inplace in combinations: # type: ignore[misc] + pattern = get_pattern(conv_fn, relu_is_inplace) # type: ignore[has-type] + pattern = _get_aten_graph_module_for_pattern(pattern, example_inputs, is_cuda) # type: ignore[has-type] + pattern.graph.eliminate_dead_code() + pattern.recompile() + matcher = SubgraphMatcherWithNameNodeMap(pattern, ignore_literals=True) + matches.extend(matcher.match(gm.graph)) + + # Annotate nodes returned in the matches + annotated_partitions = [] + for match in matches: + name_node_map = match.name_node_map + input_node = name_node_map["input"] + conv_node = name_node_map["conv"] + weight_node = name_node_map["weight"] + bias_node = name_node_map["bias"] + output_node = name_node_map["output"] + + # TODO: annotate the uses of input, weight, and bias separately instead + # of assuming they come from a single conv node. This is not possible today + # because input may have multiple users, and we can't rely on the conv node + # always being the first user. This was the case in models with skip + # connections like resnet18 + + # Validate conv args + if conv_node.args[0] is not input_node: + raise ValueError("Conv arg did not contain input node ", input_node) + if conv_node.args[1] is not weight_node: + raise ValueError("Conv arg did not contain weight node ", weight_node) + if len(conv_node.args) > 2 and conv_node.args[2] is not bias_node: + raise ValueError("Conv arg did not contain bias node ", bias_node) + + # Skip if the partition is already annotated or is filtered out by the user + partition = [conv_node, weight_node] + if bias_node is not None: + partition.append(bias_node) + if _is_annotated(partition): + continue + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + # Annotate conv inputs and pattern output + input_qspec_map = {} + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + input_qspec_map[weight_node] = get_weight_qspec(quantization_config) + if bias_node is not None: + 
input_qspec_map[bias_node] = get_bias_qspec(quantization_config) + conv_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + output_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + ) + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("gru_io_only") +def _annotate_gru_io_only( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + gru_partitions = get_source_partitions(gm.graph, [torch.nn.GRU], filter_fn) + gru_partitions = list(itertools.chain.from_iterable(gru_partitions.values())) + annotated_partitions = [] + for gru_partition in gru_partitions: + annotated_partitions.append(gru_partition.nodes) + output_nodes = gru_partition.output_nodes + input_nodes = gru_partition.input_nodes + # skip annotation if it is already annotated + if _is_annotated(input_nodes + output_nodes): + continue + # inside each GRU partition, we should be able to annotate each linear + # subgraph + input_act = input_nodes[0] + input_act_user = next(iter(input_act.users.keys())) + if not isinstance(input_act, Node): + raise AssertionError("input activation must be a FX Node") + if not isinstance(input_act_user, Node): + raise AssertionError("input activation user must be a FX Node") + input_act_user.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: get_input_act_qspec(quantization_config), + }, + _annotated=True, + ) + + hidden_state = input_nodes[1] + hidden_state_user = next(iter(hidden_state.users.keys())) + if not isinstance(hidden_state, Node): + raise AssertionError("hidden state must be a FX Node") + if not isinstance(hidden_state_user, Node): + raise AssertionError("hidden 
state user must be a FX Node") + hidden_state_user.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + hidden_state: get_input_act_qspec(quantization_config), + }, + _annotated=True, + ) + + if len(output_nodes) != 2: + raise AssertionError("expecting GRU to have two outputs") + for output in output_nodes: + output.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=get_output_act_qspec(quantization_config), + _annotated=True, + ) + nodes_to_mark_annotated = list(gru_partition.nodes) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + return annotated_partitions + + +@register_annotator("adaptive_avg_pool2d") +def _annotate_adaptive_avg_pool2d( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + """Always annotate adaptive_avg_pool2d op""" + module_partitions = get_source_partitions( + gm.graph, [torch.nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d], filter_fn + ) + partitions = list(itertools.chain.from_iterable(module_partitions.values())) + annotated_partitions = [] + for partition in partitions: + pool_node = partition.output_nodes[0] + if ( + pool_node.op != "call_function" + or pool_node.target != torch.ops.aten.adaptive_avg_pool2d.default + ): + raise ValueError(f"{pool_node} is not an aten adaptive_avg_pool2d operator") + + if _is_annotated([pool_node]): + continue + + annotated_partitions.append(partition.nodes) + input_act = pool_node.args[0] + if not isinstance(input_act, Node): + raise AssertionError("input activation must be a FX Node") + + # only annotate input output sharing operator + # when the output of the input node is annotated + if ( + "quantization_annotation" not in input_act.meta + or not input_act.meta["quantization_annotation"]._annotated + or input_act.meta["quantization_annotation"].output_qspec is None + ): + input_act_qspec = get_input_act_qspec(quantization_config) + else: + 
input_act_qspec = SharedQuantizationSpec(input_act) + + # output sharing with input + output_act_qspec = SharedQuantizationSpec((input_act, pool_node)) + pool_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: input_act_qspec, + }, + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +def _is_input_large_scalar(node: Node, gm: torch.fx.GraphModule): + """Check if input is a large scalar value. So that we can skip quantization for the node + since histc op (in HistogramObserver) only works for values up to certain upper bound + """ + if node.op == "get_attr": + qualified_name = str(node.target) + module_path, _, name = qualified_name.rpartition(".") + submod = gm.get_submodule(module_path) + tensor = getattr(submod, name) + # torch.histc works until this upper bound + HISTC_UPPER_BOUND = 3.4028235e15 + return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND + return False + + +def _is_input_non_float_tensor(node: Node): + """Check if the input is not a float tensor, so that we can skip quantization for the node + since observers only works with float Tensors + """ + if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): + return True + return node.meta["val"].dtype != torch.float32 + + +@register_annotator("add_relu") +def _annotate_add_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = node + maybe_add = node.args[0] + if ( + not isinstance(maybe_add, Node) + or maybe_add.op != "call_function" + or maybe_add.target + not in [ + torch.ops.aten.add.Tensor, + torch.ops.aten.add_.Tensor, + ] + ): + continue + + add_node = maybe_add + + if 
len(add_node.users) > 1: + # add can't be fused with ReLU if the result of add is being used + # else where in the graph + continue + + partition = [relu_node, add_node] + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = add_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + partition.append(input_act0) + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = add_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + partition.append(input_act1) + input_qspec_map[input_act1] = input_act_qspec + + add_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=output_act_qspec, + _annotated=True, + ) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("add") +def _annotate_add( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in [ + torch.ops.aten.add.Tensor, + torch.ops.aten.add_.Tensor, + ]: + continue + add_node = node + partition = [add_node] + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + 
input_act0 = add_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + input_qspec_map[input_act0] = input_act_qspec + partition.append(input_act0) + + input_act1 = add_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + input_qspec_map[input_act1] = input_act_qspec + partition.append(input_act1) + + add_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("mul_relu") +def _annotate_mul_relu( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = node + maybe_mul = node.args[0] + if ( + not isinstance(maybe_mul, Node) + or maybe_mul.op != "call_function" + or maybe_mul.target + not in [ + torch.ops.aten.mul.Tensor, + torch.ops.aten.mul_.Tensor, + ] + ): + continue + + mul_node = maybe_mul + if len(mul_node.users) > 1: + # mul can't be fused with ReLU if the result of mul is being used + # else where in the graph + continue + + partition = [relu_node, mul_node] + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = mul_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if 
_is_input_non_float_tensor(input_act0): + continue + partition.append(input_act0) + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = mul_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + partition.append(input_act1) + input_qspec_map[input_act1] = input_act_qspec + + mul_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=output_act_qspec, + _annotated=True, + ) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("mul") +def _annotate_mul( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + annotated_partitions = [] + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in [ + torch.ops.aten.mul.Tensor, + torch.ops.aten.mul_.Tensor, + ]: + continue + + mul_node = node + partition = [mul_node] + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = mul_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + input_qspec_map[input_act0] = input_act_qspec + partition.append(input_act0) + + input_act1 = mul_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + input_qspec_map[input_act1] = input_act_qspec + partition.append(input_act0) + + mul_node.meta["quantization_annotation"] = 
QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + annotated_partitions.append(partition) + return annotated_partitions + + +# TODO: remove Optional in return type, fix annotated_partitions logic +@register_annotator("cat") +def _annotate_cat( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: Callable[[Node], bool] | None = None, +) -> list[list[Node]] | None: + cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn) + cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values())) + annotated_partitions = [] + for cat_partition in cat_partitions: + cat_node = cat_partition.output_nodes[0] + if _is_annotated([cat_node]): + continue + + if cat_node.target != torch.ops.aten.cat.default: + # TODO: change this to AnnotationException + raise Exception( # noqa: TRY002 + f"Expected cat node: torch.ops.aten.cat.default, but found {cat_node.target}" + " please check if you are calling the correct capture API" + ) + + annotated_partitions.append(cat_partition.nodes) + + input_act_qspec = get_input_act_qspec(quantization_config) + inputs = cat_node.args[0] + + input_qspec_map = {} + input_act0 = inputs[0] # type: ignore[index] + if isinstance(input_act0, Node): + input_qspec_map[input_act0] = input_act_qspec + + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node)) # type: ignore[arg-type] + for input_act in inputs[1:]: # type: ignore[index, union-attr] + if input_act not in input_qspec_map: + input_qspec_map[input_act] = shared_with_input0_qspec # type: ignore[index] + + output_act_qspec = shared_with_input0_qspec + + cat_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +def _is_share_obs_or_fq_op(op: Callable) -> bool: + return op in [ + torch.ops.aten.relu.default, + 
torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + torch.ops.aten.max_pool2d.default, + torch.ops.aten.mean.default, + torch.ops.aten.mean.dim, + torch.ops.aten.permute.default, + torch.ops.aten.permute_copy.default, + torch.ops.aten.squeeze.dim, + torch.ops.aten.squeeze_copy.dim, + # TODO: remove? + torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.view_copy.default, + torch.ops.aten.view.default, + torch.ops.aten.slice_copy.Tensor, + torch.ops.aten.flatten.using_ints, + ] + + +def propagate_annotation(model: torch.fx.GraphModule) -> None: + for n in model.graph.nodes: + if n.op != "call_function" or not _is_share_obs_or_fq_op(n.target): + continue + + prev_node = n.args[0] + if not isinstance(prev_node, Node): + continue + + quantization_annotation = prev_node.meta.get("quantization_annotation", None) + if not quantization_annotation: + continue + + output_qspec = quantization_annotation.output_qspec + if not output_qspec: + continue + + # make sure current node is not annotated + if ( + "quantization_annotation" in n.meta + and n.meta["quantization_annotation"]._annotated + ): + continue + + shared_qspec = SharedQuantizationSpec(prev_node) + # propagate the previous output_qspec to the current node + n.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + prev_node: shared_qspec, + }, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# TODO: make the list of ops customizable +def _convert_scalars_to_attrs(model: torch.fx.GraphModule) -> torch.fx.GraphModule: + for n in model.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.add.Tensor, + torch.ops.aten.mul.Tensor, + ]: + continue + args = list(n.args) + new_args = [] + for i in range(len(args)): + if isinstance(args[i], torch.fx.Node): + new_args.append(args[i]) + continue + prefix = "_tensor_constant_" + get_new_attr_name = get_new_attr_name_with_prefix(prefix) + tensor_constant_name = get_new_attr_name(model) + 
float_tensor = torch.tensor(float(args[i])) + model.register_buffer(tensor_constant_name, float_tensor) + fake_mode = n.meta["val"].fake_mode + with model.graph.inserting_before(n): + get_attr_node = model.graph.create_node( + "get_attr", tensor_constant_name, (), {} + ) + get_attr_node.meta["val"] = fake_mode.from_tensor( + float_tensor, static_shapes=True + ) + new_args.append(get_attr_node) + n.args = tuple(new_args) + model.recompile() + return model diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..1c0fc48fd54fa17b6ed0db900677ab339d62a988 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py @@ -0,0 +1,117 @@ +# mypy: allow-untyped-defs +import functools +from typing import Any, TYPE_CHECKING + +import torch +from torch.ao.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.ao.quantization.quantizer.quantizer import QuantizationSpec +from torch.ao.quantization.quantizer.x86_inductor_quantizer import ( + _is_any_annotated, + FilterFn, + int8_in_int8_out_ops, + X86InductorQuantizer, +) +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig +from torch.fx import Node + + +if TYPE_CHECKING: + from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor + +__all__ = [ + "XPUInductorQuantizer", + "get_default_xpu_inductor_quantization_config", +] + + +@functools.lru_cache +def get_default_xpu_inductor_quantization_config(): + extra_args: dict[str, Any] = {"eps": 2**-12} + act_observer_or_fake_quant_ctr = HistogramObserver + act_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + 
qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + + weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( + PerChannelMinMaxObserver + ) + + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_channel_symmetric, + ch_axis=0, # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv + is_dynamic=False, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + + bias_quantization_spec = None # will use placeholder observer by default + quantization_config = QuantizationConfig( + act_quantization_spec, + act_quantization_spec, + weight_quantization_spec, + bias_quantization_spec, + False, + ) + return quantization_config + + +class XPUInductorQuantizer(X86InductorQuantizer): + """ + XPUInductorQuantizer is a class designed to facilitate + quantization capability at Intel GPU backend. The class + highly reuses the existing implementation of + X86InductorQuantizer as both are intended to take advantage + of the optimized kernels in oneDNN library. + """ + + """ + Following annotate_xx overrides the impls in base class, as + no XPU implementation for these operators currently. We would + gradually enable the XPU implementation and remove following + overrides. We keep the annotate methods but make the function + body empty, aiming to let `_generate_qdq_quantized_model` + generate qdq around op and graph execute on fp32 dtype for + unsupported operators. 
+ """ + + def _annotate_qat_conv2d_fusion_pattern( + self, + model: torch.fx.GraphModule, + quantization_config: QuantizationConfig | None, + filter_fn: FilterFn | None = None, + ): + pass + + def _annotate_maxpool2d( + self, + node: Node, + quantization_config: QuantizationConfig | None, + ) -> None: + """ + Here we skip the annotate logic for maxpool at XPU backend + as the quantized::max_pool2d is only implemented for CPU. + """ + return + + def _annotate_output_for_int8_in_int8_out_pattern( + self, + node: Node, + ) -> None: + if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])): + if node.target is torch.ops.aten.max_pool2d.default: + return + else: + input_node = node.all_input_nodes[0] + self._annotate_output_share_observer_as_input(input_node, node) + return diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/stubs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/stubs.py new file mode 100644 index 0000000000000000000000000000000000000000..8dd05374eff844be2cec2d913b88a338aded4e6a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/stubs.py @@ -0,0 +1,74 @@ +from typing import Any + +import torch +from torch import nn +from torch.ao.quantization import QConfig + + +__all__ = ["QuantStub", "DeQuantStub", "QuantWrapper"] + + +class QuantStub(nn.Module): + r"""Quantize stub module, before calibration, this is same as an observer, + it will be swapped as `nnq.Quantize` in `convert`. 
+ + Args: + qconfig: quantization configuration for the tensor, + if qconfig is not provided, we will get qconfig from parent modules + """ + + def __init__(self, qconfig: QConfig | None = None): + super().__init__() + if qconfig: + self.qconfig = qconfig + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + +class DeQuantStub(nn.Module): + r"""Dequantize stub module, before calibration, this is same as identity, + this will be swapped as `nnq.DeQuantize` in `convert`. + + Args: + qconfig: quantization configuration for the tensor, + if qconfig is not provided, we will get qconfig from parent modules + """ + + def __init__(self, qconfig: Any | None = None): + super().__init__() + if qconfig: + self.qconfig = qconfig + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + +class QuantWrapper(nn.Module): + r"""A wrapper class that wraps the input module, adds QuantStub and + DeQuantStub and surround the call to module with call to quant and dequant + modules. + + This is used by the `quantization` utility functions to add the quant and + dequant modules, before `convert` function `QuantStub` will just be observer, + it observes the input tensor, after `convert`, `QuantStub` + will be swapped to `nnq.Quantize` which does actual quantization. Similarly + for `DeQuantStub`. 
+ """ + + quant: QuantStub + dequant: DeQuantStub + module: nn.Module + + def __init__(self, module: nn.Module): + super().__init__() + qconfig = getattr(module, "qconfig", None) + self.add_module("quant", QuantStub(qconfig)) + self.add_module("dequant", DeQuantStub(qconfig)) + self.add_module("module", module) + self.train(module.training) + + def forward(self, X: torch.Tensor) -> torch.Tensor: + X = self.quant(X) + X = self.module(X) + return self.dequant(X) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..84a027e17e6b07cfbddc8b7b436ba0299b32ef91 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/utils.py @@ -0,0 +1,875 @@ +# mypy: allow-untyped-defs +""" +Utils shared by different modes of quantization (eager/graph) +""" + +import functools +import sys +import warnings +from collections import OrderedDict +from collections.abc import Callable +from inspect import getfullargspec, signature +from typing import Any, Union + +import torch +from torch.ao.quantization.quant_type import QuantType +from torch.fx import Node +from torch.nn.utils.parametrize import is_parametrized + + +if sys.version_info < (3, 12): + NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any] + NodePattern.__module__ = "torch.ao.quantization.utils" +else: + from typing import TypeAliasType + + NodePattern = TypeAliasType( + "NodePattern", tuple[Node, Node] | tuple[Node, tuple[Node, Node]] | Any + ) + + +# This is the Quantizer class instance from torch/quantization/fx/quantize.py. +# Define separately to prevent circular imports. +# TODO(future PR): improve this. 
+# make this public once fixed (can't be public as is because setting the module directly +# doesn't work) +QuantizerCls = Any + +# Type for fusion patterns, it can be more complicated than the following actually, +# see pattern.md for docs +# TODO: not sure if typing supports recursive data types + +if sys.version_info < (3, 12): + Pattern = Union[ + Callable, + tuple[Callable, Callable], + tuple[Callable, tuple[Callable, Callable]], + Any, + ] + Pattern.__module__ = "torch.ao.quantization.utils" +else: + from typing import TypeAliasType + + Pattern = TypeAliasType( + "Pattern", + Callable + | tuple[Callable, Callable] + | tuple[Callable, tuple[Callable, Callable]] + | Any, + ) + + +# TODO: maybe rename this to MatchInputNode +class MatchAllNode: + """A node pattern that matches all nodes, used in defining + fusion patterns in FX Graph Mode Quantization + """ + + +module_type_list = { + torch.nn.ReLU, + torch.nn.ReLU6, + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.Identity, + torch.nn.Hardsigmoid, + torch.nn.Sigmoid, + torch.nn.Tanh, +} +func_list = { + torch.nn.functional.adaptive_avg_pool1d, + torch.nn.functional.adaptive_avg_pool2d, + torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.layer_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.silu, + torch.nn.functional.mish, + torch.nn.functional.dropout, + torch.nn.functional.max_pool1d, + torch.nn.functional.max_pool2d, + torch.nn.functional.max_pool3d, + torch.nn.functional.relu, + torch.nn.functional.hardtanh, + torch.nn.functional.hardtanh_, + torch.nn.functional.hardsigmoid, + torch.nn.functional.sigmoid, + torch.transpose, + torch.repeat_interleave, + torch.sigmoid, + torch.squeeze, + torch.stack, + 
torch.sum, + torch.tanh, + torch.unsqueeze, + torch.cat, +} +method_list = { + torch.mean, + "relu", + "relu_", + "contiguous", + "detach", + "detach_", + "hardsigmoid", + "hardsigmoid_", + "permute", + "repeat", + "repeat_interleave", + "reshape", + "resize_", + "shape", + "sigmoid", + "sigmoid_", + "size", + "squeeze", + "squeeze_", + "tanh", + "tanh_", + "transpose", + "unsqueeze", + "unsqueeze_", + "view", +} + + +# TODO: not used now, remove +def check_node(node, modules): + # TODO: reuse is_fixed_qparam_node after we move this function to _lower_to_native_backend.py + is_call_function = node.op == "call_function" and node.target in func_list + is_call_method = node.op == "call_method" and node.target in method_list + is_call_module = ( + node.op == "call_module" and type(modules[str(node.target)]) in module_type_list + ) + return is_call_function, is_call_method, is_call_module + + +def get_combined_dict(default_dict, additional_dict): + """ + Combines two dictionaries. + + This function takes two dictionaries as input and returns a new dictionary + that contains all the key-value pairs from both input dictionaries. + If there are any duplicate keys in the `additional_dict`, the values + from the `additional_dict` will overwrite those in the `default_dict`. 
+ Args: + default_dict (dict): The main dictionary that will be used as the base + additional_dict (dict): The dictionary used to update `default_dict` + + Returns: + dict: The resulting dictionary + Example: + >>> x = dict(a=1, b=1) + >>> y = dict(b=2, c=3) + >>> get_combined_dict(x, y) + {'a': 1, 'b': 2, 'c': 3} + """ + d = default_dict.copy() + d.update(additional_dict) + return d + + +def is_per_tensor(qscheme): + return qscheme == torch.per_tensor_affine or qscheme == torch.per_tensor_symmetric + + +def is_per_channel(qscheme): + return qscheme in [ + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + torch.per_channel_symmetric, + ] + + +def getattr_from_fqn(obj: Any, fqn: str) -> Any: + """ + Given an obj and a fqn such as "foo.bar.baz", returns gm.foo.bar.baz. + """ + return functools.reduce(getattr, fqn.split("."), obj) + + +def to_underlying_dtype(qdtype): + DTYPE_MAPPING = { + torch.quint8: torch.uint8, + torch.qint8: torch.int8, + torch.qint32: torch.int32, + torch.quint4x2: torch.uint8, + torch.quint2x4: torch.uint8, + torch.uint8: torch.uint8, + torch.int8: torch.int8, + torch.uint16: torch.uint16, + torch.int16: torch.int16, + torch.int32: torch.int32, + torch.float8_e5m2: torch.float8_e5m2, + torch.float8_e4m3fn: torch.float8_e4m3fn, + } + if qdtype not in DTYPE_MAPPING: + raise AssertionError("Unsupported dtype: " + str(qdtype)) + return DTYPE_MAPPING[qdtype] + + +def get_qparam_dict(observer_or_fake_quant): + from torch.ao.quantization.observer import PlaceholderObserver + + qscheme = getattr(observer_or_fake_quant, "qscheme", None) + dtype = observer_or_fake_quant.dtype + qparams = {"qscheme": qscheme, "dtype": dtype} + + if not qscheme or isinstance(observer_or_fake_quant, PlaceholderObserver): + return {"qscheme": None, "dtype": dtype} + + if is_per_tensor(qscheme): + qscheme = torch.per_tensor_affine + elif is_per_channel(qscheme): + # change symmetric to affine since we do not have symmetric + # quantized Tensor + if 
qscheme == torch.per_channel_symmetric: + qscheme = torch.per_channel_affine + qparams["axis"] = observer_or_fake_quant.ch_axis + else: + raise RuntimeError(f"Unrecognized qscheme: {qscheme}") + # update qscheme, since we don't have symmetric quant qscheme + # in quantized Tensor + qparams["qscheme"] = qscheme + + scale, zero_point = observer_or_fake_quant.calculate_qparams() + qparams["scale"] = scale + qparams["zero_point"] = zero_point + + if hasattr(observer_or_fake_quant, "quant_min"): + qparams["quant_min"] = observer_or_fake_quant.quant_min + if hasattr(observer_or_fake_quant, "quant_max"): + qparams["quant_max"] = observer_or_fake_quant.quant_max + + return qparams + + +def get_swapped_custom_module_class( + custom_module, custom_module_class_mapping, qconfig +): + """Get the observed/quantized custom module class that we need + to swap `custom_module` to + Input: + custom_module: input, can be an instance of either a float or observed custom module + custom_module_class_mapping: the float to observed or observed to quantized custom module class mapping + qconfig: qconfig configured for the custom module + + Output: + corresponding observed/quantized custom module class for input custom module instance + """ + quant_type = get_quant_type(qconfig) + class_mapping = custom_module_class_mapping.get(quant_type, {}) + if type(custom_module) not in class_mapping: + raise AssertionError( + "did not find corresponding observed " + f"module class for {type(custom_module)} in mapping: {class_mapping}" + ) + return class_mapping[type(custom_module)] + + +def activation_dtype(qconfig): + if qconfig is None: + raise AssertionError("qconfig must be provided to determine activation dtype") + activation = qconfig.activation() + return activation.dtype + + +def weight_dtype(qconfig): + if qconfig is None: + raise AssertionError("qconfig must be provided to determine weight dtype") + weight = qconfig.weight() + return weight.dtype + + +def 
activation_is_statically_quantized(qconfig): + """Given a qconfig, decide if the activation needs to be + quantized or not, this includes quantizing to quint8, qint8 and qint32 and float16 + """ + return activation_dtype(qconfig) in [ + torch.quint8, + torch.qint8, + torch.qint32, + torch.float16, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.float8_e5m2, + torch.float8_e4m3fn, + ] and (not activation_is_dynamically_quantized(qconfig)) + + +def activation_is_dynamically_quantized(qconfig): + """Given a qconfig, decide if the activation needs to be + dynamically quantized or not, this includes dynamically quantizing to + quint8, qint8 and float16 + """ + _activation_dtype, _, activation_is_dynamic = get_qconfig_dtypes(qconfig) + return activation_is_dynamic + + +def activation_is_int8_quantized(qconfig): + """Given a qconfig, decide if the activation needs to be + quantized to int8 or not, this includes quantizing to quint8, qint8 + """ + return activation_dtype(qconfig) in [ + torch.quint8, + torch.qint8, + torch.uint8, + torch.int8, + ] + + +def activation_is_int32_quantized(qconfig): + """Given a qconfig, decide if the activation needs to be + quantized to int32 or not + """ + return activation_dtype(qconfig) in [torch.qint32, torch.int32] + + +def weight_is_quantized(qconfig): + """Given a qconfig, decide if the weight needs to be + quantized or not + """ + return weight_dtype(qconfig) in [ + torch.quint8, + torch.qint8, + torch.float16, + torch.quint4x2, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.float8_e5m2, + torch.float8_e4m3fn, + ] + + +def weight_is_statically_quantized(qconfig): + """Given a qconfig, decide if the weight needs to be statically + quantized or not + """ + return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.uint8, torch.int8] + + +def op_is_int8_dynamically_quantized(qconfig) -> bool: + """Given a qconfig, returns True if this op is using int8 dynamic + quantization + """ + 
activation_dtype, weight_dtype, activation_is_dynamic = get_qconfig_dtypes(qconfig) + return ( + activation_dtype in [torch.quint8, torch.uint8] + and + # for now, the lines below assume fbgemm or qnnpack + weight_dtype in [torch.qint8, torch.int8] + and activation_is_dynamic + ) + + +def get_qconfig_dtypes(qconfig): + r"""returns the qconfig tuple for qconfig: + (activation_dtype, weight_dtype, activation_is_dynamic) + """ + if qconfig is None: + raise AssertionError("qconfig must be provided to extract dtypes") + activation = qconfig.activation() + weight = qconfig.weight() + act_is_dynamic = getattr(activation, "is_dynamic", False) + return (activation.dtype, weight.dtype, act_is_dynamic) + + +def get_quant_type(qconfig): + if qconfig is None: + raise AssertionError("qconfig must be provided to determine quant type") + activation = qconfig.activation() + weight = qconfig.weight() + static_dtypes = [ + torch.quint8, + torch.qint8, + torch.quint4x2, + torch.qint32, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.float8_e5m2, + torch.float8_e4m3fn, + ] + if weight.dtype in static_dtypes: + if hasattr(activation, "is_dynamic") and activation.is_dynamic: + return QuantType.DYNAMIC + elif activation.dtype in static_dtypes: + return QuantType.STATIC + else: + return QuantType.WEIGHT_ONLY + + if weight.dtype == torch.float16: + if hasattr(activation, "is_dynamic") and activation.is_dynamic: + return QuantType.DYNAMIC + elif activation.dtype == torch.float16: + return QuantType.STATIC + + raise Exception( # noqa: TRY002 + f"Unrecognized dtype combination in get_quant_type: activation({activation.dtype})," + f"weight({weight.dtype})" + ) + + +def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool: + """Checks if the given minimum and maximum values are valid, meaning that + they exist and the min value is less than the max value. 
+ """ + if min_val.numel() == 0 or max_val.numel() == 0: + warnings.warn( + "must run observer before calling calculate_qparams. " + + "Returning default values.", + stacklevel=2, + ) + return False + + if min_val.dim() == 0 or max_val.dim() == 0: + if min_val == float("inf") and max_val == float("-inf"): + warnings.warn( + "must run observer before calling calculate_qparams. " + + "Returning default values.", + stacklevel=2, + ) + + return False + + if min_val > max_val: + raise AssertionError(f"min {min_val} should be less than max {max_val}") + else: + if torch.any(min_val > max_val): + raise AssertionError(f"min {min_val} should be less than max {max_val}") + + return True + + +def calculate_qmin_qmax( + quant_min: int, + quant_max: int, + has_customized_qrange: bool, + dtype: torch.dtype, + reduce_range: bool, +) -> tuple[int, int]: + r"""Calculates actual qmin and qmax based on the quantization range, + observer datatype and if range is reduced. + """ + # TODO(jerryzh): Figure out why custom quant_min/quant_max are still adjusted. + if has_customized_qrange: + # This initialization here is to be resolve TorchScript compilation issues and allow + # using of refinement to decouple initial_qmin and initial_qmax from quantization range. + # The actual values of initial_qmin and initial_qmax will be reset below. + if dtype in [torch.qint32, torch.int32]: + initial_quant_min, initial_quant_max = 0, 2**32 - 1 + else: + initial_quant_min, initial_quant_max = 0, 255 + # The following assignment of self.qmin and self.qmax to the local variables and the if check refine the + # attribute from Optional valid integers for use, based on TorchScript's requirements. 
+ custom_quant_min, custom_quant_max = quant_min, quant_max + if custom_quant_min is not None and custom_quant_max is not None: + initial_quant_min, initial_quant_max = ( + custom_quant_min, + custom_quant_max, + ) + + qrange_len = initial_quant_max - initial_quant_min + 1 + if dtype in [torch.qint8, torch.int8]: + if not (0 < qrange_len <= 256): + raise AssertionError( + "quantization range should be positive and not exceed the maximum bit range (=256)." + ) + elif dtype in [torch.qint32, torch.int32]: + if not (0 < qrange_len <= 2**32): + raise AssertionError( + "quantization range should be positive and not exceed the maximum bit range (=4294967296)." + ) + if reduce_range: + quant_min, quant_max = quant_min // 2, quant_max // 2 + else: + # Fallback onto default 8-bit qmin and qmax calculation if dynamic range is not used. + if dtype in [torch.qint8, torch.int8]: + if reduce_range: + quant_min, quant_max = -64, 63 + else: + quant_min, quant_max = -128, 127 + elif dtype in [torch.quint8, torch.uint8]: + if reduce_range: + quant_min, quant_max = 0, 127 + else: + quant_min, quant_max = 0, 255 + elif dtype in [torch.qint32, torch.int32]: + quant_min, quant_max = -1 * (2**31), (2**31) - 1 + elif dtype == torch.uint16: + quant_min, quant_max = 0, 2**16 - 1 + elif dtype == torch.int16: + quant_min, quant_max = -(2**15), 2**15 - 1 + else: + quant_min, quant_max = 0, 15 + return quant_min, quant_max + + +def _parent_name(target): + """ + Turn 'foo.bar' into ['foo', 'bar'] + """ + r = target.rsplit(".", 1) + if len(r) == 1: + return "", r[0] + else: + return r[0], r[1] + + +def has_no_children_ignoring_parametrizations(module): + """ + Checks if module._modules is empty or + if module is a parametrization, checks that module._modules only has + the 'parametrizations' module + """ + if len(module._modules) == 0: + return True + elif is_parametrized(module): + return len(module._modules) == 1 and "parametrizations" in module._modules + else: + return False + + +def 
_get_path_of_module( + root: torch.nn.Module, submodule: torch.nn.Module +) -> str | None: + """Get the path (fully qualified name) of a submodule + + Example:: + + >> class M(torch.nn.Module): + def __init__(self) -> None: + self.linear = torch.nn.Linear(5, 5) + def forward(self, x): + return self.linear(x) + + >> m = M() + >> l = m.linear + >> _get_path_of_module(m, l) + "linear" + """ + for n, p in root.named_modules(): + if submodule is p: + return n + return None + + +def _get_signature_locals(f: Callable, loc: dict[str, Any]) -> dict[str, Any]: + """Get local keyword arguments + + Example:: + + >> def f(self, a, b=9): + pass + >> loc = {"a": 6, "c": 7} + >> _get_signature_locals(f, loc) + {"a": 6} + """ + return {k: v for k, v in loc.items() if k in signature(f).parameters} + + +def _get_default_kwargs(f: Callable) -> "OrderedDict[str, Any]": + """Get all default keyword arguments from function signature + + Example:: + + >> def f(self, a, b=9): + pass + >> _get_default_kwargs(f) + {"b": 9} + """ + kwargs = {} + for name, param in signature(f).parameters.items(): + if param.default is not param.empty: + kwargs[name] = param.default + elif param.kind is param.VAR_POSITIONAL: + kwargs[name] = () + elif param.kind is param.VAR_KEYWORD: + kwargs[name] = {} + return OrderedDict(kwargs) + + +def _normalize_kwargs(func: Callable, loc: dict[str, Any]) -> "OrderedDict[str, Any]": + """Given a function and local function arguments, normalize the keyword + arguments by filling in default arguments from function signature + + Example:: + + >> def f(self, key1=3, key2=3): + pass + >> loc = {"key2": 6} + >> _normalize_kwargs(f, loc) + {"key1": 3, "key2": 6} + """ + default_kwargs = _get_default_kwargs(func) + local_kwargs = _get_signature_locals(func, loc) + normalized_kwargs = default_kwargs.copy() + for attr, val in local_kwargs.items(): + if attr in normalized_kwargs: + # override the default keyword arguments + normalized_kwargs[attr] = val + return normalized_kwargs + 
def validate_qmin_qmax(quant_min: int, quant_max: int) -> None:
    r"""Validate a user-specified quantization range.

    The range is accepted only when it contains zero
    (``quant_min <= 0 <= quant_max``) and ``quant_min`` is strictly less than
    ``quant_max``. No dtype-specific bound is checked here.

    To accommodate lower-bit quantization with respect to the existing torch.qint8 and
    torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing
    in a tuple of initial qmin and qmax values. One use case is these customized qmin and qmax
    values are used to calculate static estimates of the scale and zero point for aggressive lower-bit
    fake quantization. These estimates are compared against parameters learned through backpropagation.
    The related literature for scale and zero point via backpropagation is as follows:

    Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS
    Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf

    Args:
        quant_min: Lower bound of the user-specified quantization range.
        quant_max: Upper bound of the user-specified quantization range.

    Raises:
        AssertionError: If the range does not include 0, or if ``quant_min``
            is not strictly less than ``quant_max``.
    """
    # Zero has to lie inside [quant_min, quant_max].
    if not (quant_min <= 0 <= quant_max):
        raise AssertionError("User-specified quantization range must include 0.")
    # Once the check above has passed, this only fires for the degenerate
    # range quant_min == quant_max == 0.
    if quant_min >= quant_max:
        raise AssertionError(
            "qmin must be strictly less than qmax for user-specified quantization range."
        )


# Functionally equivalent to '_calculate_qparams' in observer.py. Observers must be torchscriptable however and qscheme
# as far as I can tell is not allowed to be passed as a parameter in torchscript functions. This makes refactoring observer
# to use this utility a massive pain and very gross. For now I'm opting just to duplicate as this code seems unlikely to change
# (last update over 1 year ago) and when torchscript is fully deprecated we can refactor.
TODO(jakeszwe, jerryzh168) +def determine_qparams( + min_val: torch.Tensor, + max_val: torch.Tensor, + quant_min: int, + quant_max: int, + dtype: torch.dtype, + eps: torch.Tensor, + has_customized_qrange: bool, + qscheme: torch.qscheme = torch.per_tensor_affine, +) -> tuple[torch.Tensor, torch.Tensor]: + r"""Calculates the quantization parameters, given min and max + value tensors. Works for both per tensor and per channel cases + + Args: + min_val: Minimum values per channel + max_val: Maximum values per channel + + Returns: + scales: Scales tensor of shape (#channels,) + zero_points: Zero points tensor of shape (#channels,) + """ + if not check_min_max_valid(min_val, max_val): + return torch.tensor([1.0], device=min_val.device.type), torch.tensor( + [0], device=min_val.device.type + ) + + min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) + max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) + + device = min_val_neg.device + scale = torch.ones(min_val_neg.size(), dtype=torch.double, device=device) + zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device) + eps = eps.to(device) + + if qscheme == torch.per_tensor_symmetric or qscheme == torch.per_channel_symmetric: + max_val_pos = torch.max(-min_val_neg, max_val_pos) + scale = max_val_pos / (float(quant_max - quant_min) / 2) + scale = torch.max(scale, eps) + if dtype in [torch.uint8, torch.quint8]: + if has_customized_qrange: + # When customized quantization range is used, down-rounded midpoint of the range is chosen. 
+ zero_point = zero_point.new_full( + zero_point.size(), (quant_min + quant_max) // 2 + ) + else: + zero_point = zero_point.new_full(zero_point.size(), 128) + elif qscheme == torch.per_channel_affine_float_qparams: + scale = (max_val - min_val) / float(quant_max - quant_min) + scale = torch.where(scale > eps, scale, torch.ones_like(scale)) + # We use the quantize function + # xq = Round(Xf * inv_scale + zero_point), + # setting zero_point to (-1 * min *inv_scale) we get + # Xq = Round((Xf - min) * inv_scale) + zero_point = -1 * min_val / scale + else: + scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min) + scale = torch.max(scale, eps) + zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int) + zero_point = torch.clamp(zero_point, quant_min, quant_max) + + # For scalar values, cast them to Tensors of size 1 to keep the shape + # consistent with default values in FakeQuantize. + if len(scale.shape) == 0: + # TODO: switch to scale.item() after adding JIT support + scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device) + if len(zero_point.shape) == 0: + # TODO: switch to zero_point.item() after adding JIT support + zero_point = torch.tensor( + [int(zero_point)], dtype=zero_point.dtype, device=device + ) + if qscheme == torch.per_channel_affine_float_qparams: + zero_point = torch.tensor( + [float(zero_point)], dtype=zero_point.dtype, device=device + ) + + return scale.to(torch.double), zero_point.to(torch.int64) + + +def _get_num_pos_args(f: Callable) -> int: + """Get number of positional args for a function + + Example:: + + >> def f(self, key1=3, key2=3): + pass + >> _get_num_pos_args(f) + 3 + """ + return len(getfullargspec(f).args) + + +def get_fqn_to_example_inputs( + model: torch.nn.Module, example_inputs: tuple[Any, ...] +) -> dict[str, tuple[Any, ...]]: + """Given a model and its example inputs, return a dictionary from + fully qualified name of submodules to example_inputs for that submodule, + e.g. 
{"linear1": (tensor1,), "linear2": (tensor2,), "sub": (tensor3,), + "sub.linear1": (tensor4,), ...} + + Used to make quantizing submodules easier now that FX Graph Mode Quantization requires + example inputs. + + Also works for keyword arguments with default values, we would flatten keyword + arguments as positional arguments and fill in the missing keyword args with default + values, e.g. if we have a forward function: + def forward(self, x, key1=3, key2=3): + ... + + and we call it with self.submodule(x, key2=6) + we'll get example_inputs: (x, 3, 6) + + user can also override `key1` with positional arguments as well: + for self.submodule(x, 5, key2=6) + we'll get: (x, 5, 6) + + variable positional arguments and variable positional keyword arguments in forward + function are not supported currently, so please make sure no submodules is using + them. + """ + root = model + fqn_to_example_inputs = {} + + def _patched_module_call(self, *args, **kwargs): + submodule_example_inputs = list(args).copy() + normalized_kwargs = _normalize_kwargs(self.forward, kwargs) + # minus 1 to skipping counting `self` + num_args = _get_num_pos_args(self.forward) - 1 + num_to_pop = num_args - len(submodule_example_inputs) + while num_to_pop and normalized_kwargs: + normalized_kwargs.popitem(last=False) + num_to_pop -= 1 + submodule_example_inputs.extend(normalized_kwargs.values()) + submodule_example_inputs_tuple = tuple(submodule_example_inputs) + fqn = _get_path_of_module(root, self) + if fqn is not None: + fqn_to_example_inputs[fqn] = submodule_example_inputs_tuple + return orig_module_call(self, *args, **kwargs) + + orig_module_call = torch.nn.Module.__call__ + torch.nn.Module.__call__ = _patched_module_call # type: ignore[method-assign] + try: + model(*example_inputs) + finally: + # restore the module call even if there is an exception + torch.nn.Module.__call__ = orig_module_call # type: ignore[method-assign] + return fqn_to_example_inputs + + +def 
_assert_and_get_unique_device(module: torch.nn.Module) -> Any: + """ + Returns the unique device for a module, or None if no device is found. + Throws an error if multiple devices are detected. + """ + devices = {p.device for p in module.parameters()} | { + p.device for p in module.buffers() + } + """ + As a temp workaround for AIMP HHC publish we added CPU check.remove it later. T163614564 + """ + if {torch.device("cpu"), torch.device("meta")} == devices: + warnings.warn( + "Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'.", + stacklevel=2, + ) + devices = {torch.device("cpu")} + "" + if len(devices) > 1: + raise AssertionError( + "prepare only works with cpu or single-device CUDA modules, " + f"but got devices {devices}" + ) + device = next(iter(devices)) if len(devices) > 0 else None + return device + + +DEPRECATION_WARNING = ( + "torch.ao.quantization is deprecated and will be removed in 2.10. \n" + "For migrations of users: \n" + "1. Eager mode quantization (torch.ao.quantization.quantize, " + "torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode " + "quantize_ API instead \n" + "2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx," + "torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization " + "API instead (prepare_pt2e, convert_pt2e) \n" + "3. 
pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) \n" + "see https://github.com/pytorch/ao/issues/2259 for more details" +) + + +__all__ = [ + "NodePattern", + "Pattern", + "MatchAllNode", + "check_node", + "get_combined_dict", + "is_per_tensor", + "is_per_channel", + "getattr_from_fqn", + "get_qparam_dict", + "get_swapped_custom_module_class", + "activation_dtype", + "weight_dtype", + "activation_is_statically_quantized", + "activation_is_dynamically_quantized", + "activation_is_int8_quantized", + "activation_is_int32_quantized", + "weight_is_quantized", + "weight_is_statically_quantized", + "op_is_int8_dynamically_quantized", + "get_qconfig_dtypes", + "get_quant_type", + "check_min_max_valid", + "calculate_qmin_qmax", + "has_no_children_ignoring_parametrizations", + "get_fqn_to_example_inputs", + "to_underlying_dtype", + "determine_qparams", + "validate_qmin_qmax", + "DEPRECATION_WARNING", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_gpu_trace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_gpu_trace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c55e7477bfeaa358e8cea60b9f8f83766b49e5f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_gpu_trace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5e4684a41489bf2d116e5da37807c6971bb2541 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/comm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/comm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41f030eb7229531435763f8ddd5ed22fcd7a646d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/comm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/jiterator.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/jiterator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e4d015304837ed7e7b61676c199a984198b56dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/jiterator.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/nccl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/nccl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e20098b30541791318208544dfbce7b73810198 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/nccl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/random.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/random.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5580cf9d35c439cc51b6d3cc306bd8dd788d5109 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/random.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/sparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/sparse.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9563c01a46c519d4f5e02d832bc3c350454e785b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/sparse.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/streams.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/streams.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6667431d7ff0ee39a46a93f563859517b59adba6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/streams.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/tunable.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/tunable.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..331e0c59e25295447a08c5e0a72e0b978620ee47 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/tunable.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..88ef0b5acac5e5bdeb034169052bcf5aa7456e33 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/__init__.py @@ -0,0 +1,13 @@ +# pyrefly: ignore [deprecated] +from .autocast_mode import autocast, custom_bwd, custom_fwd +from .common import amp_definitely_not_available +from 
.grad_scaler import GradScaler + + +__all__ = [ + "amp_definitely_not_available", + "autocast", + "custom_bwd", + "custom_fwd", + "GradScaler", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/autocast_mode.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/autocast_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..e6b63c708d3f2ddfc162a4431e114a2bcf47e9eb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/autocast_mode.py @@ -0,0 +1,110 @@ +# mypy: allow-untyped-defs +import functools +import sys +from typing import Any +from typing_extensions import deprecated + +import torch + + +__all__ = ["autocast", "custom_fwd", "custom_bwd"] + + +@deprecated( + "`torch.cuda.amp.autocast(args...)` is deprecated. " + "Please use `torch.amp.autocast('cuda', args...)` instead.", + category=FutureWarning, +) +class autocast(torch.amp.autocast_mode.autocast): + r"""See :class:`torch.autocast`. + + ``torch.cuda.amp.autocast(args...)`` is deprecated. Please use ``torch.amp.autocast("cuda", args...)`` instead. + """ + + # TODO: remove this conditional once we stop supporting Python < 3.13 + # Prior to Python 3.13, inspect.signature could not retrieve the correct + # signature information for classes decorated with @deprecated (unless + # the __new__ static method was explicitly defined); + # + # However, this issue has been fixed in Python 3.13 and later versions. 
+ if sys.version_info < (3, 13): + + def __new__( + cls, + enabled: bool = True, + dtype: torch.dtype = torch.float16, + cache_enabled: bool = True, + ): + return super().__new__(cls) + + def __init_subclass__(cls): + pass + + def __init__( + self, + enabled: bool = True, + dtype: torch.dtype = torch.float16, + cache_enabled: bool = True, + ): + if torch._jit_internal.is_scripting(): + self._enabled = enabled + self.device = "cuda" + self.fast_dtype = dtype + return + super().__init__( + "cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled + ) + + def __enter__(self): + if torch._jit_internal.is_scripting(): + return self + return super().__enter__() + + # TODO: discuss a unified TorchScript-friendly API for autocast + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[override] + if torch._jit_internal.is_scripting(): + return + return super().__exit__(exc_type, exc_val, exc_tb) + + def __call__(self, func): + if torch._jit_internal.is_scripting(): + return func + return super().__call__(func) + + +# Preserved only for BC reasons +@deprecated( + "`torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. " + "Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.", + category=FutureWarning, +) +def _cast(value, dtype): + return torch.amp.autocast_mode._cast(value, "cuda", dtype) + + +@deprecated( + "`torch.cuda.amp.custom_fwd(args...)` is deprecated. " + "Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.", + category=FutureWarning, +) +def custom_fwd(fwd=None, *, cast_inputs=None): + """ + ``torch.cuda.amp.custom_fwd(args...)`` is deprecated. Please use + ``torch.amp.custom_fwd(args..., device_type='cuda')`` instead. + """ + return functools.partial(torch.amp.custom_fwd, device_type="cuda")( + fwd=fwd, cast_inputs=cast_inputs + ) + + +@deprecated( + "`torch.cuda.amp.custom_bwd(args...)` is deprecated. 
" + "Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.", + category=FutureWarning, +) +def custom_bwd(bwd): + """ + ``torch.cuda.amp.custom_bwd(args...)`` is deprecated. Please use + ``torch.amp.custom_bwd(args..., device_type='cuda')`` instead. + """ + return functools.partial(torch.amp.custom_bwd, device_type="cuda")(bwd) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/common.py new file mode 100644 index 0000000000000000000000000000000000000000..915a9b4f4a9ca6c147abefd7c8ab1891ee5a8179 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/common.py @@ -0,0 +1,11 @@ +# mypy: allow-untyped-defs +from importlib.util import find_spec + +import torch + + +__all__ = ["amp_definitely_not_available"] + + +def amp_definitely_not_available(): + return not (torch.cuda.is_available() or find_spec("torch_xla")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/grad_scaler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/grad_scaler.py new file mode 100644 index 0000000000000000000000000000000000000000..62e2020073c8ed99f7295edd1aaea4c54d815f63 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/grad_scaler.py @@ -0,0 +1,38 @@ +from typing_extensions import deprecated + +import torch + +# We need to keep this unused import for BC reasons +from torch.amp.grad_scaler import OptState # noqa: F401 + + +__all__ = ["GradScaler"] + + +class GradScaler(torch.amp.GradScaler): + r""" + See :class:`torch.amp.GradScaler`. + ``torch.cuda.amp.GradScaler(args...)`` is deprecated. Please use ``torch.amp.GradScaler("cuda", args...)`` instead. + """ + + @deprecated( + "`torch.cuda.amp.GradScaler(args...)` is deprecated. 
" + "Please use `torch.amp.GradScaler('cuda', args...)` instead.", + category=FutureWarning, + ) + def __init__( + self, + init_scale: float = 2.0**16, + growth_factor: float = 2.0, + backoff_factor: float = 0.5, + growth_interval: int = 2000, + enabled: bool = True, + ) -> None: + super().__init__( + "cuda", + init_scale=init_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + enabled=enabled, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/futures/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/futures/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4cc8b31ffb08fe3da29174f3b6c6f8d8fea3cb3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/futures/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7649a5e1241acc8adf4cdf15f39b504b0787a4f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h @@ -0,0 +1,218 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +struct IValue; +using Stack = std::vector; + +class OperatorHandle; +class KernelFunction; + +// This kernel implements the behavior of falling through to the next available +// registered dispatch key. The implementation of this function is FAST; it is +// no overhead to fallthrough to the next key. See cpp file for some more +// implementation notes; notably, this does NOT actually go through the +// boxing/unboxing codepath. 
+TORCH_API void fallthrough_kernel( + OperatorKernel* /*unused*/, + const OperatorHandle& /*unused*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); + +// Note [Ambiguity in AutogradOther kernel] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This error-reporting kernel is registered to the AutogradOther entry in the +// dispatch table when there is both a CompositeImplicitAutograd kernel and a +// backend kernel for ANY backend that maps to AutogradOther. To see why +// this is necessary in the AutogradOther case, it's helpful to first see +// why everything works out fine for a backend that has a reserved Autograd +// entry (see rule 2.2 in [Note] DispatchTable computation): +// +// CPU AutogradCPU +// reg? registers with... +// ------------------------------------------------- +// y Autograd registration takes precedence +// over CompositeImplicitAutograd. +// This is good, because the CPU specific backend +// implementation is more specialized and typically better; +// if we used the composite, we would bypass it. +// (NB: the Autograd key is guaranteed to exist because +// the autograd codegen requires it!) +// +// n CompositeImplicitAutograd takes precedence. +// This is also good, because the Autograd +// registration (if it exists) would try to redispatch +// to the (non-existent) CPU implementation; by +// using the composite, we ensure the operator +// actually works. +// +// As you can see, when we have a specific Autograd key (AutogradCPU), we can +// decide whether or not to use the CompositeImplicitAutograd kernel or the +// Autograd kernel based on whether or not the backend kernel exists. 
+// +// However, for AutogradOther (which is the catchall autograd kernel for +// everything that doesn't have a specific Autograd key), we can't do this +// trick because there isn't any unique backend to peek at to disambiguate; +// if there are some backends that have implementations they prefer Autograd, +// but unimplemented backends would prefer CompositeImplicitAutograd. Rather +// than arbitrarily pick one or the other, we just register a kernel that raises +// an error and let the user decide how to proceed. +TORCH_API void ambiguous_autogradother_kernel( + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); + +// Note [named_not_supported_kernel] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This kernel implements reporting an error message saying that named tensor is +// not supported. This kernel doesn't rely on the Stack, and so it is special +// cased in the dispatcher to be triggered before we attempt boxing (so we can +// give a good error message in cases when boxing is not supported). When +// boxing is universally supported this can be removed. +[[noreturn]] TORCH_API void named_not_supported_kernel( + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); + +/** + * BoxedKernel is similar to a std::function storing a boxed kernel. + */ +class TORCH_API BoxedKernel final { + public: + // This is how boxed kernels are actually stored + // + // Note [Plumbing Keys Through The Dispatcher] + // Benchmarks have shown that it is expensive for the dispatcher to read from + // thread-local storage (TLS) upon every dispatch call into order to compute + // which kernel to dispatch to. + // + // To mitigate this, we've updated the calling convention inside the + // dispatcher to expect every kernel that it stores to have a first argument + // of type DispatchKeySet. 
+ // + // What are the invariants of the DispatchKeySet when it gets passed to a + // kernel? + // - All keys to the left of the current dispatch key have been masked out. + // (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the + // highest bit to be DispatchKey::Tracer) + // - All other keys that dispatcher normally would have computed through TLS + + // global state + op arguments + // are still in the set. + // + // Kernels can then opt into using this keyset to save the dispatcher from + // doing repeated work during redispatches: recalculating the highest-priority + // dispatch key, which involves reading from TLS. Instead, the kernels that + // opt in will calculate an updated DispatchKeySet directly from the old one, + // and pass the updated set directly into the dispatcher upon redispatching. + // + // This is an opt-in mechanism: Kernels can automatically opt in by setting + // the first argument in their signature to be of type DispatchKeySet. See the + // kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for + // examples. + // + // The mechanism for optionally passing that DispatchKeySet into the kernel + // lives in make_boxed_from_unboxed_functor.h. See Note [Plumbing Keys Through + // The Dispatcher 2] for details. + using InternalBoxedKernelFunction = + void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*); + // This is the public API for how boxed kernels are defined + using BoxedKernelFunction = void(const OperatorHandle&, Stack*); + using BoxedKernelFunction_withDispatchKeys = + void(const OperatorHandle&, DispatchKeySet, Stack*); + + BoxedKernel(); + + // Fast path for dispatch to allow not touching the boxed kernel in + // the common case where unboxed is available. + bool isValid() const; + bool isFallthrough() const; + + /** + * Call the function with boxed arguments. 
+ */ + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + /** + * Create a KernelFunction from a boxed function. + * + * Example: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > BoxedFunction func = BoxedKernel::makeFromFunction<&boxed_func>(); + */ + template + static BoxedKernel makeFromFunction(); + + /** + * TODO: This will only be useful if we write a backend fallback that plumbs + * dispatch keys (currently there are none) See Note [Plumbing Keys Through + * The Dispatcher] for details. + */ + template + static BoxedKernel makeFromFunction(); + + /** + * Create a KernelFunction from a boxed functor. + * + * Example: + * + * > class MyFunctor final : public c10::OperatorKernel { + * > public: + * > void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...} + * > }; + * > BoxedKernel func = + * BoxedKernel::makeFromFunctor(std::make_unique()); + */ + template + static BoxedKernel makeFromFunctor( + std::unique_ptr kernelFunctor); + + static BoxedKernel makeFallthrough(); + static BoxedKernel makeAmbiguousAutogradOther(); + static BoxedKernel makeNamedNotSupported(); + + private: + friend class KernelFunction; + + template + static void make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet /*unused*/, + Stack* stack); + + template + static void make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet /*ks*/, + Stack* stack); + + explicit BoxedKernel( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func); + + OperatorKernel* getFunctor() const; + InternalBoxedKernelFunction* getFnPtr() const; + + c10::intrusive_ptr functor_; + InternalBoxedKernelFunction* boxed_kernel_func_; +}; + +} // namespace c10 + +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..69c8b2cf65d6f0256193ee3899708ad18c7d6768 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace c10 { + +inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} + +inline BoxedKernel::BoxedKernel( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func) + : functor_(std::move(functor)), boxed_kernel_func_(boxed_kernel_func) {} + +template +inline void BoxedKernel::make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet /*unused*/, + Stack* stack) { + // Note that we're dropping the DispatchKeySet argument. + // See Note [Plumbing Keys Through The Dispatcher 2] for details. + func(opHandle, stack); +} + +template +inline void BoxedKernel::make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet ks, + Stack* stack) { + // See Note [Plumbing Keys Through The Dispatcher 2] for details. 
+ func(opHandle, ks, stack); +} + +inline bool BoxedKernel::isValid() const { + return boxed_kernel_func_ != nullptr; +} + +inline bool BoxedKernel::isFallthrough() const { + return boxed_kernel_func_ == &fallthrough_kernel; +} + +inline void BoxedKernel::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + boxed_kernel_func_ != nullptr, + "Tried to call BoxedKernel::callBoxed() on an uninitialized BoxedKernel."); + (*boxed_kernel_func_)(functor_.get(), opHandle, dispatchKeySet, stack); +} + +template +inline BoxedKernel BoxedKernel::makeFromFunction() { + return BoxedKernel( + nullptr, // no functor_ object + &make_boxed_function); +} + +template +inline BoxedKernel BoxedKernel::makeFromFunction() { + return BoxedKernel( + nullptr, // no functor_ object + &make_boxed_function); +} + +inline BoxedKernel BoxedKernel::makeFallthrough() { + return BoxedKernel( + nullptr, // no functor_ object + &fallthrough_kernel); +} + +inline BoxedKernel BoxedKernel::makeAmbiguousAutogradOther() { + return BoxedKernel( + nullptr, // no functor_ object + &ambiguous_autogradother_kernel); +} + +inline BoxedKernel BoxedKernel::makeNamedNotSupported() { + return BoxedKernel( + nullptr, // no functor_ object + &named_not_supported_kernel); +} + +template +inline BoxedKernel BoxedKernel::makeFromFunctor( + std::unique_ptr kernelFunctor) { + static_assert( + std::is_base_of_v, + "Tried to call BoxedKernel::makeFromFunctor, but the functor doesn't inherit from c10::OperatorKernel. 
Please have the functor inherit from it."); + return BoxedKernel( + std::move(kernelFunctor), + [](OperatorKernel* kernel, + const OperatorHandle& op, + DispatchKeySet ks, + Stack* stack) { + (*static_cast(kernel))(op, ks, stack); + }); +} + +inline OperatorKernel* BoxedKernel::getFunctor() const { + return functor_.get(); +} +inline BoxedKernel::InternalBoxedKernelFunction* BoxedKernel::getFnPtr() const { + return boxed_kernel_func_; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..fa53454d22edd1caa9d146b6dd3a5647a0b7dfee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h @@ -0,0 +1,346 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack + // to the c10 namespace. 
+ +class OperatorHandle; +struct OperatorKernel; +class KernelFunction; + +class KernelToken; +class SafeKernelFunction; + +template +using has_symint = std::disjunction< + std::is_same, + std::is_same, + std::is_same, + std::is_same, T>>; + +template +struct remove_symint { + using type = T; +}; + +template <> +struct remove_symint { + using type = int64_t; +}; + +template <> +struct remove_symint { + using type = OptionalIntArrayRef; +}; + +template <> +struct remove_symint { + using type = c10::IntArrayRef; +}; + +template <> +struct remove_symint> { + using type = std::optional; +}; + +template +struct maybe_keep_symint final {}; + +template +struct maybe_keep_symint { + using type = T; +}; + +template +struct maybe_keep_symint { + using type = typename remove_symint::type; +}; + +template +using fn_has_symint = typename guts::typelist::true_for_any_type< + has_symint, + typename guts::infer_function_traits::type::parameter_types>; + +template +struct fn_remove_symint; + +template +struct fn_remove_symint { + using type = Ret(typename remove_symint::type...); +}; + +/** + * KernelFunction is similar to std::function but stores a kernel function. + * You can create a KernelFunction from a boxed or unboxed + * function/functor/lambda and call it in a boxed or unboxed way. If the way it + * was created doesn't match the way it was called, it will do boxing or + * unboxing as necessary. 
+ */ +class TORCH_API KernelFunction final { + public: + using InternalBoxedKernelFunction = BoxedKernel::InternalBoxedKernelFunction; + using BoxedKernelFunction = BoxedKernel::BoxedKernelFunction; + using BoxedKernelFunction_withDispatchKeys = + BoxedKernel::BoxedKernelFunction_withDispatchKeys; + + KernelFunction(); + ~KernelFunction(); + + KernelFunction(const KernelFunction& other); + KernelFunction& operator=(const KernelFunction& other); + + KernelFunction(KernelFunction&&) noexcept = default; + + // Fast path for dispatch to allow not touching the boxed kernel in + // the common case where unboxed is available. + bool isValidUnboxed() const; + bool isValidSymUnboxed() const; + bool isValid() const; + bool isFallthrough() const; + + /** + * Call the function in a boxed way. + * If the kernel function was created with an unboxed function, + * this will call an unboxing wrapper which then calls into that + * unboxed function. + * + * Example: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > KernelFunction func = KernelFunction::makeFromBoxedFunction(&boxed_func); + * > Tensor result = func.callBoxed(stack); + * + * Or, with an unboxed implementation: + * + * > KernelFunction func = KernelFunction::makeFromUnboxedLambda( + * > [] (Tensor a, bool b) -> Tensor {...}); + * > Tensor result = func.callBoxed(stack); + */ + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + /** + * Call the function in an unboxed way. + * If the kernel function was created with a boxed function, + * this will box all inputs and then call into that boxed function. + * + * Note that this doesn't work for all types yet. 
+ * + * Example: + * + * > KernelFunction func = KernelFunction::makeFromUnboxedLambda( + * > [] (Tensor a, bool b) -> Tensor {...}); + * > Tensor result = func.call(tensor1, true); + * + * Or, with a boxed implementation: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > KernelFunction func = KernelFunction::makeFromBoxedFunction(&boxed_func); + * > Tensor result = func.call(tensor1, true); + */ + template + Return call( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) const; + + /** + * Create a KernelFunction from a BoxedKernel. + */ + static KernelFunction makeFromBoxedKernel(BoxedKernel boxed_fn); + + /** + * Create a KernelFunction from a boxed function. + * + * Example: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > KernelFunction func = + * KernelFunction::makeFromBoxedFunction<&boxed_func>(); + */ + template + static KernelFunction makeFromBoxedFunction(); + + /** + * TODO: This will only be useful if we write a backend fallback that plumbs + * dispatch keys (currently there are none) See Note [Plumbing Keys Through + * The Dispatcher] for details. + */ + template + static KernelFunction makeFromBoxedFunction(); + + /** + * Create a KernelFunction from an unboxed functor. + * + * Example: + * + * > class MyFunctor final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > KernelFunction func = + * KernelFunction::makeFromUnboxedFunctor(std::make_unique()); + */ + template + static KernelFunction makeFromUnboxedFunctor( + std::unique_ptr kernelFunctor); + + /** + * Create a KernelFunction from a boxed functor. 
+ * + * Example: + * + * > class MyFunctor final : public c10::OperatorKernel { + * > public: + * > void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...} + * > }; + * > KernelFunction func = + * KernelFunction::makeFromBoxedFunctor(std::make_unique()); + */ + template + static KernelFunction makeFromBoxedFunctor( + std::unique_ptr kernelFunctor); + + /** + * Create a KernelFunction from an unboxed function. + * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction + * because knowing the function pointer as a template argument (i.e. at + * compile time) allows the compiler to inline the function into its + * unboxing wrapper and yields better performance when calling the function. + * + * Example: + * + * > Tensor unboxed_func(Tensor a, Tensor b) {...} + * > KernelFunction func = + * KernelFunction::makeFromUnboxedFunction(); + */ + template + static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/); + + /** + * Create a KernelFunction from an unboxed function. + * KernelFunction::makeFromUnboxedFunction is usually a better choice than + * this if you know the function pointer at compile time, see doc comment + * there for an explanation. + * + * Example: + * + * > Tensor unboxed_func(Tensor a, Tensor b) {...} + * > KernelFunction func = + * KernelFunction::makeFromUnboxedRuntimeFunction(&unboxed_func); + */ + template + static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func); + + static KernelFunction makeFallthrough(); + static KernelFunction makeAmbiguousAutogradOther(); + static KernelFunction makeNamedNotSupported(); + + /** + * Create a KernelFunction from an unboxed lambda. 
+ * + * Example: + * + * > KernelFunction func = KernelFunction::makeFromUnboxedLambda( + * > [] (Tensor a, bool b) -> Tensor {...}); + */ + template + static std::enable_if_t< + guts::is_stateless_lambda>::value, + KernelFunction> + makeFromUnboxedLambda(Lambda&& lambda); + template + static std::enable_if_t< + !guts::is_stateless_lambda>::value, + KernelFunction> + makeFromUnboxedLambda(Lambda&& lambda); + + std::string dumpState() const; + // For testing internal invariants only + bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const; + + // Register a token to be invalidated when this KernelFunction is destroyed + void registerToken(std::weak_ptr token) const; + + private: + explicit KernelFunction( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func); + explicit KernelFunction( + BoxedKernel boxed_fn, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func); + + BoxedKernel boxed_kernel_func_; + void* unboxed_kernel_func_; + void* sym_unboxed_kernel_func_; + // List of tokens that need to be invalidated when this KernelFunction is + // destroyed (lazy allocation to save memory when empty) + mutable std::unique_ptr>> tokens_; +}; + +// Token held by SafeKernelFunction that gets invalidated when KernelFunction is +// destroyed +class KernelToken { + public: + bool isValid() const; + void invalidate(); + + private: + std::atomic invalid_{false}; +}; + +class SafeKernelFunction { + public: + SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle); + + // Safe callBoxed - checks token validity first + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + // Get debug information + const std::string& debug() const { + return debug_; + } + + // Get the OpHandle that lives on this SafeKernelFunction + const OperatorHandle& opHandle() const { + return *opHandle_; + } 
+ + private: + KernelFunction kernel_; + std::shared_ptr token_; + std::string debug_; + std::shared_ptr opHandle_; +}; + +} // namespace c10 + +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1d190e1809da3abeeff6b5ded93cf1694fef94f6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h @@ -0,0 +1,395 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include + +#include +#include + +namespace c10 { + +namespace detail { +template +std::enable_if_t< + !std::is_array_v && !std::is_array_v && + std::is_base_of_v, + std::unique_ptr> +make_unique_base(Args&&... 
args) { + return std::make_unique(std::forward(args)...); +} +} // namespace detail + +inline KernelFunction::KernelFunction() + : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} + +inline KernelFunction::~KernelFunction() { + if (tokens_) { + for (auto& weak_token : *tokens_) { + if (auto token = weak_token.lock()) { + token->invalidate(); + } + } + } +} + +inline KernelFunction::KernelFunction(const KernelFunction& other) + : boxed_kernel_func_(other.boxed_kernel_func_), + unboxed_kernel_func_(other.unboxed_kernel_func_), + sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) { + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed +} + +inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) { + if (this != &other) { + boxed_kernel_func_ = other.boxed_kernel_func_; + unboxed_kernel_func_ = other.unboxed_kernel_func_; + sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_; + + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed + } + return *this; +} + +inline KernelFunction::KernelFunction( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func = nullptr) + : boxed_kernel_func_(std::move(functor), boxed_kernel_func), + unboxed_kernel_func_(unboxed_kernel_func), + sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {} + +inline KernelFunction::KernelFunction( + BoxedKernel boxed_fn, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func = nullptr) + : boxed_kernel_func_(std::move(boxed_fn)), + unboxed_kernel_func_(unboxed_kernel_func), + sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {} + +inline bool KernelFunction::isValidUnboxed() const { + return unboxed_kernel_func_ != nullptr; +} + +inline bool KernelFunction::isValidSymUnboxed() const { + return 
sym_unboxed_kernel_func_ != nullptr; +} + +inline bool KernelFunction::isValid() const { + return boxed_kernel_func_.isValid(); +} + +inline bool KernelFunction::isFallthrough() const { + return boxed_kernel_func_.isFallthrough(); +} + +inline void KernelFunction::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + boxed_kernel_func_.callBoxed(opHandle, dispatchKeySet, stack); +} + +template +inline Return callUnboxedKernelFunction( + void* unboxed_kernel_func, + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + Args&&... args) { + using ActualSignature = Return(OperatorKernel*, DispatchKeySet, Args...); + ActualSignature* func = + reinterpret_cast(unboxed_kernel_func); + return (*func)(functor, dispatchKeySet, std::forward(args)...); +} + +// This template requires you to explicitly specify the argument you want to +// forward; it doesn't work if you try to deduce it +// NB: keep this in sync with cloneWithRealTypes in function_schema.cpp + +template +inline typename remove_symint::type unpackSymInt(T x) { + return x; +} + +template <> +inline remove_symint::type unpackSymInt(c10::SymInt x) { + return x.guard_int(__FILE__, __LINE__); +} + +template <> +inline remove_symint::type unpackSymInt( + c10::SymIntArrayRef x) { + return C10_AS_INTARRAYREF_SLOW(x); +} + +template <> +inline remove_symint>::type unpackSymInt( + std::optional x) { + return x.has_value() ? std::make_optional(x->guard_int(__FILE__, __LINE__)) + : std::nullopt; +} + +template <> +inline remove_symint::type unpackSymInt( + at::OptionalSymIntArrayRef x) { + return x.has_value() ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) + : std::nullopt; +} + +template +C10_ALWAYS_INLINE Return KernelFunction::call( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) const { + // note: Args above is intentionally not Args&&. 
We don't want perfect + // forwarding, which would require Args to be deduced, but instead we + // want callers to explicitly specify the Args. + + if constexpr (std::disjunction_v...>) { + if (sym_unboxed_kernel_func_ != nullptr) { + auto* functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction( + sym_unboxed_kernel_func_, + functor, + dispatchKeySet, + std::forward(args)...); + } + + if (unboxed_kernel_func_ != nullptr) { + auto* functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction< + Return, + typename remove_symint::type...>( + unboxed_kernel_func_, + functor, + dispatchKeySet, + unpackSymInt(args)...); + } + } else { + if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) { + auto* functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction( + unboxed_kernel_func_, + functor, + dispatchKeySet, + std::forward(args)...); + } + } + + return impl::BoxedKernelWrapper::call( + boxed_kernel_func_, + opHandle, + dispatchKeySet, + std::forward(args)...); +} + +inline void KernelFunction::registerToken( + std::weak_ptr token) const { + if (!tokens_) { + tokens_ = std::make_unique>>(); + } + tokens_->push_back(std::move(token)); +} + +inline KernelFunction KernelFunction::makeFromBoxedKernel( + BoxedKernel boxed_fn) { + return KernelFunction( + std::move(boxed_fn), nullptr); // no unboxed function pointer +} + +template +inline KernelFunction KernelFunction::makeFromBoxedFunction() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeFromFunction()); +} + +template +inline KernelFunction KernelFunction::makeFromBoxedFunction() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeFromFunction()); +} + +inline KernelFunction KernelFunction::makeFallthrough() { + return KernelFunction::makeFromBoxedKernel(BoxedKernel::makeFallthrough()); +} + +inline KernelFunction KernelFunction::makeAmbiguousAutogradOther() { + return KernelFunction::makeFromBoxedKernel( + 
BoxedKernel::makeAmbiguousAutogradOther()); +} + +inline KernelFunction KernelFunction::makeNamedNotSupported() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeNamedNotSupported()); +} + +template +inline KernelFunction KernelFunction::makeFromUnboxedFunctor( + std::unique_ptr kernelFunctor) { +#ifndef NDEBUG + // This assertion is costly for build time so it's debug-gated. + static_assert( + guts::is_functor::value, + "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); +#endif + static_assert( + std::is_base_of_v, + "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + + auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed::call; + void* void_unboxed_fn = reinterpret_cast(unboxed_fn); + bool is_symint = fn_has_symint::value; + return KernelFunction( + std::move(kernelFunctor), + &impl::make_boxed_from_unboxed_functor:: + call, + is_symint ? nullptr : void_unboxed_fn, + is_symint ? void_unboxed_fn : nullptr); +} + +template +inline KernelFunction KernelFunction::makeFromBoxedFunctor( + std::unique_ptr kernelFunctor) { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeFromFunctor(std::move(kernelFunctor))); +} + +template +inline KernelFunction KernelFunction::makeFromUnboxedFunction( + FuncPtr func_ptr) { + static_assert( + is_compile_time_function_pointer::value, + "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); + static_assert( + !std::is_same_v, + "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. 
Please use KernelFunction::makeFromBoxedFunction instead."); +#if defined(__GNUC__) && defined(__SANITIZE_ADDRESS__) && !defined(__CUDACC__) + TORCH_INTERNAL_ASSERT( + FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); +#else + static_assert( + FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); +#endif + +#if !defined(C10_MOBILE) + (void)func_ptr; // Suppress unused variable warning + return makeFromUnboxedFunctor< + AllowLegacyTypes, + typename impl::WrapFunctionIntoFunctor::type>( + detail::make_unique_base< + OperatorKernel, + typename impl::WrapFunctionIntoFunctor::type>()); +#else + // On mobile, we rather want to optimize for binary size than for performance, + // so let's not inline the kernel into the wrapper but use + // makeFromUnboxedRuntimeFunction instead. + return makeFromUnboxedRuntimeFunction(func_ptr.func_ptr()); +#endif +} + +template +inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction( + FuncType* func) { + static_assert( + guts::is_function_type::value, + "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); + static_assert( + !std::is_same_v, + "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. 
Please use KernelFunction::makeFromBoxedFunction instead."); + TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); + + return makeFromUnboxedFunctor< + AllowLegacyTypes, + impl::WrapFunctionIntoRuntimeFunctor>>( + detail::make_unique_base< + OperatorKernel, + impl::WrapFunctionIntoRuntimeFunctor>>(func)); +} + +template +inline std::enable_if_t< + guts::is_stateless_lambda>::value, + KernelFunction> +KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { + static_assert( + guts::is_functor>::value, + "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); + +#if !defined(C10_MOBILE) + return makeFromUnboxedFunctor< + AllowLegacyTypes, + impl::WrapFunctionIntoRuntimeFunctor>>( + detail::make_unique_base< + OperatorKernel, + impl::WrapFunctionIntoRuntimeFunctor>>( + std::forward(lambda))); +#else + // On mobile, we rather want to optimize for binary size than for performance, + // so let's not inline the kernel into the wrapper but use + // makeFromUnboxedRuntimeFunction instead. 
+ using FuncType = + typename guts::infer_function_traits_t>::func_type; + return makeFromUnboxedRuntimeFunction(lambda); +#endif +} + +template +inline std::enable_if_t< + !guts::is_stateless_lambda>::value, + KernelFunction> +KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { + static_assert( + guts::is_functor>::value, + "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); + + return makeFromUnboxedFunctor< + AllowLegacyTypes, + impl::WrapFunctionIntoRuntimeFunctor>>( + detail::make_unique_base< + OperatorKernel, + impl::WrapFunctionIntoRuntimeFunctor>>( + std::forward(lambda))); +} + +inline bool KernelToken::isValid() const { + return !invalid_.load(std::memory_order_acquire); +} + +inline void KernelToken::invalidate() { + invalid_.store(true, std::memory_order_release); +} + +inline SafeKernelFunction::SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle) + : kernel_(kernel ? *kernel : KernelFunction()), + token_(std::make_shared()), + debug_(std::move(debug)), + opHandle_(std::move(opHandle)) { + // Register the token with the original kernel so it gets invalidated when the + // kernel is destroyed + if (kernel) { + kernel->registerToken(token_); + } +} + +inline void SafeKernelFunction::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + TORCH_CHECK( + token_ && token_->isValid(), + "SafeKernelFunction has been invalidated ", + debug_); + kernel_.callBoxed(opHandle, dispatchKeySet, stack); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5bf328983091cf4e02f66e60462c5b9ffb082462 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace c10 { + +/** + * Inherit from OperatorKernel to implement a c10 kernel. + * + * Example: + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * + * The kernel class is allowed to have members but these are equivalent + * to global variables. The kernel implementation is responsible for + * preventing race conditions on them. + * + * See below for how to register this kernel with PyTorch. + */ +struct TORCH_API OperatorKernel : public c10::intrusive_ptr_target { + ~OperatorKernel() override = default; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h new file mode 100644 index 0000000000000000000000000000000000000000..aa1e5eb02d879ff1ca90a0261369e9b3e3ead4b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::impl { +namespace detail { +template +class WrapFunctionIntoFunctor_ {}; +template +class WrapFunctionIntoFunctor_< + FuncPtr, + ReturnType, + guts::typelist::typelist> + final : public c10::OperatorKernel { + public: + C10_ALWAYS_INLINE decltype(auto) operator()(Parameters... args) { + return (*FuncPtr::func_ptr())(std::forward(args)...); + } +}; +} // namespace detail + +// WrapFunctionIntoFunctor: Wraps a compile time function pointer into a kernel +// functor. Since it is a compile time function pointer, many compilers can +// inline it into the wrapper and you don't get any performance overhead for +// wrapping. +template +struct WrapFunctionIntoFunctor final { + static_assert( + c10::is_compile_time_function_pointer::value, + "WrapFunctionIntoFunctor can only wrap functions created with TORCH_FN."); + using type = detail::WrapFunctionIntoFunctor_< + FuncPtr, + typename guts::function_traits::return_type, + typename guts::function_traits< + typename FuncPtr::FuncType>::parameter_types>; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h new file mode 100644 index 0000000000000000000000000000000000000000..0ff4e3dbc917c8dc86605c403c3733539c4779db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::impl { + +namespace detail { +template +class WrapFunctionIntoRuntimeFunctor_ {}; +template +class WrapFunctionIntoRuntimeFunctor_< + FuncType, + ReturnType, + guts::typelist::typelist> + final : public c10::OperatorKernel { + public: + template + explicit WrapFunctionIntoRuntimeFunctor_(FuncType_&& kernel_func) + : kernel_func_(std::forward(kernel_func)) {} + + decltype(auto) operator()(Parameters... args) { + return kernel_func_(std::forward(args)...); + } + + private: + FuncType kernel_func_; +}; +} // namespace detail + +// WrapFunctionIntoRuntimeFunctor: Wraps any runtime functor into a functor that +// inherits from c10::OperatorKernel, so it can be used as a c10 kernel. +// This can, for example, be used for lambdas, functors or even function +// pointers. In the case of function pointers, since it is a runtime function +// pointer, there is an overhead for calling it whenever the kernel is invoked. 
+template +using WrapFunctionIntoRuntimeFunctor = detail::WrapFunctionIntoRuntimeFunctor_< + FuncType, + typename guts::infer_function_traits_t::return_type, + typename guts::infer_function_traits_t::parameter_types>; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h new file mode 100644 index 0000000000000000000000000000000000000000..ed93dfef4637046783ab9d7e88c7919e9fc75d04 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h @@ -0,0 +1,415 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This file contains boxing (not unboxing) logic, +// i.e. how to make a vector from a set of concrete arguments. 
+ +#include +#include +#include + +#include + +#include +#include + +namespace c10::impl { + +// +// utils +// + +// is_mutable_tensor_ref +template +struct is_mutable_tensor_ref : std::false_type {}; +template <> +struct is_mutable_tensor_ref : std::true_type {}; + +// is_tuple_of_mutable_tensor_refs +// +template +struct is_tuple_of_mutable_tensor_refs : std::false_type {}; + +template +struct is_tuple_of_mutable_tensor_refs< + T, + std::enable_if_t::value, void>> + : guts::typelist:: + all> {}; + +// has_ivalue_to tests the presence/absence of instance method +// IValue::to() +// +template +struct has_ivalue_to : std::false_type {}; + +template +struct ivalue_to_helper { + using type = decltype(std::declval().template to()); +}; +template +using ivalue_to_helper_t = typename ivalue_to_helper::type; + +template +struct has_ivalue_to>> : std::true_type {}; + +// +// boxing predicates +// + +// A boxable arg type is one that IValue has a constructor for. +template +using can_box = std::disjunction< + std::is_constructible>, + // TensorOptions are not directly constructible into IValue, + // but torch::jit::push knows how to handle them + std::is_same>>; + +template +using can_box_all = std::conjunction...>; + +// an unboxable result is one that can be extracted from an IValue +template +using can_unbox = std::conjunction< + std::disjunction< + has_ivalue_to, + // void returns are ok + std::is_same>, + std::negation>>; + +// +// boxArgs - utility for pushing unboxed args onto IValue stack +// +template +torch::jit::Stack boxArgs(Args... args) { + // TODO Reuse stack vector instead of allocating? 
+ torch::jit::Stack stack; + stack.reserve(sizeof...(Args)); + torch::jit::push(stack, std::forward(args)...); + return stack; +} + +template +inline constexpr size_t boxed_size_one() { + static_assert( + !std::is_same_v, c10::TensorOptions>, + "need to patch this path to support TensorOptions passed by reference"); + return 1; +} + +// torch::jit::push pushes 4 values for a TensorOptions; this needs to +// be kept in sync. +template <> +inline constexpr size_t boxed_size_one() { + return 4; +} + +// NOTE: this could probably be simplified with C++17 fold expressions. +template +struct BoxedSize : std::integral_constant {}; +template +struct BoxedSize + : std::integral_constant< + size_t, + boxed_size_one() + BoxedSize::value> {}; + +template +static inline constexpr size_t boxed_size() { + return BoxedSize::value; +} + +template +C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(IValue*& dest, T& arg) { + new (dest++) IValue(arg); +} + +C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack( + IValue*& dest, + c10::TensorOptions options) { + new (dest++) IValue(c10::typeMetaToScalarType(options.dtype())); + new (dest++) IValue(options.layout()); + new (dest++) IValue(options.device()); + new (dest++) IValue(options.pinned_memory()); +} + +inline void boxArgsToStack(IValue*& /*unused*/) {} + +template +C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack( + IValue*& dest, + T& arg, + Args&... args) { + boxToStack(dest, arg); + boxArgsToStack(dest, args...); +} + +// +// PopResult is a helper class whose specializations handle popping single and +// multiple return values, respectively. 
+// +template +struct PopResult final { + static Result call(Stack& stack) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return one value on the stack, ", + "but instead pushed ", + stack.size(), + " values."); + return std::move(stack[0]).to(); + } +}; + +template +struct PopResult> final { + using Result = std::tuple; + + static Result call(Stack& stack) { + // for tuple return types, boxed kernel has pushed multiple values onto the + // stack + constexpr int RetCount = sizeof...(Types); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == RetCount, + "Boxed kernel was expected to return ", + RetCount, + " values on the stack, ", + "but instead pushed ", + stack.size(), + " values."); + return pop_to_tuple_impl(stack, std::make_index_sequence()); + } + + private: + // note: this has been moved into its own helper only to avoid a parse error + // on `indices` otherwise. I'm sure there's an incantation that slips it past + // the parser but eh + template + static Result pop_to_tuple_impl( + Stack& stack, + std::index_sequence /*unused*/) { + return std::make_tuple((std::move(stack[indices]).template to())...); + } +}; + +// +// BoxedKernelWrapper +// +// For a given function type FT, BoxedKernelWrapper implements +// a `call` method that +// - takes a boxed kernel and unboxed arguments as specified by FT, +// - calls `boxArgs` to box the arguments +// - calls the boxed kernel +// - unboxes and returns the result +// +// The partial specializations below handle various cases: in +// particular, not all types appearing in op signatures are supported, +// and ops returning references have nonstandard wrapper implementations. +// + +// 1. The base specialization of BoxedKernelWrapper should never be +// instantiated. A "no call method defined on BoxedKernelWrapper" compile error +// means that an op signature has failed to trigger any of the partial +// specializations that follow this one. 
+// +template +struct BoxedKernelWrapper { + // The reason we're not just doing straight up static_assert(false, ...) here: + // Basically, the way to make sure a static_assert only fires if a template + // is actually instantiated (rather than every time the file is parsed) is to + // use template parameters in the expression, e.g. FuncType here. However, + // since `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the + // same effect. + static_assert( + sizeof(FuncType) != sizeof(FuncType), + "Function signature contains one or more unsupported parameter and/or return types. " + "Look for a nearby error like " + "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " + "- (your function type) is the unsupported signature."); +}; + +// +// 2. Supported signatures, other than those involving non-const Tensor refs - +// i.e., "functional" ops. +// + +template +struct BoxedKernelWrapper< + Result(Args...), + std::enable_if_t< + can_box_all::value && can_unbox::value && + !is_tuple_of_mutable_tensor_refs::value, + void>> { + static Result call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) { + torch::jit::Stack stack = boxArgs(std::forward(args)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + + if constexpr (!std::is_same_v) { + // op has pushed one or more values onto the stack. + return PopResult::call(stack); + } else { + // op returns void, boxed kernel has pushed nothing onto stack. + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.empty(), + "Boxed kernel was expected to return no values on the stack, ", + "but instead returned ", + stack.size(), + " values."); + } + } +}; + +// +// 3. in-place ops take a single non-const Tensor reference +// as their first argument, and return it. +// +// Note: all signatures matching this pattern are assumed to be for such ops. 
+// Because of this, the generated BoxedKernelWrapper specializations simply +// return the in-place argument. +// + +template +struct BoxedKernelWrapper< + at::Tensor&(at::Tensor&, OtherArgs...), + std::enable_if_t::value, void>> { + static at::Tensor& call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + at::Tensor& outArg, + OtherArgs... otherArgs) { + torch::jit::Stack stack = boxArgs( + outArg, std::forward(otherArgs)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return a single value on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + return outArg; + } +}; + +// +// 3.5. In-process migration to make in-place ops take and return +// const references instead. +template +struct BoxedKernelWrapper< + const at::Tensor&(const at::Tensor&, OtherArgs...), + std::enable_if_t::value, void>> { + static const at::Tensor& call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + const at::Tensor& outArg, + OtherArgs... otherArgs) { + torch::jit::Stack stack = boxArgs(outArg, otherArgs...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return a single value on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + return outArg; + } +}; + +// +// 4. out of place ops that take a single non-const Tensor reference as their +// final argument, and also return it. +// +// Note: all signatures matching this pattern are assumed to be for such ops. +// This assumption permits the generated BoxedKernelWrapper specializations to +// simply return out arguments. 
+// +template +struct BoxedKernelWrapper< + at::Tensor&(FirstArg, RestArgs...), + std::enable_if_t< + can_box_all::value + // this skips over in-place kernels with a non-const Tensor + // arg at the front, so those can unambiguously trigger the + // preceding specialization. + && !is_mutable_tensor_ref::value, + void>> { + static at::Tensor& call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + FirstArg firstArg, + RestArgs... restArgs) { + torch::jit::Stack stack = boxArgs( + std::forward(firstArg), std::forward(restArgs)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return a single value on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + // reusing restArgs after it has been forwarded here is ok because we know + // that the last element is of type `Tensor&`. + return std::get( + std::tuple{restArgs...}); + } +}; + +// +// 5. out of place ops that take multiple non-const Tensor references as their +// final arguments, and return them in a std::tuple. +// +// Note: all signatures matching this pattern are assumed to be for such ops. +// This assumption permits the generated BoxedKernelWrapper specializations to +// simply return the out arguments. +// +template +struct BoxedKernelWrapper< + Result(Args...), + std::enable_if_t< + can_box_all::value && + is_tuple_of_mutable_tensor_refs::value, + void>> { + static Result call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... 
args) { + using ArgTuple = std::tuple; + constexpr int RetCount = std::tuple_size(); + + torch::jit::Stack stack = boxArgs(std::forward(args)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == RetCount, + "Boxed kernel was expected to return ", + RetCount, + " values on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + // reusing args after it has been forwarded here is ok because we know + // that the last RetCount elements are of type `Tensor&`. + auto result = guts::tuple_take( + ArgTuple{std::forward(args)...}); + static_assert( + std::is_same_v, + "The parameter list of an op returning a tuple of Tensor references " + "must end with an equal number of Tensor reference parameters."); + return result; + } +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..fac192e893c4fc6d40714ebc2d24f848d736819a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -0,0 +1,790 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace c10 { + +using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack + // to the c10 namespace. 
+class OperatorHandle; + +/* + * [Note: Argument forwarding in the dispatcher] + * + * The dispatcher uses a somewhat unusual way to forward arguments through + * several layers of wrapper functions. This can be confusing because an + * experienced C++ programmer would look at this and think "oh this is supposed + * to be forwarding a universal reference but the && is missing. This is a + * bug.". It is not a bug. The common way in C++ to forward arguments is to use + * universal references: + * + * > template void func(T&& arg) { func2(std::forward(arg)); } + * + * but that relies on inferring the correct reference type (i.e. value vs & vs + * &&) from the argument. In our case, we cannot rely on the argument as + * supplied by the caller, because that could infer a different reference type + * than was used in the kernel function. The correct reference type is dictated + * by the kernel signature and must be identical since we cast function pointers + * through void* pointers and mismatches would be UB. So we need a forwarding + * pattern that determines the reference type to use by looking at the + * explicitly supplied operator signature, not by looking at the argument we're + * calling it with. + * + * What does std::forward do, exactly? + * ------------------------------------ + * std::forward(t) is a way to cast t to the reference type supplied in T. + * Let's assume decay_t == U and T is either U or some reference of U. + * - std::forward(t) will return U&, no matter what kind of reference t is. + * - std::forward(t) will return U&&, no matter what kind of reference t + * is. + * - std::forward(t) will return U&& (not U!), no matter what kind of + * reference t is. 
+ * + * For universal references, that means that in the following function + * > template void func(T&& arg) { func2(std::forward(arg)); } + * + * - when called with arg being a rvalue reference or non-reference value, T + * gets inferred to be a non-reference U, and std::forward(t) will return + * U&&, correctly moving the argument. + * - when called with arg behind a lvalue reference, T gets inferred to be U& + * because that's the only way to match the signature (in C++, a type that is + * (T&)&& will collapse to T&). That means std::forward(t) will return U& and + * the value will not be moved but passed on as a lvalue reference. + * + * How do we use that? + * ------------------------------------ + * But std::forward can also be used outside of the common "universal + * forwarding" pattern to change reference types. So instead of following the + * common C++ pattern, we notice what std::forward() actually does, and that + * is it takes a value and changes its reference to the type of reference passed + * in as T. If we don't infer T but explicitly specify it, we can use this to + * forward based on an explicitly specified reference type instead of the + * inferred argument type. + * + * This is why many of the dispatcher functions look like + * > template func(T t) { func2(std::forward(t)); } + * instead of the common + * > template func(T&& t) { func2(std::forward(t)); } + * + * and are expected to be called by explicitly specifying the template + * parameters in a way that matches the expected operator signature at each call + * site. + */ + +namespace impl { +// supported_primitive_arg_types defines which primitive types we allow in +// kernel functions as arguments or returns. +// Additionally, we support lists, dicts and optionals containing these types. 
+using supported_primitive_arg_types = guts::typelist::typelist< + int64_t, + double, + bool, + std::string_view, + at::Tensor, + at::Scalar, + c10::QScheme, + c10::ScalarType, + c10::Device, + c10::DeviceIndex, + c10::Layout, + c10::MemoryFormat, + at::Dimname>; + +// We have an unboxed functor in hand that takes C++ arguments, and +// we're building a boxed functor wrapper for it that takes IValues. +// So "outside" is boxed and "inside" is unboxed. +// +// So a valid input type is one that our boxed functor wrapper can +// unbox from an IValue into a C++ value. +// +// Whereas a valid output type is one that our wrapper can receive +// as a C++ value from the unboxed functor, and box into an IValue. + +// +// assert_is_valid_input_type +// checks that T can be unboxed from an IValue into a C++ value. +// + +template +struct assert_is_valid_input_type { + assert_is_valid_input_type() { + if constexpr (guts::typelist::contains:: + value) { + /* everything is ok, this is a primitive type */ + } else { + /* otherwise this must be an instance of a valid custom class, since it + can only have been created via IValue(x), which ensures this. */ + } + } +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type {}; + +template +struct TypeCheckHelper; + +template +struct TypeCheckHelper {}; + +template +struct TypeCheckHelper + : TypeCheckHelper { + assert_is_valid_input_type check; +}; + +template +struct assert_is_valid_input_type< + std::tuple, + AllowDeprecatedTypes> + : TypeCheckHelper {}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported input type: Dict where Key is invalid. 
We only support int64_t, double, bool, and string."); +}; + +template +struct assert_is_valid_input_type< + std::unordered_map, + AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + AllowDeprecatedTypes, + "You tried to register a kernel with an unsupported input type: std::unordered_map. Please use Dict instead."); + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported input type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string."); +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: List. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: ArrayRef. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_input_type< + c10::OptionalArrayRef, + AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: OptionalArrayRef. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: std::array. Please use std::array instead."); +}; + +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + // There is no reason to support float when we have double. Keep the API lean. + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported input type: float. 
Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported input type: const char*. Please use std::string_view instead."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t, T>>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported input type: vector. Please use List instead."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t< + std::is_integral_v && + !guts::typelist::contains::value>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel taking c10::SymInt by reference. 
Please accept it by value instead."); +}; + +// TODO: it probably would be good to tighten this up quite a bit more with +// an explicit list for everything + +// +// assert_is_valid_output_type +// + +template +struct assert_is_valid_output_type { + assert_is_valid_output_type() { + if constexpr (guts::typelist::contains:: + value) { + /* everything is ok, this is a primitive type */ + } else { + /* otherwise T is verified to be a registered custom class in the IValue + constructor, so no benefit in double-checking here */ + } + } +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type {}; + +template +struct assert_is_valid_output_type< + c10::OptionalArrayRef, + AllowDeprecatedTypes> + : assert_is_valid_output_type {}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported output type: Dict where Key is invalid. We only support int64_t, double, bool, and string."); + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: Dict. Please use Dict or Dict."); +}; + +template +struct assert_is_valid_output_type< + std::unordered_map, + AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + AllowDeprecatedTypes, + "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict instead."); + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported output type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string."); + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: std::unordered_map. 
Please use Dict or Dict."); +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: List. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: std::vector. Please use List, List or Tensor instead."); + // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel + // with an unsupported output type: std::vector. Please use List + // instead."); +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: std::array. Please use std::array instead."); +}; + +// The following specialisations of assert_is_valid_output_type are technically +// not necessary since we would hit the base case and show an error message +// there if they didn't exist, but we can show a better error message +// in some common error scenarios. +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + // There is no reason to support float when we have double. Keep the API lean. + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); +}; +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported output type: const char*. 
Please use std::string_view instead."); +}; +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t, T>>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported output type: vector. Please use List instead."); +}; +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t< + std::is_integral_v && + !guts::typelist::contains::value>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); +}; + +// ivalue_to_arg + +template +struct decay_if_not_tensor final { + using type = std::decay_t; +}; + +template <> +struct decay_if_not_tensor final { + using type = at::Tensor&; +}; + +template <> +struct decay_if_not_tensor final { + using type = const at::Tensor&; +}; + +template +struct ivalue_to_arg final { + static decltype(auto) call(IValue& v) { + assert_is_valid_input_type(); + return std::move(v).to(); + } +}; + +// The following two specializations take advantage of specialized +// `toTensor()` overloads on IValue to avoid copying. +template +struct ivalue_to_arg final { + // We cannot use the default implementation if they asked for a + // `at::Tensor&` because it moves from the IValue, so it can't get + // an lvalue reference. + static at::Tensor& call(IValue& v) { + // Tensor& is valid, don't bother asserting + return v.toTensor(); + } +}; + +template +struct ivalue_to_arg final { + // We should not use the default implementation if they asked for + // a `const at::Tensor&` because it moves from the IValue and they + // didn't ask for that. 
+ static const at::Tensor& call(IValue& v) { + // const Tensor& is valid, don't bother asserting + return v.toTensor(); + } +}; + +template +struct ivalue_to_arg final { + static List call(IValue& v) { + return v.toTensorList(); + } +}; + +template +struct ivalue_to_arg, AllowDeprecatedTypes> final { + // If an argument is ArrayRef, convert the IValue to a std::vector and + // pass that to the operator. std::vector is implicitly convertible to + // ArrayRef. + static std::vector call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } +}; +template +struct ivalue_to_arg final { + static std::vector call(IValue& v) { + if (v.isIntList()) { + std::vector r; + auto src = v.toIntList(); + std::transform( + src.begin(), src.end(), std::back_inserter(r), [](int64_t i) { + return c10::SymInt(i); + }); + return r; + } else { + return ivalue_to_arg, AllowDeprecatedTypes>:: + call(v); + } + } +}; +template +struct ivalue_to_arg, AllowDeprecatedTypes> + final { + static OptionalArray call(IValue& v) { + if (v.isIntList()) { + std::vector r; + auto src = v.toIntList(); + std::transform( + src.begin(), src.end(), std::back_inserter(r), [](int64_t i) { + return c10::SymInt(i); + }); + return OptionalArray(std::move(r)); + } else { + return std::move(v).to>(); + } + } +}; +template +struct ivalue_to_arg>, AllowDeprecatedTypes> final { + // If an argument is std::optional>, convert the IValue to an + // std::optional> and pass that to the operator. + // OptionalArray is basically a std::optional> but + // implicitly convertible to std::optional>. + static OptionalArray call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } +}; + +template +struct ivalue_to_arg, AllowDeprecatedTypes> final { + // If an argument is OptionalArrayRef, convert the IValue to an + // std::optional> and pass that to the operator. 
+ // OptionalArray is basically a std::optional> but + // implicitly convertible to OptionalArrayRef + static OptionalArray call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } +}; + +// return_to_ivalue +template +struct return_to_ivalue final {}; + +template +struct return_to_ivalue< + T, + AllowDeprecatedTypes, + std::enable_if_t>> + final { + static IValue call(T&& v) { + assert_is_valid_output_type(); + return c10::ivalue::from(std::move(v)); + } + static IValue copy(const T& v) { + assert_is_valid_output_type(); + return IValue(v); + } +}; + +// Special case to allow kernels to return `Tensor&`. +// TODO Delete this once kernels don't do that anymore +template +struct return_to_ivalue final { + static IValue call(at::Tensor& v) { + return c10::ivalue::from(v); + } + static IValue copy(at::Tensor& v) { + return IValue(v); + } +}; + +// wrap_kernel_functor_unboxed_ + +template +struct wrap_kernel_functor_unboxed_ final {}; + +// This specialization is for kernels with a first argument that is NOT of type +// DispatchKeySet This includes kernels with 0 arguments. +template +struct wrap_kernel_functor_unboxed_< + KernelFunctor, + ReturnType(ParameterTypes...)> + final { + static_assert( + std::is_same_v< + ReturnType, + typename guts::infer_function_traits_t::return_type>, + "Return type mismatch"); + static_assert( + std::is_same_v< + guts::typelist::typelist, + typename guts::infer_function_traits_t< + KernelFunctor>::parameter_types>, + "Parameter types mismatch"); + + // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes + // doesn't use && + static ReturnType call( + OperatorKernel* functor, + DispatchKeySet /*unused*/, + ParameterTypes... args) { + KernelFunctor* functor_ = static_cast(functor); + // Note [Plumbing Keys Through The Dispatcher 2] + // See Note [Plumbing Keys Through The Dispatcher] for the background. 
+ // This functor explicitly takes in a dispatchKeySet and drops it on the + // floor- it does not forward it to the registered kernel. + // + // This is due to the calling convention within the dispatcher, which + // expects all registered kernels to have a first argument of type + // DispatchKeySet. + // This is not the case for pretty much all manually written kernels, + // however- this functor serves to separate the calling convention of the + // dispatcher from the calling convention of manually written kernels. + return (*functor_)(std::forward(args)...); + } +}; + +// This specialization is for kernels with a first argument of type +// DispatchKeySet +template +struct wrap_kernel_functor_unboxed_< + KernelFunctor, + ReturnType(DispatchKeySet, ParameterTypes...)> + final { + static_assert( + std::is_same_v< + ReturnType, + typename guts::infer_function_traits_t::return_type>, + "Return type mismatch"); + static_assert( + std::is_same_v< + guts::typelist::typelist, + typename guts::infer_function_traits_t< + KernelFunctor>::parameter_types>, + "Parameter types mismatch"); + + // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes + // doesn't use && + static ReturnType call( + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + ParameterTypes... args) { + KernelFunctor* functor_ = static_cast(functor); + // We're explicitly taking in a dispatchKeySet and forwarding it to the + // registered kernel. See Note [Plumbing Keys Through The Dispatcher 2] for + // details. + return (*functor_)(dispatchKeySet, std::forward(args)...); + } +}; + +template +using wrap_kernel_functor_unboxed = wrap_kernel_functor_unboxed_< + KernelFunctor, + typename guts::infer_function_traits_t::func_type>; + +// call_functor_with_args_from_stack + +template < + class Functor, + bool AllowDeprecatedTypes, + size_t... ivalue_arg_indices, + typename... 
ArgTypes> +std::decay_t::return_type> +call_functor_with_args_from_stack_( + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + Stack* stack, + std::index_sequence /*unused*/, + guts::typelist::typelist* /*unused*/) { + (void)stack; // when sizeof...(ivalue_arg_indices) == 0, this argument would + // be unused and we have to silence the compiler warning. + + // We're explicitly filtering out DispatchKeySet from the argument list. + // Some kernels take a DispatchKeySet as their first argument in order to + // plumb keys through the dispatcher. We don't want to expose the + // DispatchKeySet type to jit, so we don't include this argument on the stack. + // See Note [Plumbing Keys Through The Dispatcher] for the background. + return wrap_kernel_functor_unboxed::call( + functor, + dispatchKeySet, + ivalue_to_arg< + typename decay_if_not_tensor::type, + AllowDeprecatedTypes>:: + call(torch::jit::peek( + *stack, ivalue_arg_indices, sizeof...(ivalue_arg_indices)))...); +} + +template +std::decay_t::return_type> +call_functor_with_args_from_stack( + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + Stack* stack) { + // We're explicitly filtering out DispatchKeySet from the argument list. + // Some kernels take a DispatchKeySet as their first argument in order to + // plumb keys through the dispatcher. We don't want to expose the + // DispatchKeySet type to jit, so we don't include this argument on the stack. + // See Note [Plumbing Keys Through The Dispatcher] for the background. 
+ using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func< + Functor>::parameter_types; + constexpr size_t num_ivalue_args = guts::typelist::size::value; + return call_functor_with_args_from_stack_( + functor, + dispatchKeySet, + stack, + std::make_index_sequence(), + static_cast(nullptr)); +} + +// push_outputs + +template +struct push_outputs final { + // Contrary to [Note: Argument forwarding in the dispatcher], we use + // OutputType&& here to avoid one extra call to the move constructor in this + // case. This is still not a universal reference though because OutputType is + // an explicitly specified class template parameter. + static void call(OutputType&& output, Stack* stack) { + torch::jit::push( + *stack, + return_to_ivalue::call( + std::forward(output))); + } + static void copy(const OutputType& output, Stack* stack) { + torch::jit::push( + *stack, + return_to_ivalue::copy(output)); + } +}; +template +struct push_outputs, AllowDeprecatedTypes> final { + static void call(std::tuple&& output, Stack* stack) { + call_( + std::move(output), + stack, + std::make_index_sequence()); + } + static void copy(const std::tuple& output, Stack* stack) { + copy_(output, stack, std::make_index_sequence()); + } + + private: + template + static void call_( + std::tuple&& output, + Stack* stack, + std::index_sequence /*unused*/) { + torch::jit::push( + *stack, + return_to_ivalue::call( + std::forward(std::get(output)))...); + } + template + static void copy_( + const std::tuple& output, + Stack* stack, + std::index_sequence /*unused*/) { + torch::jit::push( + *stack, + return_to_ivalue::copy( + std::get(output))...); + } +}; +template +struct push_outputs final { + static void call(int /*dummy*/, Stack* /*stack*/) {} + static void copy(int /*dummy*/, Stack* /*stack*/) {} +}; + +// make_boxed_from_unboxed_functor + +template +struct make_boxed_from_unboxed_functor final { + static_assert( + std::is_base_of_v, + "Tried to register a kernel functor using the 
kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + + static void call( + OperatorKernel* functor, + const OperatorHandle& /*unused*/, + DispatchKeySet dispatchKeySet, + Stack* stack) { + using ReturnType = + typename guts::infer_function_traits_t::return_type; + // We're explicitly filtering out DispatchKeySet from the argument list. + // Some kernels take a DispatchKeySet as their first argument in order to + // plumb keys through the dispatcher. We don't want to expose the + // DispatchKeySet type to jit, so we don't include this argument on the + // stack. See Note [Plumbing Keys Through The Dispatcher] for the + // background. + using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func< + KernelFunctor>::parameter_types; + constexpr bool has_outputs = !std::is_same_v; + constexpr size_t num_inputs = guts::typelist::size::value; + if constexpr (has_outputs) { + // Decay ReturnType to ReturnType_ so that if a reference gets returned, + // we actually store it by value and don't get a dangling reference. This + // is only required because some kernels still return `Tensor&`. [Note: + // VC++ and 'std': ambiguous symbol] + using ReturnType_ = ::std::decay_t; + ReturnType_ output = call_functor_with_args_from_stack< + KernelFunctor, + AllowDeprecatedTypes>(functor, dispatchKeySet, stack); + torch::jit::drop(*stack, num_inputs); + // See note [ VC++ and 'std': ambiguous symbol] + push_outputs::call( + ::std::move(output), stack); + } else { + call_functor_with_args_from_stack( + functor, dispatchKeySet, stack); + torch::jit::drop(*stack, num_inputs); + } + } +}; +} // namespace impl + +} // namespace c10 + +namespace torch { +using OperatorKernel = c10::OperatorKernel; +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..aecf24471b02853caed9872783e3fdb3f3aaf011 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h @@ -0,0 +1,145 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +template +inline std::vector makeStack(Inputs&&... inputs) { + return {std::forward(inputs)...}; +} + +inline at::Tensor dummyTensor( + c10::DispatchKeySet ks, + bool requires_grad = false) { + auto* allocator = c10::GetCPUAllocator(); + int64_t nelements = 1; + auto dtype = caffe2::TypeMeta::Make(); + int64_t size_bytes = nelements * dtype.itemsize(); + auto storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + size_bytes, + allocator->allocate(size_bytes), + allocator, + /*resizable=*/true); + at::Tensor t = + at::detail::make_tensor(storage_impl, ks, dtype); + // TODO: We add this to simulate the ideal case where we only have Autograd + // backend keys + // on Tensor when it requires grad. But currently Autograd keys are + // added in TensorImpl constructor by default. + if (!requires_grad) { + t.unsafeGetTensorImpl()->remove_autograd_key(); + } + return t; +} + +inline at::Tensor dummyTensor( + c10::DispatchKey dispatch_key, + bool requires_grad = false) { + return dummyTensor(c10::DispatchKeySet(dispatch_key), requires_grad); +} + +template +inline std::vector callOp( + const c10::OperatorHandle& op, + Args... 
args) { + auto stack = makeStack(std::forward(args)...); + op.callBoxed(&stack); + return stack; +} + +template +inline Result callOpUnboxed(const c10::OperatorHandle& op, Args... args) { + return op.typed().call(std::forward(args)...); +} + +template +inline Result callOpUnboxedWithDispatchKey( + const c10::OperatorHandle& op, + c10::DispatchKey dispatchKey, + Args... args) { + return op.typed().callWithDispatchKey( + dispatchKey, std::forward(args)...); +} + +template +inline Result callOpUnboxedWithPrecomputedDispatchKeySet( + const c10::OperatorHandle& op, + c10::DispatchKeySet ks, + Args... args) { + return op.typed().redispatch( + ks, std::forward(args)...); +} + +inline void expectDoesntFindKernel( + const char* op_name, + c10::DispatchKey dispatch_key) { + auto op = c10::Dispatcher::singleton().findSchema({op_name, ""}); + EXPECT_ANY_THROW(callOp(*op, dummyTensor(dispatch_key), 5);); +} + +inline void expectDoesntFindOperator(const char* op_name) { + auto op = c10::Dispatcher::singleton().findSchema({op_name, ""}); + EXPECT_FALSE(op.has_value()); +} + +template +inline void expectThrows(Functor&& functor, const char* expectMessageContains) { + try { + std::forward(functor)(); + } catch (const Exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr(expectMessageContains)); + return; + } + ADD_FAILURE() << "Expected to throw exception containing \"" + << expectMessageContains << "\" but didn't throw"; +} + +template +void expectListEquals(c10::ArrayRef expected, std::array actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual[i]); + } +} + +template +void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual[i]); + } +} + +template +void expectListEquals(c10::ArrayRef expected, c10::List actual) { + EXPECT_EQ(expected.size(), 
actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual.get(i)); + } +} + +template +void expectListEquals(c10::ArrayRef expected, std::vector actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual[i]); + } +} + +// NB: This is not really sound, but all of the type sets constructed here +// are singletons so it's fine +static inline c10::DispatchKey extractDispatchKey(const at::Tensor& t) { + return legacyExtractDispatchKey(t.key_set()); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h new file mode 100644 index 0000000000000000000000000000000000000000..6812e6c1dc0d6656f3522fa1832a90101d4d80e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10::impl { + +// A CppSignature object holds RTTI information about a C++ function signature +// at runtime and can compare them or get a debug-printable name. +class TORCH_API CppSignature final { + public: + CppSignature(const CppSignature&) = default; + CppSignature(CppSignature&&) noexcept = default; + CppSignature& operator=(const CppSignature&) = default; + CppSignature& operator=(CppSignature&&) noexcept = default; + + template + static CppSignature make() { + // Normalize functors, lambdas, function pointers, etc. 
into the plain + // function type The first argument of the schema might be of type + // DispatchKeySet, in which case we remove it. We do this to guarantee that + // all CppSignature's for an operator will match, even if they're registered + // with different calling conventions. + // See Note [Plumbing Keys Through The Dispatcher] + using decayed_function_type = + typename c10::remove_DispatchKeySet_arg_from_func< + std::decay_t>::func_type; + + return CppSignature(std::type_index(typeid(decayed_function_type))); + } + + std::string name() const { + return c10::demangle(signature_.name()); + } + + friend bool operator==(const CppSignature& lhs, const CppSignature& rhs) { + if (lhs.signature_ == rhs.signature_) { + return true; + } + // Without RTLD_GLOBAL, the type_index comparison could yield false because + // they point to different instances of the RTTI data, but the types would + // still be the same. Let's check for that case too. + // Note that there still is a case where this might not work, i.e. when + // linking libraries of different compilers together, they might have + // different ways to serialize a type name. That, together with a missing + // RTLD_GLOBAL, would still fail this. + if (0 == strcmp(lhs.signature_.name(), rhs.signature_.name())) { + return true; + } + + return false; + } + + private: + explicit CppSignature(std::type_index signature) + : signature_(std::move(signature)) {} + std::type_index signature_; +}; + +inline bool operator!=(const CppSignature& lhs, const CppSignature& rhs) { + return !(lhs == rhs); +} + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h new file mode 100644 index 0000000000000000000000000000000000000000..78b8cecac1db5d571ec9fb88de9f294505f4b271 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h @@ -0,0 +1,285 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +namespace impl { + +// Take a DispatchKeySet for a Tensor and determine what the actual dispatch +// DispatchKey should be, taking into account TLS, and skipping backends which +// fall through. +// +// Unlike Tensor::key_set(), the value of this on a tensor can change depending +// on TLS. +// +// NB: If there is no valid dispatch key, this will return Undefined +inline DispatchKeySet computeDispatchKeySet( + DispatchKeySet ks, + // The key mask lets us eliminate (by zero entries) keys which should not + // be considered for dispatch. There are two cases when we use this: + // + // - If an operator's dispatch table contains a fallthrough entry, we + // should bypass it entirely when finding the key + // - If a user invokes with redispatch, the mask lets us + // zero out the key the user asked us to stop. + // + // These excluded backends are NOT tracked in the TLS, but must be applied + // AFTER TLS (since the backend may have been introduced for consideration + // by the included TLS), which is why you have to pass them in to this + // function (as opposed to just applying it to the input 'ks'). 
+ DispatchKeySet key_mask) { + c10::impl::LocalDispatchKeySet local = + c10::impl::tls_local_dispatch_key_set(); + // TODO: It's a bit irritating that we have to do logical ORs here, it would + // be nice to only do one. Can always_included be folded into the TLS? Well, + // it's a bit troublesome, because fastpath TLS access requires the type of + // the TLS in question to be zero-initialized, so you don't actually win + // anything in that case. + return (((ks | local.included_) - local.excluded_) & key_mask); +} + +} // namespace impl + +namespace detail { +// A small gadget to extract the DispatchKeySet from types which are known +// to have it. Used to extract dispatch keys from unboxed calls. +struct MultiDispatchKeySet : at::IterArgs { + DispatchKeySet ts; + void operator()(const at::Tensor& x) { + ts = ts | x.key_set(); + } + void operator()(const std::optional& x) { + if (x.has_value()) { + ts = ts | x->key_set(); + } + } + void operator()(at::ArrayRef xs) { + for (const auto& x : xs) { + ts = ts | x.key_set(); + } + } + // Tensor?[] translates to this case. + void operator()(const c10::List>& xs) { + for (std::optional x : xs) { + if (x.has_value()) { + ts = ts | x.value().key_set(); + } + } + } + // Structured Tensor[] translates to this case + void operator()(const at::ITensorListRef& xs) { + for (const auto& x : xs) { + ts = ts | x.key_set(); + } + } + [[noreturn]] void operator()( + at::ArrayRef> /*unused*/) { + // Just checking that the handling of Tensor?[] didn't change. + TORCH_INTERNAL_ASSERT(false); + } + void operator()(const at::Generator& gen) { + if (gen.defined()) { + ts = ts | gen.key_set(); + } + } + void operator()(const std::optional& gen) { + if (gen.has_value() && gen->defined()) { + ts = ts | gen->key_set(); + } + } + template + void operator()(const T& /*unused*/) { + // do nothing + } +}; + +// NB: take by const reference (Don't do universal forwarding here! You +// don't want to move into this function!) 
+template +DispatchKeySet multi_dispatch_key_set(const Args&... args) { + return MultiDispatchKeySet().apply(args...).ts; +} +} // namespace detail + +/** + * An instance of DispatchKeyExtractor knows how to get a dispatch key given + * a list of arguments for an operator call. + * + * The instance is specific for a certain operator as: + * - In boxed dispatch, different operators have different ways to extract + * the dispatch key (e.g. different numbers of arguments), and we precompute + * the stack locations we should look at; and + * - In all dispatch, some backends should be excluded from dispatch because + * they have been registered as fallthrough. The set of excluded backends + * varies from operator, as some operators may have overridden the + * fallthrough with custom behavior. + * + * Note - this should maintain identical impl to the py dispatcher key + * extraction logic at pytorch/torch/dispatcher.py + */ +struct TORCH_API DispatchKeyExtractor final { + public: + static DispatchKeyExtractor make(const FunctionSchema& schema) { + return DispatchKeyExtractor(makeBitsetForDispatchArgs(schema)); + } + + static DispatchKeyExtractor makeUninitialized() { + return DispatchKeyExtractor(c10::utils::bitset()); + } + + void registerSchema(const FunctionSchema& schema) { + TORCH_INTERNAL_ASSERT(dispatch_arg_indices_reverse_.is_entirely_unset()); + dispatch_arg_indices_reverse_ = makeBitsetForDispatchArgs(schema); + } + void deregisterSchema() { + dispatch_arg_indices_reverse_ = c10::utils::bitset(); + } + + DispatchKeySet getDispatchKeySetBoxed(const torch::jit::Stack* stack) const { + DispatchKeySet ks; + dispatch_arg_indices_reverse_.for_each_set_bit([&](size_t + reverse_arg_index) { + const auto& ivalue = torch::jit::peek(*stack, 0, reverse_arg_index + 1); + if (C10_LIKELY(ivalue.isTensor())) { + // NB: Take care not to introduce a refcount bump (there's + // no safe toTensorRef method, alas) + ks = ks | ivalue.unsafeToTensorImpl()->key_set(); + } else if 
(C10_UNLIKELY(ivalue.isTensorList())) { + // NB: use toListRef as it doesn't induce refcount bumps + // (toTensorListRef is not a thing) + for (const auto& nv : ivalue.toListRef()) { + auto* tensor = nv.unsafeToTensorImpl(); + ks = ks | tensor->key_set(); + } + } + // Tensor?[] translates to a c10::List so we need to peek inside + else if (C10_UNLIKELY(ivalue.isList())) { + for (const auto& elt : ivalue.toListRef()) { + if (elt.isTensor()) { + ks = ks | elt.toTensor().key_set(); + } + } + } + }); + // Keys that are fallthrough should be skipped + if (requiresBitsetPerBackend_) { + c10::impl::LocalDispatchKeySet tls = + c10::impl::tls_local_dispatch_key_set(); + auto backend_idx = + ((ks | tls.included_) - tls.excluded_).getBackendIndex(); + return impl::computeDispatchKeySet( + ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } + } + + template + DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const { + auto ks = detail::multi_dispatch_key_set(args...); + // Keys that are fallthrough should be skipped + if (requiresBitsetPerBackend_) { + c10::impl::LocalDispatchKeySet tls = + c10::impl::tls_local_dispatch_key_set(); + auto backend_idx = + ((ks | tls.included_) - tls.excluded_).getBackendIndex(); + return impl::computeDispatchKeySet( + ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } + } + + void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough); + + std::string dumpState() const; + void checkInvariants(const FunctionSchema& schema) const; + + private: + static bool isDispatchType(const Type& type) { + // Checking isSubtypeOf on a DynamicType heap-allocates a + // DynamicType version of the argument if it's not a DynamicType + // already, and this has measurable overhead during startup. 
+#ifdef C10_MOBILE + struct CachedTypes { + DynamicTypePtr listOfTensors; + DynamicTypePtr listOfOptionalTensors; + DynamicTypePtr optionalOfTensor; + }; + static const CachedTypes ct = { + DynamicType::create(*ListType::ofTensors()), + DynamicType::create(*ListType::ofOptionalTensors()), + DynamicType::create(*OptionalType::ofTensor())}; + return type.isSubtypeOf(c10::TypeFactory::get()) || + type.isSubtypeOf(ct.listOfTensors) || + type.isSubtypeOf(ct.listOfOptionalTensors) || + type.isSubtypeOf(ct.optionalOfTensor); +#else // C10_MOBILE + return type.isSubtypeOf(*TensorType::get()) || + type.isSubtypeOf(*ListType::ofTensors()) || + type.isSubtypeOf(*ListType::ofOptionalTensors()) || + type.isSubtypeOf(*OptionalType::ofTensor()); +#endif // C10_MOBILE + } + static c10::utils::bitset makeBitsetForDispatchArgs( + const FunctionSchema& schema) { + TORCH_CHECK( + schema.arguments().size() <= c10::utils::bitset::NUM_BITS(), + "The function schema has ", + schema.arguments().size(), + " arguments but this PyTorch build only supports ", + c10::utils::bitset::NUM_BITS()); + c10::utils::bitset dispatch_arg_indices_reverse; + for (const auto index : c10::irange(schema.arguments().size())) { + if (isDispatchType(*schema.arguments()[index].type())) { + dispatch_arg_indices_reverse.set(schema.arguments().size() - 1 - index); + } + } + return dispatch_arg_indices_reverse; + } + + explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) + : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse), + nonFallthroughKeys_(DispatchKeySet::FULL) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; + } + } + + // this is a bitset that has ones for each argument index which has to be + // considered for dispatch. This avoids having to iterate over the stack + // to find all the tensors. The bits are stored in reverse order, i.e. 
+ // dispatch_arg_indices_reverse_[i] == true, then the i-th argument from + // the top of the stack (i.e. the i-th last argument of the function) + // is relevant for dispatch. + // dispatch_arg_indices_reverse_ is allowed to have zero bits set; that just + // means you must do the fallthrough + c10::utils::bitset dispatch_arg_indices_reverse_; + + // Set of functionality keys for which the operator does NOT have fallthrough + // kernel. + DispatchKeySet nonFallthroughKeys_; + // Set of functionality keys for which the operator does NOT have fallthrough + // kernel, defined PER BACKEND. This is only needed if we know that the + // operator has a different set of fallthroughs defined for some backends. + std::array nonFallthroughKeysPerBackend_; + // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast + // path), or if we need to fall back to the slower path and check + // nonFallthroughKeysPerBackend_ + bool requiresBitsetPerBackend_{false}; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h new file mode 100644 index 0000000000000000000000000000000000000000..2dc51027a01bb6fa8e83c3542d06e3c1008a4db5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h @@ -0,0 +1,955 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef NDEBUG +#include +#endif + +namespace c10 { + +TORCH_API bool show_dispatch_trace(); +TORCH_API void dispatch_trace_nesting_incr(); +TORCH_API void dispatch_trace_nesting_decr(); +TORCH_API int64_t dispatch_trace_nesting_value(); + +struct DispatchTraceNestingGuard { + DispatchTraceNestingGuard() { + dispatch_trace_nesting_incr(); + } + ~DispatchTraceNestingGuard() { + dispatch_trace_nesting_decr(); + } +}; + +class TORCH_API OperatorHandle; +template +class TypedOperatorHandle; + +/** + * Implement this interface and register your instance with the dispatcher + * to get notified when operators are registered or deregistered with + * the dispatcher. + * + * NB: registration events only occur when a 'def' occurs; we don't trigger + * on 'impl' or 'fallback' calls. + */ +class TORCH_API OpRegistrationListener { + public: + virtual ~OpRegistrationListener(); + + virtual void onOperatorRegistered(const OperatorHandle& op) = 0; + virtual void onOperatorDeregistered(const OperatorHandle& op) = 0; +}; + +namespace detail { +class RegistrationListenerList; +} +class SchemaRegistrationHandleRAII; + +/** + * Top-level dispatch interface for dispatching via the dynamic dispatcher. 
+ * Most end users shouldn't use this directly; if you're trying to register + * ops look in op_registration + */ +class TORCH_API Dispatcher final { + private: + // For direct access to backend fallback information + friend class impl::OperatorEntry; + + struct OperatorDef final { + explicit OperatorDef(OperatorName&& op_name) : op(std::move(op_name)) {} + + impl::OperatorEntry op; + + // These refer to the number of outstanding RegistrationHandleRAII + // for this operator. def_count reflects only def() registrations + // (in the new world, this should only ever be 1, but old style + // registrations may register the schema multiple times, which + // will increase this count). def_and_impl_count reflects the number + // of combined def() and impl() registrations. When the last def() gets + // unregistered, we must immediately call the Deregistered listeners, but we + // must not actually delete the handle as there are other outstanding RAII + // destructors which will try to destruct and they had better still have a + // working operator handle in this case + size_t def_count = 0; + size_t def_and_impl_count = 0; + }; + friend class OperatorHandle; + template + friend class TypedOperatorHandle; + + struct Guard final { + Guard() : alive(true) {} + std::atomic alive; + std::mutex mutex; + }; + + public: + ~Dispatcher(); + + // Implementation note: this class abstracts over the fact that we have + // per-operator dispatch tables. This could be easily adjusted to have a + // single global hash table. + static Dispatcher& realSingleton(); + + C10_ALWAYS_INLINE static Dispatcher& singleton() { +#if !defined C10_MOBILE + // Implemented inline so that steady-state code needn't incur + // function-call overhead. We can't just inline `realSingleton` + // because the function-local static would get duplicated across + // all DSOs that include & use this header, leading to multiple + // singleton instances. 
+ static Dispatcher& s = realSingleton(); + return s; +#else + // For C10_MOBILE, we should never inline a static function that + // has a static member, since the generated code calls + // __cxa_guard_acquire and __cxa_guard_release which help + // implement exactly once semantics for the initialization of the + // static Dispatcher& s above (for the non-mobile case). That + // additional code when duplicated across all operator stubs + // for every backend results in a lot of additional code + // being generated by the compiler. + return realSingleton(); +#endif + } + + // ------------------------------------------------------------------------ + // + // Accessing operators by schema + // + // ------------------------------------------------------------------------ + + /** + * Looks for an operator schema with the given name and overload name + * and returns it if it is registered WITH A SCHEMA. + * Returns nullopt otherwise. + */ + std::optional findSchema(const OperatorName& operator_name); + + /** + * Variant of findSchema that results in less code generated at the call site. + * It (1) takes const char* pointer rather than OperatorName (so we skip + * generating std::string constructor calls at the call site), and (2) + * it raises an exception if the operator is not found (so we skip + * generating exception raising code at the call site) + * + * Irritatingly, we still have to generate the handful of instructions + * for dealing with an exception being thrown during static initialization + * (e.g. __cxa_guard_abort). If we could annotate this method noexcept we + * could avoid this code too, but as the name of the function suggests, + * it does throw exceptions. 
+ */ + OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name); + + // Like findSchema, but also returns OperatorHandle even if there is no schema + std::optional findOp(const OperatorName& operator_name); + + // Returns a list of all operator names present in the operatorLookupTable_ + const std::vector getAllOpNames(); + + // Returns a list of all operator names present in the operatorLookupTable_ + // for a given dispatch key + const std::vector getAllOpNamesForDispatchKey(DispatchKey k); + + // ------------------------------------------------------------------------ + // + // Invoking operators + // + // ------------------------------------------------------------------------ + + template + Return call(const TypedOperatorHandle& op, Args... args) + const; + + template + static Return callWithDispatchKeySlowPath( + const TypedOperatorHandle& op, + at::StepCallbacks& stepCallbacks, + DispatchKeySet dispatchKeySet, + const KernelFunction& kernel, + Args... args); + + // Like call, but intended for use in a redispatch in kernels that have + // explicitly performed the DispatchKey update calculatulation. This will take + // the DispatchKeySet completely as is and dispatch to the kernel of the + // corresponding highest priority key in the set. Note that this version of + // redispatch treats the inputted DispatchKeySet *as is*, and does NOT mask + // out the highest priority key. See Note [Plumbing Keys Through The + // Dispatcher] + template + Return redispatch( + const TypedOperatorHandle& op, + DispatchKeySet currentDispatchKeySet, + Args... 
args) const; + + // Invoke an operator via the boxed calling convention using an IValue stack + void callBoxed(const OperatorHandle& op, Stack* stack) const; + void callBoxedForDispatchKey( + const OperatorHandle& op, + DispatchKey dk, + Stack* stack) const; + + // TODO: This will only be useful if we write a backend fallback that plumbs + // dispatch keys (currently there are none) See Note [Plumbing Keys Through + // The Dispatcher] + void redispatchBoxed( + const OperatorHandle& op, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + bool hasBackendFallbackForDispatchKey(DispatchKey dk) { + auto dispatch_ix = getDispatchTableIndexForDispatchKey(dk); + if (dispatch_ix < 0) + return false; + return backendFallbackKernels_[dispatch_ix].kernel.isValid(); + } + + // Used by torchdeploy/multipy for multiple // codespell:ignore: multipy + // interpreters racing. + void waitForDef(const FunctionSchema& schema); + void waitForImpl( + const OperatorName& op_name, + std::optional dispatch_key); + + // ------------------------------------------------------------------------ + // + // Performing registrations (NON user public; use op_registration) + // + // ------------------------------------------------------------------------ + + /** + * Register a new operator schema. + * + * If a schema with the same operator name and overload name already exists, + * this function will check that both schemas are exactly identical. + */ + RegistrationHandleRAII registerDef( + FunctionSchema schema, + std::string debug, + std::vector tags = {}); + + /** + * Register a kernel to the dispatch table for an operator. + * If dispatch_key is nullopt, then this registers a fallback kernel. + * + * @return A RAII object that manages the lifetime of the registration. + * Once that object is destructed, the kernel will be deregistered. 
+ */ + // NB: steals the inferred function schema, as we may need to hold on to + // it for a bit until the real schema turns up + RegistrationHandleRAII registerImpl( + OperatorName op_name, + std::optional dispatch_key, + KernelFunction kernel, + std::optional cpp_signature, + std::unique_ptr inferred_function_schema, + std::string debug); + + /** + * Given an operator, tells the Dispatcher that we have implemented a fake + * impl for this op in the given Python module. Call this a "pystub". + */ + RegistrationHandleRAII registerPythonModule( + const OperatorName& op_name, + const char* pymodule, + const char* context); + + /** + * Given an operator, throws if we have a pystub. + */ + void throwIfHasPythonModule(OperatorName op_name); + + std::optional> getPyStub( + OperatorName op_name); + + /** + * Register a new operator by name. + */ + RegistrationHandleRAII registerName(OperatorName op_name); + + /** + * Register a fallback kernel for a backend. + * If an operator is called but there is no concrete kernel for the dispatch + * key of the given operator arguments, it will check if there is such a + * fallback kernel for the given dispatch key and, if yes, call that one. + */ + RegistrationHandleRAII registerFallback( + DispatchKey dispatch_key, + KernelFunction kernel, + std::string debug); + + /** + * Use to register whenever we had a TORCH_LIBRARY declaration in the frontend + * API. These invocations are only permitted once per program, so we raise + * an error if this is called again for the same namespace. + */ + RegistrationHandleRAII registerLibrary(std::string ns, std::string debug); + + // ------------------------------------------------------------------------ + // + // Listeners on registrations + // + // ------------------------------------------------------------------------ + + /** + * Add a listener that gets called whenever a new op is registered or an + * existing op is deregistered. 
Immediately after registering, this listener + * gets called for all previously registered ops, so it can be used to keep + * track of ops registered with this dispatcher. + */ + RegistrationHandleRAII addRegistrationListener( + std::unique_ptr listener); + + void checkInvariants() const; + + // + // ------------------------------------------------------------------------ + // + // Assertions + // + // ------------------------------------------------------------------------ + + /** + * For testing purposes. + * Returns a list of all operators that were created through calls to + * registerImpl(), without any corresponding calls to registerDef(). After + * static initialization is done this is almost certainly a bug, as the + * created OperatorHandle won't have any schema associated with it and users + * calling the op through the dispatcher won't be able to access it + * + * Note that we cannot enforce this invariant "as we go" during static + * initialization, due to undefined static initialization order- we have no + * guarantees over the order in which .def() and .impl() calls are registered + * in the dispatcher at static initialization time. So this function should + * only be called after static initialization. + */ + std::vector findDanglingImpls() const; + + /** + * Useful for inspecting global Dispatcher registration state. + * Returns the names of all operators with a kernel registered for the + * specified DispatchKey. If no DispatchKey is specified, it returns all + * registered operators. 
+ */ + std::vector getRegistrationsForDispatchKey( + std::optional k) const; + + private: + Dispatcher(); + + static int64_t sequenceNumberForRunningRecordFunction( + DispatchKey dispatchKey, + DispatchKeySet dispatchKeySet); + static void runRecordFunction( + at::RecordFunction& guard, + at::RecordFunction::schema_ref_t schema_ref, + DispatchKey dispatchKey, + DispatchKeySet dispatchKeySet); + static void runRecordFunction( + at::RecordFunction& guard, + at::RecordFunction::schema_ref_t schema_ref, + DispatchKey dispatchKey, + DispatchKeySet dispatchKeySet, + c10::ArrayRef args); + +#ifdef FBCODE_CAFFE2 + static bool profilingOperatorEvents(); + static void fireOpStartUSDT( + at::RecordFunction::schema_ref_t schema_ref, + std::vector& argsAddresses, + std::vector& argsTypes); + static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref); +#endif // FBCODE_CAFFE2 + + OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema); + OperatorHandle findOrRegisterName_(const OperatorName& op_name); + + void deregisterDef_(const OperatorHandle& op, const OperatorName& op_name); + void deregisterImpl_( + const OperatorHandle& op, + const OperatorName& op_name, + std::optional dispatch_key, + impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle); + void deregisterName_(const OperatorHandle& op, const OperatorName& op_name); + void deregisterFallback_(DispatchKey dispatchKey); + void deregisterLibrary_(const std::string& ns); + void cleanup(const OperatorHandle& op, const OperatorName& op_name); + void checkSchemaCompatibility( + const OperatorHandle& op, + const FunctionSchema& schema, + const std::string& debug); + + std::list operators_; +#if !defined(C10_MOBILE) + LeftRight> + operatorLookupTable_; +#else + RWSafeLeftRightWrapper> + operatorLookupTable_; +#endif + // Map from namespace to debug string (saying, e.g., where the library was + // defined) + ska::flat_hash_map libraries_; + + std::array + backendFallbackKernels_; + + 
std::unique_ptr listeners_; + + // This condition variable gets notified whenever we add a new def/impl to the + // dispatch table. This is primarily used by multiply/torchdeploy, when + // we have multiple interpreters trying to register to the dispatch table. + // In this situation, whenever the non-primary interpreter would have tried + // to register to the dispatch table, instead it will check to see if the + // expected registration has already been made, and if it hasn't, wait on + // this condition variable to see if it was just racing with the primary + // interpreter. + // + // We expect it to be rare for there to be any waiters on this condition + // variable. This is mostly just to help give better diagnostics if + // something goes horribly wrong + std::condition_variable cond_var_; + + // Protect concurrent access to the dispatcher. We store this in a + // `shared_ptr` as we return callbacks that call back into dispatcher methods, + // and we need to be able to handle and guard against the event when the + // `Dispatcher` has been destroyed before the callbacks fire. + std::shared_ptr guard_; +}; + +/** + * This is a handle to an operator schema registered with the dispatcher. + * This handle can be used to register kernels with the dispatcher or + * to lookup a kernel for a certain set of arguments. 
+ */ +class TORCH_API OperatorHandle { + template + friend struct std::hash; + + public: + OperatorHandle(OperatorHandle&&) noexcept = default; + OperatorHandle& operator=(OperatorHandle&&) noexcept = default; + OperatorHandle(const OperatorHandle&) = default; + OperatorHandle& operator=(const OperatorHandle&) = default; + // NOLINTNEXTLINE(performance-trivially-destructible) + ~OperatorHandle(); + + const OperatorName& operator_name() const { + return operatorDef_->op.operator_name(); + } + + bool hasSchema() const { + return operatorDef_->op.hasSchema(); + } + + const FunctionSchema& schema() const { + return operatorDef_->op.schema(); + } + + const std::string& debug() const { + return operatorDef_->op.debug(); + } + + std::string dumpState() const { + return operatorDef_->op.dumpState(); + } + + bool hasKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.hasKernelForDispatchKey(k); + } + + bool isKernelFallthroughKernel(DispatchKey k) const { + return operatorDef_->op.kernelForDispatchKey(k).isFallthrough(); + } + + bool hasKernelForAnyDispatchKey(DispatchKeySet k) const { + return operatorDef_->op.hasKernelForAnyDispatchKey(k); + } + + bool hasComputedKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.hasComputedKernelForDispatchKey(k); + } + + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.getComputedKernelForDispatchKey(k); + } + + std::string dumpComputedTable() const { + return operatorDef_->op.dumpComputedTable(); + } + + void checkInvariants() const { + operatorDef_->op.checkInvariants(); + } + + c10::ArrayRef getTags() const { + return operatorDef_->op.getTags(); + } + + void setReportErrorCallback_(std::unique_ptr callback) { + operatorDef_->op.setReportErrorCallback_(std::move(callback)); + } + + bool hasTag(const at::Tag& tag) const { + for (const auto& tag_ : getTags()) { + if (tag == tag_) { + return true; + } + } + return false; + } + + template + 
TypedOperatorHandle typed() const { + // NB: This assert is not 100% sound: you can retrieve a typed() operator + // handle prior to ANY C++ signature being registered on the operator + // and the check will say everything is OK (at which point you can then + // smuggle in a kernel that is typed incorrectly). For everything + // in core library this won't happen, because all the static registrations + // will be done by the time a typed() handle is acquired. +#if !defined C10_MOBILE + operatorDef_->op.assertSignatureIsCorrect(); + if (fn_has_symint::value) { + operatorDef_->op.assertSignatureIsCorrect< + typename fn_remove_symint::type>(); + } +#endif + return TypedOperatorHandle(operatorIterator_); + } + + void callBoxed(Stack* stack) const { + c10::Dispatcher::singleton().callBoxed(*this, stack); + } + + void callBoxed(Stack& stack) const { + callBoxed(&stack); + } + + void callBoxedForDispatchKey(DispatchKey dk, Stack& stack) const { + c10::Dispatcher::singleton().callBoxedForDispatchKey(*this, dk, &stack); + } + + void redispatchBoxed(DispatchKeySet ks, Stack* stack) const { + c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack); + } + + template + PyObject* getPythonOp( + c10::impl::PyInterpreter* self_interpreter, + F slow_accessor) const { + return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor); + } + + bool operator==(const OperatorHandle& other) const { + return operatorDef_ == other.operatorDef_; + } + + bool operator!=(const OperatorHandle& other) const { + return operatorDef_ != other.operatorDef_; + } + + private: + explicit OperatorHandle( + std::list::iterator operatorIterator) + : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator) {} + friend class Dispatcher; + template + friend class TypedOperatorHandle; + + // Storing a direct pointer to the OperatorDef even though we + // already have the iterator saves an instruction in the critical + // dispatch path. 
The iterator is effectively a + // pointer-to-std::list-node, and (at least in libstdc++'s + // implementation) the element is at an offset 16 bytes from that, + // because the prev/next pointers come first in the list node + // struct. So, an add instruction would be necessary to convert from the + // iterator to an OperatorDef*. + Dispatcher::OperatorDef* operatorDef_; + + // We need to store this iterator in order to make + // Dispatcher::cleanup() fast -- it runs a lot on program + // termination (and presumably library unloading). + std::list::iterator operatorIterator_; +}; + +/** + * This is a handle to an operator schema registered with the dispatcher. + * It holds the same information as an OperatorHandle, but it is templated + * on the operator arguments and allows calling the operator in an + * unboxed way. + */ +template +class TypedOperatorHandle final { + static_assert( + guts::false_t(), + "FuncType in OperatorHandle::typed was not a valid function type"); +}; +template +class TypedOperatorHandle final : public OperatorHandle { + public: + TypedOperatorHandle(TypedOperatorHandle&&) noexcept = default; + TypedOperatorHandle& operator=(TypedOperatorHandle&&) noexcept = default; + TypedOperatorHandle(const TypedOperatorHandle&) = default; + TypedOperatorHandle& operator=(const TypedOperatorHandle&) = default; + + // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use + // && + C10_ALWAYS_INLINE Return call(Args... args) const { + return c10::Dispatcher::singleton().call( + *this, std::forward(args)...); + } + + // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use + // && + C10_ALWAYS_INLINE Return + redispatch(DispatchKeySet currentDispatchKeySet, Args... 
args) const { + return c10::Dispatcher::singleton().redispatch( + *this, currentDispatchKeySet, std::forward(args)...); + } + + private: + explicit TypedOperatorHandle( + std::list::iterator operatorIterator) + : OperatorHandle(operatorIterator) {} + friend class OperatorHandle; +}; + +namespace detail { +template +inline void unused_arg_(const Args&... /*unused*/) {} + +// CaptureKernelCall is intended to capture return values from Dispatcher +// unboxed kernel calls. A record function may request to get outputs from the +// kernel calls. For boxed kernels, it's straightforward, the returned values +// are in the stack object. The stack can be passed to record functions. For +// unboxed kernels, we need to handle different kinds of return values, cache +// them temporarily, then release the values for the actual function call +// return. +template +struct CaptureKernelCall { + template + CaptureKernelCall( + const F& kernel, + const TypedOperatorHandle& op, + const DispatchKeySet& dispatchKeySet, + Args&&... args) + // Calls the kernel and capture the result in output_. + : output_{kernel.template call( + op, + dispatchKeySet, + std::forward(args)...)} {} + // Wraps the return values in a Stack. + Stack getOutputs() { + Stack stack; + impl::push_outputs::copy(output_, &stack); + return stack; + } + // Since we are returning the output_, we don't expect the output_ to be used + // afterward. Copy elision and RVO do not apply to class data members. Using + // move semantic to avoid copies when possible. + ReturnType release() && { + return std::move(output_); + } + + private: + ReturnType output_; +}; + +// Handle the lvalue reference differently since it should not be moved. +template <> +inline at::Tensor& CaptureKernelCall::release() && { + return output_; +} + +// Handle case where the kernel returns void. 
+template <> +struct CaptureKernelCall { + template + CaptureKernelCall( + const F& kernel, + const TypedOperatorHandle& op, + const DispatchKeySet& dispatchKeySet, + Args&&... args) { + // Calling the kernel and no need to capture void. + kernel.template call( + op, dispatchKeySet, std::forward(args)...); + } + Stack getOutputs() { + return Stack(); + } + void release() && {} +}; + +TORCH_API void _print_dispatch_trace( + const std::string& label, + const std::string& op_name, + const DispatchKeySet& dispatchKeySet); + +} // namespace detail + +// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && +template +inline Return Dispatcher::callWithDispatchKeySlowPath( + const TypedOperatorHandle& op, + at::StepCallbacks& stepCallbacks, + DispatchKeySet dispatchKeySet, + const KernelFunction& kernel, + Args... args) { + // If callbacks need inputs, we box the arguments and pass them to the guard. + // Note: For perf reasons we wouldn't want to prematurely box the arguments. + at::RecordFunction guard(std::move(stepCallbacks)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved()); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + constexpr auto num_boxed_args = impl::boxed_size(); + if constexpr (num_boxed_args != 0) { + if (guard.needsInputs()) { + // If we used std::array here, we would + // have to spend time default constructing the IValues in + // boxedArgs. aligned_storage has no such requirement. + // NOLINTNEXTLINE(*array*) + alignas(IValue) std::byte boxedArgs[num_boxed_args * sizeof(IValue)]; + // For debugging only; could be removed (but the compiler will do + // that for us and it's nice to have the extra assurance of + // correctness from our debug builds). 
+ IValue* boxedArgsPtr = reinterpret_cast(boxedArgs); + impl::boxArgsToStack(boxedArgsPtr, args...); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + reinterpret_cast(boxedArgsPtr) == + boxedArgs + num_boxed_args * sizeof(IValue)); + // I don't *think* we need std::launder here, because IValue has + // no subclasses and no const or reference fields. + runRecordFunction( + guard, + schema_ref, + dispatchKey, + dispatchKeySet, + c10::ArrayRef( + reinterpret_cast(boxedArgs), num_boxed_args)); + boxedArgsPtr = reinterpret_cast(boxedArgs); + for (size_t ii = 0; ii < num_boxed_args; ++ii) { + (boxedArgsPtr + ii)->~IValue(); + } + } else { + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); + } + } else { + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); + } + + if (C10_UNLIKELY(guard.needsOutputs())) { + // Calls the kernel and capture the output temporarily to pass to + // RecordFunction. + detail::CaptureKernelCall captureKernelCall( + kernel, op, dispatchKeySet, std::forward(args)...); + guard.setOutputs(captureKernelCall.getOutputs()); + // Releases the captured output to return to caller. + return std::move(captureKernelCall).release(); + } + + // keeping the guard alive while executing the kernel + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); +} + +// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && +template +C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call( + const TypedOperatorHandle& op, + Args... 
args) const { + auto dispatchKeySet = + op.operatorDef_->op.dispatchKeyExtractor() + .template getDispatchKeySetUnboxed(args...); +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[call]", toString(op.operator_name()), dispatchKeySet); + } +#endif + const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet); +#ifndef PYTORCH_DISABLE_PER_OP_PROFILING + auto step_callbacks = + at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION); + if (C10_UNLIKELY( + step_callbacks.has_value() && op.operatorDef_->op.isObserved())) { + return callWithDispatchKeySlowPath( + op, + *step_callbacks, + dispatchKeySet, + kernel, + std::forward(args)...); + } +#endif // PYTORCH_DISABLE_PER_OP_PROFILING + +#ifdef FBCODE_CAFFE2 + if (profilingOperatorEvents()) { + std::vector argsAddresses = {(void*)(&args)...}; + std::vector argsTypes = {(typeid(args).name())...}; + struct FireOpRAII { + FireOpRAII( + at::RecordFunction::schema_ref_t schema_ref, + std::vector& argsAddresses, + std::vector& argsTypes) + : schema_ref_(schema_ref) { + fireOpStartUSDT(schema_ref, argsAddresses, argsTypes); + } + ~FireOpRAII() { + fireOpEndUSDT(schema_ref_); + } + at::RecordFunction::schema_ref_t schema_ref_; + } event(op.schema(), argsAddresses, argsTypes); + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); + } else { + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); + } +#else + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); +#endif // FBCODE_CAFFE2 +} + +// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && +template +inline Return Dispatcher::redispatch( + const TypedOperatorHandle& op, + DispatchKeySet currentDispatchKeySet, + Args... 
args) const { + // do not use RecordFunction on redispatch +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[redispatch]", toString(op.operator_name()), currentDispatchKeySet); + } +#endif + const KernelFunction& kernel = + op.operatorDef_->op.lookup(currentDispatchKeySet); + return kernel.template call( + op, currentDispatchKeySet, std::forward(args)...); +} + +inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) + const { + // note: this doesn't need the mutex because write operations on the list keep + // iterators intact. + const auto& entry = op.operatorDef_->op; + auto dispatchKeySet = + entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[callBoxed]", toString(op.operator_name()), dispatchKeySet); + } +#endif + const auto& kernel = entry.lookup(dispatchKeySet); +#ifndef PYTORCH_DISABLE_PER_OP_PROFILING + auto step_callbacks = + at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(step_callbacks.has_value() && entry.isObserved())) { + at::RecordFunction guard(std::move(*step_callbacks)); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + guard.needsInputs() + ? 
runRecordFunction( + guard, + schema_ref, + dispatchKey, + dispatchKeySet, + c10::ArrayRef(stack->data(), stack->size())) + : runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); + + // keeping the guard alive while executing the kernel + kernel.callBoxed(op, dispatchKeySet, stack); + + if (C10_UNLIKELY(guard.needsOutputs())) { + guard.setOutputs(*stack); + } + return; + } +#endif // PYTORCH_DISABLE_PER_OP_PROFILING + kernel.callBoxed(op, dispatchKeySet, stack); +} + +// NB: this doesn't count as a "true" dispatcher jump, so no instrumentation +inline void Dispatcher::callBoxedForDispatchKey( + const OperatorHandle& op, + DispatchKey dk, + Stack* stack) const { + // note: this doesn't need the mutex because write operations on the list keep + // iterators intact. + const auto& entry = op.operatorDef_->op; + // We still compute this as we're obligated to pass it on to the internal + // kernel, if it is a boxed fallback + auto dispatchKeySet = + entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); + const auto& kernel = ([&]() { + if (op.hasKernelForDispatchKey(dk)) { + return entry.kernelForDispatchKey(dk); + } else { + auto idx = getDispatchTableIndexForDispatchKey(dk); + TORCH_INTERNAL_ASSERT(idx >= 0); + return backendFallbackKernels_[idx].kernel; + } + })(); + kernel.callBoxed(op, dispatchKeySet, stack); +} + +inline void Dispatcher::redispatchBoxed( + const OperatorHandle& op, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + // note: this doesn't need the mutex because write operations on the list keep + // iterators intact. 
+ const auto& entry = op.operatorDef_->op; +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[redispatchBoxed]", toString(op.operator_name()), dispatchKeySet); + } +#endif + const auto& kernel = entry.lookup(dispatchKeySet); + kernel.callBoxed(op, dispatchKeySet, stack); +} + +} // namespace c10 + +namespace std { + +template <> +struct hash { + size_t operator()(const c10::OperatorHandle& op) const noexcept { + return std::hash{}(static_cast(op.operatorDef_)); + } +}; + +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h new file mode 100644 index 0000000000000000000000000000000000000000..ddd4e653c3f67786dd37e93e3ca1ab1e75acf697 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +struct TORCH_API ObservedOperators { + ObservedOperators() = delete; + + static bool isObserved(const OperatorName& name); + + static std::unordered_set& getUnobservedOperatorList(); +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h new file mode 100644 index 0000000000000000000000000000000000000000..fb78faeedd41167e446c29542349acfcb2f2cce5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h @@ -0,0 +1,342 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef C10_MOBILE +#define C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY +#endif + +namespace c10 { + +class Dispatcher; + +namespace impl { + +// This data structure represents a kernel that was registered to us from a +// user. Unlike KernelFunction, AnnotatedKernel contains some extra metadata +// about the kernel that isn't necessary for actual dispatching (this is why +// we don't put AnnotatedKernel in the actual DispatchTable), but is useful for +// giving good error messages. +struct AnnotatedKernel final { + AnnotatedKernel( + KernelFunction k, + std::unique_ptr s, + std::string d) + : kernel(std::move(k)), + inferred_function_schema(std::move(s)), + debug(std::move(d)) {} + AnnotatedKernel() = default; + KernelFunction kernel; + std::unique_ptr inferred_function_schema; + // A little debug string to help us identify the kernel in question. + // Most importantly it records the TORCH_LIBRARY block that did the + // registration. 
+ std::string debug; +}; + +// This data structure represents operator schema, with metadata specifying +// where the registration of this schema occurred +struct AnnotatedSchema final { + AnnotatedSchema(FunctionSchema s, std::string d) + : schema(std::move(s)), debug(std::move(d)) {} + FunctionSchema schema; + std::string debug; +}; + +// Internal data structure that records information about a specific operator. +// It's not part of the public API; typically, users will interact with +// OperatorHandle instead. +// +// Concurrent writes to OperatorEntry are protected by the GLOBAL Dispatcher +// lock (this is important because some methods in OperatorEntry access +// dispatcher state) +class TORCH_API OperatorEntry final { + public: + explicit OperatorEntry(OperatorName&& operator_name); + + OperatorEntry(const OperatorEntry&) = delete; + OperatorEntry(OperatorEntry&&) noexcept = delete; + OperatorEntry& operator=(const OperatorEntry&) = delete; + OperatorEntry& operator=(OperatorEntry&&) noexcept = delete; + + const FunctionSchema& schema() const { + TORCH_INTERNAL_ASSERT( + schema_.has_value(), + "Tried to access the schema for ", + name_, + " which doesn't have a schema registered yet"); + return schema_->schema; + } + const std::string& debug() const { + TORCH_INTERNAL_ASSERT(schema_.has_value()); + return schema_->debug; + } + bool hasSchema() const { + return schema_.has_value(); + } + + bool isObserved() const { + return is_observed_; + } + + // We may allocate an OperatorEntry for an operator even when we don't + // have a schema. When we receive the schema registration, we post + // facto register a schema. + // + // NB: registerSchema/deregisterSchema are not idempotent; if you + // attempt to register a schema when one is already present or vice + // versa that is an error. 
(Refcounting for the registrations is + // handled in the OperatorHandle in Dispatcher) + void registerSchema( + FunctionSchema&& /*schema*/, + std::string&& debug, + std::vector tags = {}); + void deregisterSchema(); + + const OperatorName& operator_name() const { + return name_; + } + +#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY + using AnnotatedKernelContainer = std::array; +#else + using AnnotatedKernelContainer = std::list; +#endif + using AnnotatedKernelContainerIterator = AnnotatedKernelContainer::iterator; + + // Why are kernels and fallback asymmetric? It has to do with ownership. + // Kernels and the computed dispatch tables for them are canonically + // owned by OperatorEntry, but backend fallbacks are specified once + // and apply for all operators, so they should be owned by Dispatcher. + // However, the registration of a backend fallback affects the + // state of the computed dispatch table, so when a backend fallback + // is updated, we need to update the operator tables too. Thus, + // registerKernel is the mechanism by which we give kernels to + // operator entry to own (and update dispatch table), but we only + // need a non-owning mechanism to update fallback. 
+ + // Precondition: Dispatcher::mutex_ is held + // Postcondition: caller is responsible for disposing of the kernel + AnnotatedKernelContainerIterator registerKernel( + const Dispatcher& dispatcher, + std::optional dispatch_key, + KernelFunction kernel, + std::optional cpp_signature, + std::unique_ptr inferred_function_schema, + std::string debug); + + // Precondition: Dispatcher::mutex_ is held + void deregisterKernel_( + const Dispatcher& dispatcher, + std::optional dispatch_key, + AnnotatedKernelContainerIterator kernel); + + // Precondition: Dispatcher::mutex_ is held + void updateFallback(const Dispatcher& dispatcher, DispatchKey dispatch_key); + + // Precondition: Dispatcher::mutex_ is held + void updateSchemaAliasAnalysis(AliasAnalysisKind a) { + TORCH_INTERNAL_ASSERT(schema_.has_value()); + schema_->schema.setAliasAnalysis(a); + } + + std::string dumpComputedTable() const; + std::string dumpState() const; + void checkInvariants() const; + + const DispatchKeyExtractor& dispatchKeyExtractor() const { + return dispatchKeyExtractor_; + } + + // Asserts that the given FuncType is correct for calling this operator in an + // unboxed way. + template + inline void assertSignatureIsCorrect() { + assertSignatureIsCorrect( + CppSignature::make(), fn_has_symint::value); + } + + void assertSignatureIsCorrect( + const CppSignature& call_signature, + bool has_symint) const; + + [[noreturn]] void reportError(DispatchKey dispatchKey) const; + + const KernelFunction& lookup(DispatchKeySet ks) const { + const auto idx = ks.getDispatchTableIndexForDispatchKeySet(); + if (C10_UNLIKELY(idx == -1)) { + reportError(ks.highestPriorityTypeId()); + } + const auto& kernel = dispatchTable_[idx]; + // A valid kernel *always* has a boxed kernel and *may* have an + // unboxed kernel. However, we typically do unboxed calls in at:: + // APIs, where the kernel 1) will very likely be valid and 2) + // should have an unboxed kernel. 
Checking the unboxed kernel + // first will allow us to avoid touching the boxed kernel at all + // in the common case. + if (C10_UNLIKELY(!kernel.isValidUnboxed())) { + if (!kernel.isValid()) { + reportError(ks.highestPriorityTypeId()); + } + } + return kernel; + } + + std::string listAllDispatchKeys() const; + + // Returns true if kernel_ has entry for any key in ks. + // + // Invariant: There are no alias keys in the passed-in dispatch key set. + // Note [No Alias Keys in DispatchKeySet] + // Alias keys should be checked using `hasKernelForDispatchKey` + // Alias keys shouldn't go inside of a DispatchKeySet, since they can + // technically have a value > 63 (causing overflow). + bool hasKernelForAnyDispatchKey(DispatchKeySet ks) const; + // Returns true if kernel_ has entry for a particular key. + bool hasKernelForDispatchKey(DispatchKey k) const; + // Retrieves the kernel entry at a particular key. Symmetric with + // hasKernelForDispatchKey. To get the AnnotatedKernel, see + // getKernelForDispatchKey (private) + const KernelFunction& kernelForDispatchKey(DispatchKey k) const; + // Returns true if the "computed table" has an entry for a particular key. 
+ bool hasComputedKernelForDispatchKey(DispatchKey k) const; + // Returns a KernelFunction corresponding to the kernel in dispatchTable + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const; + // Returns all the operator tags added at the time of registration + const std::vector& getTags() const; + void setReportErrorCallback_(std::unique_ptr callback); + + template + PyObject* getPythonOp(PyInterpreter* self_interpreter, F slow_accessor) + const { + return py_cache_.ptr_or(self_interpreter, slow_accessor); + } + + private: + OperatorName name_; + std::optional schema_; +#ifndef C10_MOBILE + std::vector tags_; +#endif + std::array dispatchTable_; + DispatchKeyExtractor dispatchKeyExtractor_; + // Pointer to the torch.ops.ns.op.overload object for speed + c10::PyHandleCache py_cache_; + + // kernels_ stores all registered kernels for the corresponding dispatch key + // and catchAllKernels_ stores the catch-all kernels. + // If an operator library gets loaded that overwrites an already existing + // kernel, both kernels will be in that list but only the newer one will be in + // dispatchTable. If any of the kernels go away (say the library gets + // unloaded), we remove the kernel from this list and update the + // dispatchTable if necessary. + // Kernels in the list are ordered by registration time descendingly, + // newer registrations are before older registrations. + // We do not combine dispatchTable and kernels into one hash map because + // kernels is a larger data structure and accessed quite infrequently + // while dispatchTable is accessed often and should be kept small to fit + // into CPU caches. + // Invariants: + // - dispatchTable[dispatch_key] == kernels_[dispatch_key].front() + // - dispatchTable[dispatch_key] does not exist if and only if + // kernels_[dispatch_key] does not exist + // - If kernels_[dispatch_key] exists, then it has elements. + // It is never an empty list. + // + // Why do we do that? 
+ // ----- + // We mostly do this to enable Jupyter notebooks where a cell registering + // a kernel could be executed multiple times and the later execution + // should overwrite the earlier one. Note that this still fails when the + // function schema changed between the executions, but it works as long + // as the function schema didn't change. A better solution would be to + // unload the old extension library from the Jupyter cell when the cell is + // re-executed and then only allow one kernel here, i.e. error if a kernel + // is already registered, but that's a lot of effort to implement and + // currently not high-pri. + ska::flat_hash_map< + DispatchKey, +#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY + // On mobile, we needn't worry about Jupyter notebooks. + std::array +#else + std::list +#endif + > + kernels_; + + const AnnotatedKernel& missingKernel() const; + const AnnotatedKernel& ambiguousAutogradOtherKernel() const; + + // cpp_signature_ stores function signature if any of + // the kernels was created in a way that allowed us to know the function + // signature (i.e. by supplying an unboxed C++ kernel function). + // If this is set, it will be used to check that future kernel + // registrations match and it will be used in unboxed function calls + // to verify their arguments against the known function signature. 
+ struct CppSignatureWithDebug { + CppSignature signature; + std::string debug; + std::optional dispatch_key; + }; + std::optional cpp_signature_; + std::optional sym_cpp_signature_; + + // A Python custom error handler for OperatorEntry::reportError + std::unique_ptr report_error_callback_; + + // Whether this operator needs to be observed with RecordFunction + const bool is_observed_; + + [[noreturn]] void reportSignatureError( + const CppSignature& call_signature, + const CppSignatureWithDebug& saved_signature) const; + const KernelFunction& computeDispatchTableEntry( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key) const; + std::pair + computeDispatchTableEntryWithDebug( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key) const; + // This function re-establishes the invariant that dispatchTable + // contains the front element from the kernels list for a given runtime + // dispatch key. + void updateDispatchTableEntry_( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key); + // Like above, but also handles alias dispatch keys. + void updateDispatchTable_( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key); + // Like above, but for ALL entries in the dispatch table. + void updateDispatchTableFull_(const c10::Dispatcher& dispatcher); + // Retrieves a pointer to AnnotatedKernel at + // kernels_.at(dispatch_key).front(). + const AnnotatedKernel* getKernelForDispatchKey( + DispatchKey dispatch_key) const; +}; + +} // namespace impl +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..7d506e7a43784a38e1646e6ced419bdf8f080aac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +enum class AliasAnalysisKind : uint8_t { + INTERNAL_SPECIAL_CASE, + CONSERVATIVE, // The most conservative alias analysis type, assumes + // side-effects. This is the default analysis. + FROM_SCHEMA, + PURE_FUNCTION +}; + +#if !defined(_MSC_VER) +constexpr // Our current MSVC version has a bug that doesn't allow this to be + // constexpr. +#endif + inline const char* + toString(AliasAnalysisKind aliasAnalysisKind) { + return (aliasAnalysisKind == AliasAnalysisKind::CONSERVATIVE) ? "CONSERVATIVE" + : (aliasAnalysisKind == AliasAnalysisKind::FROM_SCHEMA) ? "FROM_SCHEMA" + : (aliasAnalysisKind == AliasAnalysisKind::PURE_FUNCTION) + ? "PURE_FUNCTION" + : (aliasAnalysisKind == AliasAnalysisKind::INTERNAL_SPECIAL_CASE) + ? "INTERNAL_SPECIAL_CASE" + : "UNKNOWN"; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h new file mode 100644 index 0000000000000000000000000000000000000000..c66b08e8350e355de45777c2e15f71dcdbb8a2f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +class RegistrationHandleRAII final { + public: + explicit RegistrationHandleRAII(std::function onDestruction) + : onDestruction_(std::move(onDestruction)) {} + + ~RegistrationHandleRAII() { + if (onDestruction_) { + onDestruction_(); + } + } + + RegistrationHandleRAII(const RegistrationHandleRAII&) = delete; + RegistrationHandleRAII& operator=(const RegistrationHandleRAII&) = delete; + + RegistrationHandleRAII(RegistrationHandleRAII&& rhs) noexcept + : onDestruction_(std::move(rhs.onDestruction_)) { + rhs.onDestruction_ = nullptr; + } + + RegistrationHandleRAII& operator=(RegistrationHandleRAII&& rhs) noexcept { + onDestruction_ = std::move(rhs.onDestruction_); + rhs.onDestruction_ = nullptr; + return *this; + } + + private: + std::function onDestruction_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h new file mode 100644 index 0000000000000000000000000000000000000000..41936f74d3f79450df15220020866d9ca2de492a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h @@ -0,0 +1,86 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +/* + * [Note: hacky wrapper removal for optional tensor] + * + * The kernel implementation takes an optional tensor marked in the schema as + * Tensor? but the C++ function takes Tensor instead of the std::optional + * expected by the dispatcher. + * + * To remove the hacky wrapper, the C++ function is changed to take + * std::optional and unwrap the Tensor value at the beginning of + * the function, e.g.: + * > c10::MaybeOwned weight_maybe_owned = + * > at::borrow_from_optional_tensor(weight_opt); + * > const Tensor& weight = *weight_maybe_owned; + * + * We may want to make the kernel handle optional directly without + * going through the creation of a default-constructed Tensor in + * at::borrow_from_optional_tensor. + */ + +/* + * [Note: hacky wrapper removal for TensorOptions] + * + * The kernel implementation takes a TensorOptions argument but the dispatcher + * expects separate arguments for dtype, layout, device, pin_memory. 
+ * + * To remove the hacky wrapper, the kernel implementation is changed to take + * the 4 arguments (dtype, layout, device, pin_memory), and assemble the + * TensorOptions value at the beginning of the function, e.g.: + * > TensorOptions options = TensorOptions().dtype(dtype).layout(layout) + * > .device(device).pinned_memory(pin_memory); + * + * We may want make the kernel handle these parameters directly without going + * through the creation of a TensorOptions value. + */ + +namespace c10::impl { + +TORCH_API void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName); + +inline void check_and_update_common_device(std::optional& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { + // TODO: Remove this once the following issue is addressed: + // https://github.com/pytorch/pytorch/issues/57380 + if (!tensor.defined()) { + return; + } + + if (!common_device.has_value()) { + common_device = tensor.device(); + return; + } + + if (C10_UNLIKELY(common_device != tensor.device())) { + common_device_check_failure(*common_device, tensor, methodName, argName); + } +} + +inline void check_and_update_common_device(std::optional& common_device, const std::optional& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { + if (tensor.has_value()) { + check_and_update_common_device(common_device, tensor.value(), methodName, argName); + } +} + +inline void check_and_update_common_device(std::optional& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) { + for (const auto& tensor : tensors) { + check_and_update_common_device(common_device, tensor, methodName, argName); + } +} + +inline void check_and_update_common_device(std::optional& common_device, const List>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) { + for (const auto& tensor : tensors) { + check_and_update_common_device(common_device, 
tensor, methodName, argName); + } +} +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h new file mode 100644 index 0000000000000000000000000000000000000000..bb01fcab0b4d7314188acbd761f61a12de6d14d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h @@ -0,0 +1,162 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +/** + * This file contains functionality to take a C++ function and infer its + * c10::FunctionSchema. + */ + +#include +#include + +namespace c10 { +namespace detail::infer_schema { + +/// The templated inference code creates `ArgumentDef` instead of `Argument`, +/// because that can be constructed at compile time and has a much smaller +/// binary size than having calls to `Argument` constructors in the template. +/// Creating `Argument` objects from `ArgumentDef` can then be done at +/// runtime in a non-templated way. +struct ArgumentDef final { + using GetTypeFn = TypePtr(); + GetTypeFn* getTypeFn; + GetTypeFn* getFakeTypeFn; + constexpr ArgumentDef(): getTypeFn(nullptr), getFakeTypeFn(nullptr) {} + explicit constexpr ArgumentDef(GetTypeFn *getTypeFn, GetTypeFn *getFakeTypeFn): getTypeFn(getTypeFn), getFakeTypeFn(getFakeTypeFn) {} +}; + +template +struct bool_t {}; +template<> struct bool_t : std::true_type {}; +template<> struct bool_t : std::false_type {}; + +/// Checks the static C++ types `Types` for correctness to catch common error cases. 
+template +constexpr int checkStaticTypes() { + // Give nice error messages for some of the common error cases. + // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT + static_assert(std::conjunction_v< + bool_t || std::is_same_v || std::is_same_v || std::is_same_v>... + >, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type"); + static_assert(std::conjunction_v< + bool_t>... + >, "INVALID TYPE: float is not supported as an argument type, use double instead"); + return 0; +} + +template +constexpr std::array createArgumentVectorFromTypes(std::index_sequence /*unused*/) { + return ( + // Check types for common errors + checkStaticTypes(), + + // Create the return value + std::array{ + ArgumentDef(&getTypePtrCopy>, &getFakeTypePtrCopy>)...} + ); +} + +/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified +/// as template arguments. +template struct createArguments final {}; +template +struct createArguments> final { + static constexpr std::array call() { + return createArgumentVectorFromTypes( + std::make_index_sequence() + ); + } +}; + +/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified +/// as a tuple (i.e. in the way c10 kernels return values). +/// It can be a tuple if there's three output arguments with types A, B, C. +/// It can be an empty tuple<>, or void for kernels that don't return anything. +/// It can be a single type A (i.e. no tuple) for the case where a kernel just +/// returns one value. 
+template struct createReturns final {}; + +template +struct createReturns, void> final { + static constexpr std::array call() { + return createArgumentVectorFromTypes( + std::make_index_sequence() + ); + } +}; + +template +struct createReturns && !guts::is_instantiation_of::value>> final { + static constexpr std::array call() { + return createReturns>::call(); + } +}; + +template<> +struct createReturns final { + static constexpr std::array call() { + return createReturns>::call(); + } +}; + +template +struct createSingleReturn { + static constexpr std::array call() { + return createArgumentVectorFromTypes(std::make_index_sequence<1>()); + } +}; + +TORCH_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef arguments, c10::ArrayRef returns); +TORCH_API FunctionSchema make_function_schema(c10::ArrayRef arguments, c10::ArrayRef returns); + +/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a +/// function. Flattens std::tuple returns into multiple return types +template +FunctionSchema createFunctionSchemaFromTraitsFlattenedReturns() { + using ReturnType = typename FunctionTraits::return_type; + using ParameterTypes = typename FunctionTraits::parameter_types; + + // arguments and returns are computed into a std::array at compile time and embedded into the binary. + // The only code executed at runtime here is the one that creates a std::vector + // of the arguments/returns from the std::array. + constexpr auto arguments = createArguments::call(); + constexpr auto returns = createReturns::call(); + + return make_function_schema(arguments, returns); +} + +/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a +/// function. 
Preserves std::tuple returns as a Tuple return type +template +FunctionSchema createFunctionSchemaFromTraitsSingleReturn(std::string&& name, std::string&& overload_name) { + using ReturnType = typename FunctionTraits::return_type; + using ParameterTypes = typename FunctionTraits::parameter_types; + + // arguments and returns are computed into a std::array at compile time and embedded into the binary. + // The only code executed at runtime here is the one that creates a std::vector + // of the arguments/returns from the std::array. + constexpr auto arguments = createArguments::call(); + constexpr auto returns = createSingleReturn::call(); + + return make_function_schema(std::move(name), std::move(overload_name), arguments, returns); +} + +} + +template +FunctionSchema inferFunctionSchemaFlattenedReturns() { + return detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns>(); +} + +template +FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& overload_name) { + return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn>(std::move(name), std::move(overload_name)); +} + +TORCH_API std::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified); + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h new file mode 100644 index 0000000000000000000000000000000000000000..85169f8a1ab8684c84e08188ef66fe9e945ed7ec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h @@ -0,0 +1,186 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// TODO: unify to C10_MOBILE. In theory this header could be used in OSS. +#ifdef TEMPLATE_SELECTIVE_BUILD +#include +#endif + +/** + * This header implements functionality to build PyTorch with only a certain + * set of operators (+ dependencies) included. + * + * - Build with -DTORCH_OPERATOR_WHITELIST="aten::add;aten::sub" and only these + * two ops will be included in your build. The allowlist records operators + * only, no overloads; if you include aten::add, all overloads of aten::add + * will be included. + * + * Internally, this is done by removing the operator registration calls + * using compile time programming, and the linker will then prune all + * operator functions that weren't registered. + * See Note [Selective build] for more details + * + * WARNING: The allowlist mechanism doesn't work for all ways you could go about + * registering an operator. If the dispatch key / operator name is not + * sufficiently obvious at compile time, then the allowlisting mechanism + * will fail (and the operator will be included in the binary anyway). 
+ */ + +#include +#include +#include + + +#if defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) +#include +#endif + +namespace c10::impl { + +constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item); // Forward Declare + +/** + * In selective build mode returns true/false depending on whether a build + * feature is available or not. + * + * In instrumenting mode (tracing mode), always returns true, and doesn't + * trigger any side effects. + */ +constexpr bool is_build_feature_available(const char* name) { +#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) + // Selective Build mode. +#if !defined(TORCH_BUILD_FEATURE_ALLOWLIST) + (void)name; + return true; +#else + return allowlist_contains( + C10_STRINGIZE(TORCH_BUILD_FEATURE_ALLOWLIST), + name); +#endif + +#else + // Instrumenting mode. + (void)name; + return true; +#endif +} + +[[noreturn]] void build_feature_required_feature_not_available(const char* feature); + +/** + * Use BUILD_FEATURE_REQUIRED macro in user-code. + * + * In selective build mode becomes a no-op if the build feature passed + * in is available. If not available, throws an exception (c10::Error). + * The compiler is able to perform dead code elimination for code + * following this method if the build feature is not available. + * + * In instrumenting mode (tracing mode), registers (as a side effect) + * the presence of this specific build feature being triggered. 
+ */ +#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) // selective build mode + +#if defined(TORCH_BUILD_FEATURE_ALLOWLIST) +#define BUILD_FEATURE_REQUIRED(NAME) \ + if (!c10::impl::is_build_feature_available(NAME)) { \ + ::c10::impl::build_feature_required_feature_not_available(NAME); \ + } +#else // Everything trivially selected +#define BUILD_FEATURE_REQUIRED(NAME) + +#endif + +#else // trace mode +#define BUILD_FEATURE_REQUIRED(NAME) \ + RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::BUILD_FEATURE, \ + std::string(NAME), \ + {}); +#endif + +// Use this macro, and not is_build_feature_available +#define BUILD_FEATURE_AVAILABLE(NAME) ::c10::impl::is_build_feature_available(NAME) + +// returns true iff allowlist contains item +// allowlist_contains("a;bc;d", "bc") == true +constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item) { + //Choose a really big value for next so that if something goes wrong + //this code will blow up in a hopefully detectable way. + size_t next = std::numeric_limits::max(); + for (size_t cur = 0; cur <= allowlist.size(); cur = next) { + next = allowlist.find(';', cur); + if (next != std::string_view::npos) { + if (allowlist.substr(cur, next - cur) == item) { + return true; + } + next++; + } else { + if (allowlist.substr(cur) == item) { + return true; + } + break; + } + } + return false; +} + +// Returns true iff the given op name is on the allowlist +// and should be registered +constexpr bool op_allowlist_check(std::string_view op_name [[maybe_unused]]) { + assert(op_name.find("::") != std::string_view::npos); + // Use assert() instead of throw() due to a gcc bug. 
See: + // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function + // https://github.com/fmtlib/fmt/issues/682 + assert(op_name.find('(') == std::string_view::npos); +#if !defined(TORCH_OPERATOR_WHITELIST) + // If the TORCH_OPERATOR_WHITELIST parameter is not defined, + // all ops are to be registered + return true; +#else + return allowlist_contains( + C10_STRINGIZE(TORCH_OPERATOR_WHITELIST), + // This function is majorly used for mobile selective build with + // root operators, where the overload is included in the allowlist. + op_name); + // // Strip overload name (as allowlist doesn't contain overloads) + // // Another function based on this may be added when there's usage + // // on op names without overload. + // OperatorNameView::parse(op_name).name); +#endif +} + +// Returns true iff the given schema string is on the allowlist +// and should be registered +constexpr bool schema_allowlist_check(std::string_view schema) { +#if defined(TORCH_FORCE_SCHEMA_REGISTRATION) + return true; +#else + return op_allowlist_check(schema.substr(0, schema.find('('))); +#endif +} + +// Returns true iff the given custom class name is on the allowlist +// and should be registered +constexpr bool custom_class_allowlist_check(std::string_view custom_class_name [[maybe_unused]]) { +#if !defined(TORCH_CUSTOM_CLASS_ALLOWLIST) + // If the TORCH_CUSTOM_CLASS_ALLOWLIST parameter is not defined, + // all custom classes are to be registered + return true; +#else + return allowlist_contains( + C10_STRINGIZE(TORCH_CUSTOM_CLASS_ALLOWLIST), + custom_class_name); +#endif +} + +// schema_allowlist_check() implicitly depends on a macro, TORCH_OPERATOR_WHITELIST. +// Add this API to pass arbitrary allowlist. 
+constexpr bool op_allowlist_contains_name_in_schema(std::string_view allowlist, std::string_view schema) { + return allowlist_contains(allowlist, schema.substr(0, schema.find('('))); +} + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h new file mode 100644 index 0000000000000000000000000000000000000000..6e5f8ffe59479fb8e8da0dcf4716b6d14c9d15db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h @@ -0,0 +1,599 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +/** + * Include this file if you want to register operators. It includes all + * functionality needed to do so for you. + */ + +#include +#include +#include +#include +#include +#include +#include +#if defined(EXPOSE_C2_OPS) || !defined(CAFFE2_IS_XPLAT_BUILD) +#include +#endif +#include + +namespace c10 { + +namespace detail { +// The first argument of the schema might be of type DispatchKeySet, in which case we remove it. +// We do this because every argument in a function schema is expected to be convertible +// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of. +// See Note [Plumbing Keys Through The Dispatcher] +template +std::unique_ptr inferFunctionSchemaFromFunctor() { + using func_type = typename c10::remove_DispatchKeySet_arg_from_func::func_type; + return std::make_unique(inferFunctionSchemaFlattenedReturns()); +} +} + +/** + * An instance of this class handles the registration for one or more operators. 
+ * Make sure you keep the RegisterOperators instance around since it will + * deregister the operator it's responsible for in its destructor. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + */ +class TORCH_API RegisterOperators final { +public: + RegisterOperators() = default; + ~RegisterOperators() = default; + + RegisterOperators(const RegisterOperators&) = delete; + RegisterOperators& operator=(const RegisterOperators&) = delete; + RegisterOperators(RegisterOperators&&) noexcept = default; + RegisterOperators& operator=(RegisterOperators&&) noexcept = default; + + class TORCH_API Options final { + public: + Options(const Options&) = delete; + Options(Options&&) noexcept = delete; + Options& operator=(const Options&) = delete; + Options& operator=(Options&&) noexcept = delete; + + // internal-only for registering stack based kernels + template + Options&& kernel(DispatchKey dispatch_key) && { + return std::move(*this).kernel(dispatch_key, KernelFunction::makeFromBoxedFunction(), std::nullopt, nullptr); + } + + // internal-only for registering stack based catch-all kernels + template + Options&& catchAllKernel() && { + return std::move(*this).kernel(std::nullopt, KernelFunction::makeFromBoxedFunction(), std::nullopt, nullptr); + } + + // internal only for registering caffe2 ops + Options&& schema(FunctionSchema&& schema) { + TORCH_CHECK(!schemaOrName_.has_value(), "You can only specify the schema once per operator registration."); + schemaOrName_ = FunctionSchema(std::move(schema)); + return std::move(*this); + } + + /** + * Use this to specify the schema for an operator. 
You can also specify + * the operator name only to have the function signature part of the + * schema be inferred from the kernel function. + * + * Example: + * + * > // Infer function signature from my_kernel_cpu + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + * > + * > + * > // Explicitly specify full schema + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op(Tensor a) -> Tensor") + * > .kernel(DispatchKey::CPU)); + */ + Options&& schema(const std::string& schemaOrName) { + TORCH_CHECK(!schemaOrName_.has_value(), "Tried to register operator ", schemaOrName," but specified schema multiple times. You can only specify the schema once per operator registration."); + + #if !defined(EXPOSE_C2_OPS) && defined(CAFFE2_IS_XPLAT_BUILD) + throw std::logic_error("Tried to register operator " + schemaOrName + ". We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build."); + #else + schemaOrName_ = torch::jit::parseSchemaOrName(schemaOrName); + #endif + + return std::move(*this); + } + + /** + * Use this to register an operator whose kernel is implemented as a functor. + * The kernel is only called for inputs matching the given dispatch key. + * You can register multiple kernels for different dispatch keys. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. 
+ * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU, "some_configuration", 3, true)); + */ + template + // enable_if: only enable it if KernelFunctor is actually a functor + std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, ConstructorParameters&&... constructorParameters) && { + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible_v, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedFunctor(std::make_unique(std::forward(constructorParameters)...)), + impl::CppSignature::make(), + detail::inferFunctionSchemaFromFunctor() + ); + } + + /** + * Use this to register an operator whose kernel is implemented as a functor. + * The kernel is a catch-all kernel, meaning it's called independent from + * the input. Dispatch is disabled for this operator. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel()); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. 
+ * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel("some_configuration", 3, true)); + */ + template + // enable_if: only enable it if KernelFunctor is actually a functor + std::enable_if_t::value, Options&&> catchAllKernel(ConstructorParameters&&... constructorParameters) && { + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible_v, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedFunctor(std::make_unique(std::forward(constructorParameters)...)), + impl::CppSignature::make(), + detail::inferFunctionSchemaFromFunctor() + ); + } + + /** + * Use this to register an operator whose kernel is implemented by a function. + * The kernel is only called for inputs matching the given dispatch key. + * You can register multiple kernels for different dispatch keys. + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + */ + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key) && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. 
internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>::type>() + ); + } + + /** + * Use this to register an operator whose kernel is implemented by a function. + * The kernel is a catch-all kernel, meaning it's called independent from + * the input. Dispatch is disabled for this operator. + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel()); + */ + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> catchAllKernel() && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>::type>() + ); + } + + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, FuncType* kernel_func) && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. 
internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> catchAllKernel(FuncType* kernel_func) && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + /** + * Use this to register an operator whose kernel is implemented as a lambda. + * The kernel is only called for inputs matching the given dispatch key. + * You can register multiple kernels for different dispatch keys. + * + * The lambda must be stateless, i.e. not have a capture. If your kernel + * needs to store some configuration parameters, write the kernel as a + * functor instead. 
+ * + * Example: + * + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU, [] (Tensor a) -> Tensor {...})); + */ + template + // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) + std::enable_if_t< + guts::is_functor>::value + && !std::is_same_v>::func_type, KernelFunction::BoxedKernelFunction>, + Options&&> kernel(DispatchKey dispatch_key, Lambda&& functor) && { + static_assert(!std::is_base_of_v>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); + + // We don't support stateful lambdas (i.e. lambdas with a capture), because their + // behavior would be nonobvious. A functor kernel with cache gets a new instance of + // its cache each time the kernel is looked up from the dispatch table. + // A lambda with a capture would be global and share its capture between all kernel lookups. + // So, instead of making users having to think about it (including the thread-safety + // issues this causes), let's just forbid stateful lambdas altogether. + static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead."); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedLambda(std::forward(functor)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + /** + * Use this to register an operator whose kernel is implemented as a lambda. + * The kernel is a catch-all kernel, meaning it's called independent from + * the input. Dispatch is disabled for this operator. + * + * The lambda must be stateless, i.e. not have a capture. 
If your kernel + * needs to store some configuration parameters, write the kernel as a + * functor instead. + * + * Example: + * + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel([] (Tensor a) -> Tensor {...})); + */ + template + // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) + std::enable_if_t< + guts::is_functor>::value + && !std::is_same_v>::func_type, KernelFunction::BoxedKernelFunction>, + Options&&> catchAllKernel(Lambda&& lambda) && { + static_assert(!std::is_base_of_v>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); + + // We don't support stateful lambdas (i.e. lambdas with a capture), because their + // behavior would be nonobvious. + // A lambda with a capture would be global and share its capture between all kernel lookups. + // This would be a likely source for unexpected race conditions, so we forbid it. + // If a kernel really needs global state, they can just have regular global state + // in their .cpp file next to the kernel lambda. + static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). 
If you need a cache, please use the functor based API kernel() instead."); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedLambda(std::forward(lambda)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + Options&& aliasAnalysis(AliasAnalysisKind aliasAnalysisKind) && { + TORCH_CHECK(!aliasAnalysisKind_.has_value(), "You can only call aliasAnalysis() once per operator registration."); + aliasAnalysisKind_ = aliasAnalysisKind; + return std::move(*this); + } + + private: + Options&& kernel(std::optional dispatch_key, KernelFunction&& func, std::optional cpp_signature, std::unique_ptr&& inferred_function_schema) && { + KernelRegistrationConfig config; + config.dispatch_key = dispatch_key; + config.func = std::move(func); + config.cpp_signature = cpp_signature; + config.inferred_function_schema = std::move(inferred_function_schema); + kernels.push_back(std::move(config)); + return std::move(*this); + } + + Options() + : schemaOrName_(std::nullopt) + , aliasAnalysisKind_(std::nullopt) + {} + + // KernelRegistrationConfig accumulates all information from the config + // parameters passed to a RegisterOperators::op() call into one object. + struct KernelRegistrationConfig final { + KernelRegistrationConfig() + : dispatch_key(std::nullopt) + , cpp_signature(std::nullopt) + , inferred_function_schema(nullptr) + {} + + std::optional dispatch_key; + KernelFunction func; + std::optional cpp_signature; + std::unique_ptr inferred_function_schema; + }; + + std::optional> schemaOrName_; + + std::vector kernels; + std::optional aliasAnalysisKind_; + friend class RegisterOperators; + friend class Library; + }; + + /** + * Call this to get an instance of registration options, which + * can be passed to a call to RegisterOperators::op() to specify + * these options for the operator registration. + * See class doc comment for examples. 
+ */ + static Options options() { + return {}; + } + + /** + * Call this to register an operator. See class doc comment for examples. + */ + RegisterOperators&& op(Options&& options) && { + checkSchemaAndRegisterOp_(std::move(options)); + return std::move(*this); + } + + // Regular mutator version of the && version above + RegisterOperators& op(Options&& options) & { + checkSchemaAndRegisterOp_(std::move(options)); + return *this; + } + + /** + * This is a shorthand for RegisterOperators::op(Options) where you can + * specify the operator schema outside of the options parameter. + * See class doc comment for examples. + */ + RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && { + return std::move(*this).op(std::move(options).schema(schemaOrName)); + } + + // internal only for registering caffe2 ops + RegisterOperators&& op(FunctionSchema schema, Options&& options) && { + return std::move(*this).op(std::move(options).schema(std::move(schema))); + } + + template + explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options()) + : RegisterOperators() { + std::move(*this).op(schemaOrName, std::forward(func), std::move(options)); + } + + /** + * This API registers an operator based on a kernel function pointer. 
+ * + * Given a kernel + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * + * This API looks like: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", &my_kernel_cpu); + * + * If your kernel is small and the overhead of calling it matters, + * then this API might be the wrong choice since the following API + * has a slightly lower overhead for calling into the kernel: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); + * + * Or, alternatively, write your kernel as a functor: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); + */ + template + // enable_if: only enable it if FuncType is actually a function, but not a stack based BoxedKernelFunction. + std::enable_if_t::value && !std::is_same_v, RegisterOperators&&> + op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && { + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedRuntimeFunction(func), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + )); + } + + /** + * This API registers an operator based on a kernel lambda. 
+ * + * This API looks like: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", [] (Tensor a, Tensor b) {...}); + * + * This is equivalent to: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .catchAllKernel([] (Tensor a, Tensor b) {...})); + * + */ + template + // enable_if: only enable it if Lambda is actually a stateless lambda + std::enable_if_t::value && guts::is_stateless_lambda>::value, RegisterOperators&&> + op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && { + static_assert(!std::is_base_of_v, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); + + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedLambda(std::forward(lambda)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + )); + } + + template + C10_DEPRECATED_MESSAGE("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.") + // enable_if: only enable it if Lambda is actually a functor but not a stateless lambda + std::enable_if_t::value && !guts::is_stateless_lambda>::value, RegisterOperators&&> + op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && { + static_assert(!std::is_base_of_v, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. 
Please use the new RegisterOperators::options().kernel() based API instead."); + + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedLambda(std::forward(lambda)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + )); + } + +private: + void checkSchemaAndRegisterOp_(Options&& config); + + static c10::FunctionSchema inferSchemaFromKernels_(const OperatorName& opNameStr, const Options& options); + void checkNoDuplicateKernels_(const Options& options); + void registerOp_(Options&& options); + + std::vector registrars_; +}; + +} // namespace c10 + +namespace torch { + // Old-style API + using RegisterOperators = c10::RegisterOperators; +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/sve_helper.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/sve_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..60e0025a2d63d264c9baef2fce846ae400b73cc5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/sve_helper.h @@ -0,0 +1,85 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include + +#if defined(CPU_CAPABILITY_SVE) + +// Define the data type of VLS(vector-length specific). 
+typedef svbool_t vls_pred_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint8_t vls_int8_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint16_t vls_int16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint32_t vls_int32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint64_t vls_int64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint8_t vls_uint8_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint16_t vls_uint16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint32_t vls_uint32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint64_t vls_uint64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat16_t vls_float16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svbfloat16_t vls_bfloat16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat32_t vls_float32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat64_t vls_float64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); + +#define ptrue svptrue_b8() +#define ZERO_S8 svdup_n_s8(0) +#define ZERO_S16 svdup_n_s16(0) +#define ZERO_S32 svdup_n_s32(0) +#define ZERO_S64 svdup_n_s64(0) +#define ZERO_U8 svdup_n_u8(0) +#define ZERO_U16 svdup_n_u16(0) +#define ZERO_U32 svdup_n_u32(0) +#define ZERO_U64 svdup_n_u64(0) +#define ZERO_F16 svdup_n_f16(0.f) +#define ZERO_F32 svdup_n_f32(0.f) +#define ZERO_F64 svdup_n_f64(0.0) +#define ONE_S8 svdup_n_s8(1) +#define ONE_S16 svdup_n_s16(1) +#define ONE_S32 svdup_n_s32(1) +#define ONE_S64 svdup_n_s64(1) +#define ONE_U8 svdup_n_u8(1) +#define ONE_U16 svdup_n_u16(1) +#define ONE_U32 svdup_n_u32(1) +#define ONE_U64 svdup_n_u64(1) +#define ONE_F16 svdup_n_f16(1.f) +#define ONE_BF16 svdup_n_bf16(1.f) +#define ONE_F32 svdup_n_f32(1.f) +#define ONE_F64 svdup_n_f64(1.0) +#define ALL_S8_TRUE_MASK 
svdup_n_s8(0xff) +#define ALL_S8_FALSE_MASK svdup_n_s8(0x0) +#define ALL_S16_TRUE_MASK svdup_n_s16(0xffff) +#define ALL_S16_FALSE_MASK svdup_n_s16(0x0) +#define ALL_S32_TRUE_MASK svdup_n_s32(0xffffffff) +#define ALL_S32_FALSE_MASK svdup_n_s32(0x0) +#define ALL_S64_TRUE_MASK svdup_n_s64(0xffffffffffffffff) +#define ALL_S64_FALSE_MASK svdup_n_s64(0x0) +#define ALL_U8_TRUE_MASK svdup_n_u8(0x01) +#define ALL_U8_FALSE_MASK svdup_n_u8(0x00) +#define ALL_F16_TRUE_MASK svreinterpret_f16_s16(ALL_S16_TRUE_MASK) +#define ALL_F16_FALSE_MASK svreinterpret_f16_s16(ALL_S16_FALSE_MASK) +#define ALL_BF16_TRUE_MASK svreinterpret_bf16_s16(ALL_S16_TRUE_MASK) +#define ALL_BF16_FALSE_MASK svreinterpret_bf16_s16(ALL_S16_FALSE_MASK) +#define ALL_F32_TRUE_MASK svreinterpret_f32_s32(ALL_S32_TRUE_MASK) +#define ALL_F32_FALSE_MASK svreinterpret_f32_s32(ALL_S32_FALSE_MASK) +#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK) +#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK) + +#endif // defined(CPU_CAPABILITY_SVE) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..bb712e8d7ee510503f0a812fdcd4617b7678922a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_bfloat16.h @@ -0,0 +1,598 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +namespace at { +namespace vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_bfloat16_t values; + + public: + using value_type = BFloat16; + using size_type = int; + + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(BFloat16); + } + + Vectorized(); + Vectorized(svbfloat16_t v) : values(v) {} + Vectorized(int val); + Vectorized(BFloat16 val); + + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) { + __at_align__ BFloat16 buffer[size()] = {vals...}; + values = svld1_bf16(ptrue, reinterpret_cast(buffer)); + } + + operator svbfloat16_t() const { + return values; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s16(ptrue, svreinterpret_s16_bf16(mask_), ALL_S16_TRUE_MASK); + return svsel_bf16(mask, b, a); + } + template + static Vectorized arange( + BFloat16 base = 0.f, + step_t step = static_cast(1)) { + __at_align__ BFloat16 buffer[size()]; + for (int64_t i = 0; i < size(); i++) { + buffer[i] = base + i * step; + } + return svld1_bf16(ptrue, reinterpret_cast(buffer)); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + if (count == 0) { + return a; + } else if (count < size()) { + return svsel_bf16(svwhilelt_b16(0ull, count), b, a); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return svld1_bf16(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b16(0ull, count); + return svld1_bf16(pg, reinterpret_cast(ptr)); + } + void store(void* ptr, int64_t count = size()) const { + __at_align__ bfloat16_t tmp[size()]; + std::memset(tmp, 0, sizeof(tmp)); + if (count == size()) { + svst1_bf16(ptrue, reinterpret_cast(tmp), values); + } else { + svbool_t pg = svwhilelt_b16(0ull, count); + svst1_bf16(pg, reinterpret_cast(tmp), values); + } + std::memcpy( + reinterpret_cast(ptr), + reinterpret_cast(tmp), + count * sizeof(bfloat16_t)); + } + const BFloat16& operator[](int idx) const = delete; + BFloat16& operator[](int idx) = delete; + int64_t zero_mask() const { + int64_t mask = 0; + // returns an integer mask where all zero elements are translated to + // 1-bit and others are translated to 0-bit int64_t mask = 0; + __at_align__ int16_t mask_array[size()]; + + svbool_t svbool_mask = + svcmpeq_f16(ptrue, svreinterpret_f16_bf16(values), ZERO_F16); + svst1_s16( + 
ptrue, + mask_array, + svsel_s16(svbool_mask, ALL_S16_TRUE_MASK, ALL_S16_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); + } + return mask; + } + Vectorized isnan() const; + bool has_inf_nan() const; + Vectorized map(BFloat16 (*f)(BFloat16)) const { + __at_align__ BFloat16 tmp[size()]; + store(tmp); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = svdup_n_u16(0x7FFF); + auto vals = svreinterpret_u16_bf16(values); + vals = svand_u16_x(ptrue, vals, mask); + return svreinterpret_bf16_u16(vals); + } + Vectorized angle() const; + Vectorized real() const { + return values; + } + Vectorized imag() const { + return Vectorized(0.f); + } + Vectorized conj() const { + return values; + } + Vectorized acos() const; + Vectorized acosh() const; + Vectorized asin() const; + Vectorized atan() const; + Vectorized atanh() const; + Vectorized atan2(const Vectorized& b) const; + Vectorized copysign(const Vectorized& sign) const; + Vectorized erf() const; + Vectorized erfc() const; + Vectorized erfinv() const; + Vectorized exp() const; + Vectorized exp2() const; + Vectorized expm1() const; + Vectorized exp_u20() const { + return exp(); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const; + Vectorized hypot(const Vectorized& b) const; + Vectorized i0() const; + Vectorized i0e() const; + Vectorized digamma() const; + Vectorized igamma(const Vectorized& x) const; + Vectorized igammac(const Vectorized& x) const; + Vectorized nextafter(const Vectorized& b) const; + Vectorized log() const; + Vectorized log2() const; + Vectorized log10() const; + Vectorized log1p() const; + Vectorized frac() const; + Vectorized sin() const; + Vectorized sinh() const; + Vectorized cos() const; + Vectorized cosh() const; + Vectorized ceil() const; + Vectorized floor() const; + Vectorized neg() const { + auto mask = 
svdup_n_u16(0x8000); + auto vals = svreinterpret_u16_bf16(values); + vals = sveor_u16_x(ptrue, vals, mask); + return svreinterpret_bf16_u16(vals); + } + Vectorized round() const; + Vectorized tan() const; + Vectorized tanh() const; + Vectorized trunc() const; + Vectorized lgamma() const; + Vectorized sqrt() const; + Vectorized reciprocal() const; + Vectorized rsqrt() const; + Vectorized pow(const Vectorized& b) const; + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const; + + Vectorized operator!=(const Vectorized& other) const; + + Vectorized operator<(const Vectorized& other) const; + + Vectorized operator<=(const Vectorized& other) const; + + Vectorized operator>(const Vectorized& other) const; + + Vectorized operator>=(const Vectorized& other) const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +#if defined(__GNUC__) && __GNUC__ == 14 +// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE +__attribute__((optimize("no-tree-vectorize"))) +#endif +inline std::tuple, Vectorized> +convert_bfloat16_float(const Vectorized& a) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); + auto bf16_vec1 = svzip1_bf16(zero, a); + auto bf16_vec2 = svzip2_bf16(zero, a); + auto x1 = svreinterpret_f32_bf16(bf16_vec1); + auto x2 = svreinterpret_f32_bf16(bf16_vec2); + return {Vectorized(x1), Vectorized(x2)}; +} + +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, 
a); + svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b); + return Vectorized(svuzp1_bf16(x1, x2)); +} + +inline void load_fp32_from_bf16(const BFloat16* data, Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_bf16( + const BFloat16* data, + Vectorized& out1, + Vectorized& out2) { + Vectorized bf16_vec = Vectorized::loadu(data); + auto floats = convert_bfloat16_float(bf16_vec); + out1 = std::get<0>(floats); + out2 = std::get<1>(floats); +} + +template +Vectorized binary_operator_via_float( + Op op, + const Vectorized& a, + const Vectorized& b) { + const auto [a_float_low, a_float_high] = convert_bfloat16_float(a); + const auto [b_float_low, b_float_high] = convert_bfloat16_float(b); + return convert_float_bfloat16( + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::plus>(), a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::minus>(), a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::multiplies>(), a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::divides>(), a, b); +} + +inline Vectorized::Vectorized() { + auto vals_f = svdup_n_f32(0); + values = convert_float_bfloat16(vals_f, vals_f); +} + +inline Vectorized::Vectorized(int val) { + auto vals_f = svdup_n_f32(val); + values = convert_float_bfloat16(vals_f, vals_f); +} + +inline Vectorized::Vectorized(BFloat16 val) { + auto vals_f = svdup_n_f32((float)val); + values = convert_float_bfloat16(vals_f, vals_f); +} + +bool inline 
Vectorized::has_inf_nan() const { + auto [v1, v2] = convert_bfloat16_float(values); + return v1.has_inf_nan() || v2.has_inf_nan(); +} +// frac. Implement this here so we can use subtraction +Vectorized inline Vectorized::frac() const { + return *this - this->trunc(); +} + +#define DEFINE_BF16_FUNC_VIA_FLOAT(func_name) \ + Vectorized inline Vectorized::func_name() const { \ + auto [v1, v2] = convert_bfloat16_float(*this); \ + v1 = v1.func_name(); \ + v2 = v2.func_name(); \ + return convert_float_bfloat16(v1, v2); \ + } + +#define DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(func_name) \ + Vectorized inline Vectorized::func_name( \ + const Vectorized& a) const { \ + auto [v1, v2] = convert_bfloat16_float(*this); \ + auto [v3, v4] = convert_bfloat16_float(a); \ + v1 = v1.func_name(v3); \ + v2 = v2.func_name(v4); \ + return convert_float_bfloat16(v1, v2); \ + } + +DEFINE_BF16_FUNC_VIA_FLOAT(isnan) +DEFINE_BF16_FUNC_VIA_FLOAT(angle) +DEFINE_BF16_FUNC_VIA_FLOAT(acos) +DEFINE_BF16_FUNC_VIA_FLOAT(acosh) +DEFINE_BF16_FUNC_VIA_FLOAT(asin) +DEFINE_BF16_FUNC_VIA_FLOAT(atan) +DEFINE_BF16_FUNC_VIA_FLOAT(atanh) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign) +DEFINE_BF16_FUNC_VIA_FLOAT(erf) +DEFINE_BF16_FUNC_VIA_FLOAT(erfc) +DEFINE_BF16_FUNC_VIA_FLOAT(exp) +DEFINE_BF16_FUNC_VIA_FLOAT(exp2) +DEFINE_BF16_FUNC_VIA_FLOAT(expm1) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot) +DEFINE_BF16_FUNC_VIA_FLOAT(i0) +DEFINE_BF16_FUNC_VIA_FLOAT(i0e) +DEFINE_BF16_FUNC_VIA_FLOAT(digamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter) +DEFINE_BF16_FUNC_VIA_FLOAT(log) +DEFINE_BF16_FUNC_VIA_FLOAT(log2) +DEFINE_BF16_FUNC_VIA_FLOAT(log10) +DEFINE_BF16_FUNC_VIA_FLOAT(log1p) +DEFINE_BF16_FUNC_VIA_FLOAT(sin) +DEFINE_BF16_FUNC_VIA_FLOAT(sinh) +DEFINE_BF16_FUNC_VIA_FLOAT(cos) +DEFINE_BF16_FUNC_VIA_FLOAT(cosh) +DEFINE_BF16_FUNC_VIA_FLOAT(ceil) 
+DEFINE_BF16_FUNC_VIA_FLOAT(floor) +DEFINE_BF16_FUNC_VIA_FLOAT(round) +DEFINE_BF16_FUNC_VIA_FLOAT(tan) +DEFINE_BF16_FUNC_VIA_FLOAT(tanh) +DEFINE_BF16_FUNC_VIA_FLOAT(trunc) +DEFINE_BF16_FUNC_VIA_FLOAT(lgamma) +DEFINE_BF16_FUNC_VIA_FLOAT(sqrt) +DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal) +DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow) + +Vectorized inline Vectorized::operator==( + const Vectorized& other) const { + auto [f1, f2] = convert_bfloat16_float(values); + auto [f3, f4] = convert_bfloat16_float(other); + svbool_t mask1 = svcmpeq_f32(ptrue, f1, f3); + svbool_t mask2 = svcmpeq_f32(ptrue, f2, f4); + auto res1 = svsel_f32(mask1, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + auto res2 = svsel_f32(mask2, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + + auto bf16_1 = svreinterpret_bf16_f32(res1); + auto bf16_2 = svreinterpret_bf16_f32(res2); + return svuzp1_bf16(bf16_1, bf16_2); +} +Vectorized inline Vectorized::operator!=( + const Vectorized& other) const { + auto [f1, f2] = convert_bfloat16_float(values); + auto [f3, f4] = convert_bfloat16_float(other); + svbool_t mask1 = svcmpne_f32(ptrue, f1, f3); + svbool_t mask2 = svcmpne_f32(ptrue, f2, f4); + auto res1 = svsel_f32(mask1, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + auto res2 = svsel_f32(mask2, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + + auto bf16_1 = svreinterpret_bf16_f32(res1); + auto bf16_2 = svreinterpret_bf16_f32(res2); + return svuzp1_bf16(bf16_1, bf16_2); +} +Vectorized inline Vectorized::operator>( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 > v3, v2 > v4); +} +Vectorized inline Vectorized::operator>=( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 >= v3, v2 >= v4); +} +Vectorized inline Vectorized::operator<( + const Vectorized& other) const { + auto 
[v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 < v3, v2 < v4); +} +Vectorized inline Vectorized::operator<=( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 <= v3, v2 <= v4); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), + a, + b); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), + a, + b); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&clamp_max), + a, + max); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&clamp_min), + a, + min); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return clamp_min(clamp_max(a, max), min); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + svand_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + svorr_u16_x(ptrue, svreinterpret_u16_bf16(a), 
svreinterpret_u16_bf16(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + sveor_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svst1_bf16( + ptrue, + const_cast(reinterpret_cast(dst)) + i, + svldnt1_bf16( + ptrue, + const_cast(reinterpret_cast(src)) + + i)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + svbool_t pg = svwhilelt_b16(i, n); + svst1_bf16( + pg, + const_cast(reinterpret_cast(dst)) + i, + svldnt1_bf16( + pg, + const_cast(reinterpret_cast(src)) + + i)); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b + c; +} + +#endif // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16) + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_common_sve.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_common_sve.h new file mode 100644 index 0000000000000000000000000000000000000000..d11be323e05416cb0d7ef821e8bd0dde7ad1d0c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_common_sve.h @@ -0,0 +1,241 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with SVE] + +#include + +#include +#include + +#if defined(CPU_CAPABILITY_SVE) +#include +#include +#include +#include +#include +#endif + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. 
+inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \ + template <> \ + inline Vectorized cast(const Vectorized& src) { \ + return svreinterpret_##t1_prefix##_##t2_prefix(src); \ + } \ + template <> \ + inline Vectorized cast(const Vectorized& src) { \ + return svreinterpret_##t2_prefix##_##t1_prefix(src); \ + } + +DEFINE_SVE_CAST(int64_t, s64, double, f64) +DEFINE_SVE_CAST(int32_t, s32, double, f64) +DEFINE_SVE_CAST(int16_t, s16, double, f64) +DEFINE_SVE_CAST(int64_t, s64, float, f32) +DEFINE_SVE_CAST(int32_t, s32, float, f32) +DEFINE_SVE_CAST(int16_t, s16, float, f32) +DEFINE_SVE_CAST(float, f32, double, f64) + +#ifdef __ARM_FEATURE_BF16 +DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16) +DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16) +DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16) +#endif // __ARM_FEATURE_BF16 + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex_) { + svint64_t vindex = + svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3); + return svld1_gather_s64index_f64(ptrue, base_addr, vindex); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex_) { + svint32_t vindex = + svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); + return svld1_gather_s32index_f32(ptrue, base_addr, vindex); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex_, + const Vectorized& 
mask_) { + svbool_t mask = + svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK); + svint64_t vindex = + svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3); + return svsel_f64( + mask, svld1_gather_s64index_f64(mask, base_addr, vindex), src); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex_, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK); + svint32_t vindex = + svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); + return svsel_f32( + mask, svld1_gather_s32index_f32(mask, base_addr, vindex), src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + svfloat64_t x = svadd_f64_x(ptrue, src, svdup_n_f64(0x0018000000000000)); + return svsub_s64_x( + ptrue, + svreinterpret_s64_f64(x), + svreinterpret_s64_f64(svdup_n_f64(0x0018000000000000))); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return svcvt_s32_f32_x(ptrue, src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a3, a3} + // b = {b0, b1, b2, b3} + // group cols crossing lanes: + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + return std::make_pair( + Vectorized(svzip1_f64(a, b)), + Vectorized(svzip2_f64(a, b))); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + // group cols crossing lanes: + // 
return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + return std::make_pair( + Vectorized(svzip1_f32(a, b)), Vectorized(svzip2_f32(a, b))); +} + +#ifdef __ARM_FEATURE_BF16 +template <> +std::pair< + Vectorized, + Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + return std::make_pair( + Vectorized(svzip1_bf16(a, b)), + Vectorized(svzip2_bf16(a, b))); +} +#endif // __ARM_FEATURE_BF16 + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + return std::make_pair( + Vectorized(svuzp1_f64(a, b)), + Vectorized(svuzp2_f64(a, b))); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + return std::make_pair( + Vectorized(svuzp1_f32(a, b)), Vectorized(svuzp2_f32(a, b))); +} + +#ifdef __ARM_FEATURE_BF16 +template <> +std::pair< + Vectorized, + Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + return std::make_pair( + Vectorized(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)), + Vectorized(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b))); +} +#endif // 
__ARM_FEATURE_BF16 + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_double.h new file mode 100644 index 0000000000000000000000000000000000000000..8abd6d275e80db7658c8c187ccc78031b6c600b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_double.h @@ -0,0 +1,622 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) +#include +#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code +#else +#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code +#endif + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. 
+inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_float64_t values; + + public: + using value_type = double; + using size_type = int; + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(double); + } + Vectorized() { + values = svdup_n_f64(0); + } + Vectorized(svfloat64_t v) : values(v) {} + Vectorized(double val) { + values = svdup_n_f64(val); + } + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... vals) { + __at_align__ double buffer[size()] = {vals...}; + values = svld1_f64(ptrue, buffer); + } + operator svfloat64_t() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each element is 1 if the corresponding bit in + // 'mask' is set, 0 otherwise. + __at_align__ int64_t flag_arr[size()]; + for (int i = 0; i < size(); i++) { + flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0; + } + // Load the flag array into an SVE int64 vector. + svint64_t int_mask = svld1_s64(svptrue_b64(), flag_arr); + // Compare each lane of int_mask to 0; returns an svbool_t predicate where + // true indicates a nonzero flag. + svbool_t blend_mask = svcmpne_n_s64(svptrue_b64(), int_mask, 0); + + // Use svsel to select elements from b where the predicate is true, else + // from a. 
+ svfloat64_t result = svsel(blend_mask, b.values, a.values); + return Vectorized(result); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK); + return svsel_f64(mask, b, a); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + __at_align__ double buffer[size()]; + for (int64_t i = 0; i < size(); i++) { + buffer[i] = base + i * step; + } + return svld1_f64(ptrue, buffer); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + if (count == 0) { + return a; + } else if (count < size()) { + return svsel_f64(svwhilelt_b64(0ull, count), b, a); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return svld1_f64(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b64(0ull, count); + return svld1_f64(pg, reinterpret_cast(ptr)); + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + svst1_f64(ptrue, reinterpret_cast(ptr), values); + } else { + svbool_t pg = svwhilelt_b64(0ull, count); + svst1_f64(pg, reinterpret_cast(ptr), values); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + int64_t mask = 0; + __at_align__ int64_t mask_array[size()]; + + svbool_t svbool_mask = svcmpeq_f64(ptrue, values, ZERO_F64); + svst1_s64( + ptrue, + mask_array, + svsel_s64(svbool_mask, ALL_S64_TRUE_MASK, ALL_S64_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); + } + return mask; + } + Vectorized isnan() const { + // NaN check + svbool_t mask = svcmpuo_f64(ptrue, values, ZERO_F64); + return svsel_f64(mask, 
ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + bool has_inf_nan() const { + return svptest_any( + ptrue, + svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64)); + } + Vectorized map(double (*f)(double)) const { + __at_align__ double tmp[size()]; + store(tmp); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + return svabs_f64_x(ptrue, values); + } + Vectorized angle() const { + const auto nan_vec = svdup_n_f64(NAN); + const auto nan_mask = svcmpuo_f64(ptrue, values, ZERO_F64); + const auto pi = svdup_n_f64(c10::pi); + + const auto neg_mask = svcmplt_f64(ptrue, values, ZERO_F64); + auto angle = svsel_f64(neg_mask, pi, ZERO_F64); + angle = svsel_f64(nan_mask, nan_vec, angle); + return angle; + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized(0.0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return USE_SLEEF( + Vectorized(Sleef_acosdx_u10sve(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshdx_u10sve(values)), map(std::acosh)); + } + Vectorized asin() const { + return USE_SLEEF( + Vectorized(Sleef_asindx_u10sve(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhdx_u10sve(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atandx_u10sve(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhdx_u10sve(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2dx_u10sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::atan2(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized copysign(const Vectorized& 
sign) const { + USE_SLEEF( + { return Vectorized(Sleef_copysigndx_sve(values, sign)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erfdx_u10sve(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcdx_u15sve(values)), map(std::erfc)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() const { + return USE_SLEEF( + Vectorized(Sleef_expdx_u10sve(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2dx_u10sve(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1dx_u10sve(values)), map(std::expm1)); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmoddx_sve(values, q)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_q[size()]; + store(tmp); + q.store(tmp_q); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::fmod(tmp[i], tmp_q[i]); + } + return loadu(tmp); + })} Vectorized hypot(const Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotdx_u05sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double 
tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterdx_sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::nextafter(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logdx_u10sve(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2dx_u10sve(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10dx_u10sve(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pdx_u10sve(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sindx_u10sve(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhdx_u10sve(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosdx_u10sve(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshdx_u10sve(values)), map(std::cosh)); + } + Vectorized ceil() const { + return svrintp_f64_x(ptrue, values); + } + Vectorized floor() const { + return svrintm_f64_x(ptrue, values); + } + Vectorized neg() const { + return svneg_f64_x(ptrue, values); + } + Vectorized round() const { + return svrinti_f64_x(ptrue, values); + } + Vectorized tan() const 
{ + return USE_SLEEF( + Vectorized(Sleef_tandx_u10sve(values)), map(std::tan)); + } + Vectorized tanh() const { + return USE_SLEEF( + Vectorized(Sleef_tanhdx_u10sve(values)), map(std::tanh)); + } + Vectorized trunc() const { + return svrintz_f64_x(ptrue, values); + } + Vectorized lgamma() const { + return USE_SLEEF( + Vectorized(Sleef_lgammadx_u10sve(values)), map(std::lgamma)); + } + Vectorized sqrt() const { + return svsqrt_f64_x(ptrue, values); + } + Vectorized reciprocal() const { + return svdivr_f64_x(ptrue, values, ONE_F64); + } + Vectorized rsqrt() const { + return svdivr_f64_x(ptrue, svsqrt_f64_x(ptrue, values), ONE_F64); + } + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powdx_u10sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. 
+ // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const { + svbool_t mask = svcmpeq_f64(ptrue, values, other); + return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + + Vectorized operator!=(const Vectorized& other) const { + svbool_t mask = svcmpne_f64(ptrue, values, other); + return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + + Vectorized operator<(const Vectorized& other) const { + svbool_t mask = svcmplt_f64(ptrue, values, other); + return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + + Vectorized operator<=(const Vectorized& other) const { + svbool_t mask = svcmple_f64(ptrue, values, other); + return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + + Vectorized operator>(const Vectorized& other) const { + svbool_t mask = svcmpgt_f64(ptrue, values, other); + return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + + Vectorized operator>=(const Vectorized& other) const { + svbool_t mask = svcmpge_f64(ptrue, values, other); + return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return svadd_f64_x(ptrue, a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return svsub_f64_x(ptrue, a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return svmul_f64_x(ptrue, a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return svdiv_f64_x(ptrue, a, b); +} + +// frac. 
Implement this here so we can use subtraction +Vectorized inline Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return svmax_f64_x(ptrue, a, b); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return svmin_f64_x(ptrue, a, b); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return svmin_f64_x(ptrue, max, svmax_f64_x(ptrue, min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return svmin_f64_x(ptrue, max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return svmax_f64_x(ptrue, min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + svorr_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + sveor_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & 
Vectorized(1.0); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0); +} + +template <> +inline void convert(const double* src, double* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svst1_f64(ptrue, dst + i, svldnt1_f64(ptrue, src + i)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + svbool_t pg = svwhilelt_b64(i, n); + svst1_f64(pg, dst + i, svldnt1_f64(pg, src + i)); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svmad_f64_x(ptrue, a, b, c); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svmsb_f64_x(ptrue, a, b, c); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svnmsb_f64_x(ptrue, a, b, c); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svnmad_f64_x(ptrue, a, b, c); +} + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_float.h new file mode 100644 index 0000000000000000000000000000000000000000..008b7bb711ad0888d8ba8fac509c6e8f31599c28 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_float.h @@ -0,0 +1,760 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) +#include +#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code +#else +#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code +#endif + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_float32_t values; + + public: + using value_type = float; + using size_type = int; + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(float); + } + Vectorized() { + values = svdup_n_f32(0); + } + Vectorized(svfloat32_t v) : values(v) {} + Vectorized(float val) { + values = svdup_n_f32(val); + } + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) { + __at_align__ float buffer[size()] = {vals...}; + values = svld1_f32(ptrue, buffer); + } + operator svfloat32_t() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each element is 1 if the corresponding bit in + // 'mask' is set, 0 otherwise. + __at_align__ int32_t flag_arr[size()]; + for (int i = 0; i < size(); i++) { + flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0; + } + // Load the flag array into an SVE int32 vector. + svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr); + // Compare each lane of int_mask to 0; returns an svbool_t predicate where + // true indicates a nonzero flag. + svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0); + // Use svsel to select elements from b where the predicate is true, else + // from a. + svfloat32_t result = svsel_f32(blend_mask, b.values, a.values); + return Vectorized(result); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK); + return svsel_f32(mask, b, a); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { + __at_align__ float buffer[size()]; + for (int64_t i = 0; i < size(); i++) { + buffer[i] = base + i * step; + } + return svld1_f32(ptrue, buffer); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + if (count == 0) { + return a; + } else if (count < size()) { + return svsel_f32(svwhilelt_b32(0ull, count), b, a); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return svld1_f32(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b32(0ull, count); + return svld1_f32(pg, reinterpret_cast(ptr)); + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + svst1_f32(ptrue, 
reinterpret_cast(ptr), values); + } else { + svbool_t pg = svwhilelt_b32(0ull, count); + svst1_f32(pg, reinterpret_cast(ptr), values); + } + } + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + int64_t mask = 0; + __at_align__ int32_t mask_array[size()]; + + svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32); + svst1_s32( + ptrue, + mask_array, + svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); + } + return mask; + } + Vectorized isnan() const { + // NaN check + svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + bool has_inf_nan() const { + return svptest_any( + ptrue, + svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32)); + } + Vectorized map(float (*f)(float)) const { + __at_align__ float tmp[size()]; + store(tmp); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + return svabs_f32_x(ptrue, values); + } + Vectorized angle() const { + const auto nan_vec = svdup_n_f32(NAN); + const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32); + const auto pi = svdup_n_f32(c10::pi); + + const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32); + auto angle = svsel_f32(neg_mask, pi, ZERO_F32); + angle = svsel_f32(nan_mask, nan_vec, angle); + return angle; + } + Vectorized real() const { + return values; + } + Vectorized imag() const { + return Vectorized(0.f); + } + Vectorized conj() const { + return values; + } + Vectorized acos() const { + return USE_SLEEF( + Vectorized(Sleef_acosfx_u10sve(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshfx_u10sve(values)), 
map(std::acosh)); + } + Vectorized asin() const { + return USE_SLEEF( + Vectorized(Sleef_asinfx_u10sve(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhfx_u10sve(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atanfx_u10sve(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhfx_u10sve(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2fx_u10sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::atan2(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized copysign(const Vectorized& sign) const { + + USE_SLEEF( + { return Vectorized(Sleef_copysignfx_sve(values, sign)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erffx_u10sve(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcfx_u15sve(values)), map(std::erfc)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() const { + return USE_SLEEF( + Vectorized(Sleef_expfx_u10sve(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2fx_u10sve(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1fx_u10sve(values)), map(std::expm1)); + } + // Implementation copied from Arm Optimized Routines: + // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c + Vectorized exp_u20() const { + // special case to handle 
special inputs that are too large or too small + // i.e. where there's at least one element x, s.t. |x| >= 87.3... + svbool_t is_special_case = svacgt(svptrue_b32(), values, 0x1.5d5e2ap+6f); + if (svptest_any(svptrue_b32(), is_special_case)) { + return exp(); + } + const svfloat32_t ln2_hi = svdup_n_f32(0x1.62e4p-1f); + const svfloat32_t ln2_lo = svdup_n_f32(0x1.7f7d1cp-20f); + const svfloat32_t c1 = svdup_n_f32(0.5f); + const svfloat32_t inv_ln2 = svdup_n_f32(0x1.715476p+0f); + + const float shift = 0x1.803f8p17f; + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmad_x(svptrue_b32(), inv_ln2, values, shift); + svfloat32_t n = svsub_x(svptrue_b32(), z, shift); + + /* r = x - n*ln2/N. */ + svfloat32_t r = values; + r = svmls_x(svptrue_b32(), r, n, ln2_hi); + r = svmls_x(svptrue_b32(), r, n, ln2_lo); + + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa(svreinterpret_u32(z)); + + /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */ + svfloat32_t r2 = svmul_x(svptrue_b32(), r, r); + svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1); + return svmla_x(svptrue_b32(), scale, scale, poly); + } + Vectorized fexp_u20() const { + return exp_u20(); + } + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmodfx_sve(values, q)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_q[size()]; + store(tmp); + q.store(tmp_q); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::fmod(tmp[i], tmp_q[i]); + } + return loadu(tmp); + })} Vectorized hypot(const Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotfx_u05sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + 
Vectorized igamma(const Vectorized& x) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterfx_sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::nextafter(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logfx_u10sve(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2fx_u10sve(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10fx_u10sve(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pfx_u10sve(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sinfx_u10sve(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhfx_u10sve(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosfx_u10sve(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshfx_u10sve(values)), map(std::cosh)); + } + Vectorized ceil() const { + return svrintp_f32_x(ptrue, values); + } + Vectorized floor() const { + return svrintm_f32_x(ptrue, values); + } + Vectorized neg() const { + return svneg_f32_x(ptrue, values); + } 
+ Vectorized round() const { + return svrinti_f32_x(ptrue, values); + } + Vectorized tan() const { + return USE_SLEEF( + Vectorized(Sleef_tanfx_u10sve(values)), map(std::tan)); + } + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179 + Vectorized tanh() const { + // Constants used for the tanh calculation. + const svfloat32_t CONST_1 = + svdup_n_f32(1.f); // Constant 1.0f for the tanh formula. + const svfloat32_t CONST_2 = svdup_n_f32( + 2.f); // Constant 2.0f for the tanh formula (used in exp(2x)). + const svfloat32_t CONST_MIN_TANH = svdup_n_f32( + -10.f); // Minimum threshold for input values to prevent overflow. + const svfloat32_t CONST_MAX_TANH = svdup_n_f32( + 10.f); // Maximum threshold for input values to prevent overflow. + + // Step 1: Clamp the values within the range [-10, 10] to prevent overflow + // during exponentiation. The tanh function approaches ±1 rapidly as the + // input grows large, so we limit the input range to avoid numerical + // instability. svmax_f32_z ensures values are greater than -10, and + // svmin_f32_z ensures they are less than 10. + svfloat32_t x = svmin_f32_z( + ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH); + + // Step 2: Calculate exp(2 * x), where x is the clamped value. + // svmul_f32_z computes 2 * x, and exp_u20() computes the exponential of + // the result (via Vectorized, then auto-converts back to + // svfloat32_t). + svfloat32_t exp2x = + Vectorized(svmul_f32_z(ptrue, CONST_2, x)).exp_u20(); + + // Step 3: Calculate the numerator of the tanh function, which is exp(2x) + // - 1. + svfloat32_t num = svsub_f32_z(ptrue, exp2x, CONST_1); + + // Step 4: Calculate the denominator of the tanh function, which is exp(2x) + // + 1. + svfloat32_t den = svadd_f32_z(ptrue, exp2x, CONST_1); + + // Step 5: Calculate the tanh function as the ratio of the numerator and + // denominator: num / den. 
+ svfloat32_t tanh = svdiv_f32_z(ptrue, num, den); + + // Return the calculated tanh values. + return tanh; + } + Vectorized trunc() const { + return svrintz_f32_x(ptrue, values); + } + Vectorized lgamma() const { + return USE_SLEEF( + Vectorized(Sleef_lgammafx_u10sve(values)), map(std::lgamma)); + } + Vectorized sqrt() const { + return svsqrt_f32_x(ptrue, values); + } + Vectorized reciprocal() const { + return svdivr_f32_x(ptrue, values, ONE_F32); + } + Vectorized rsqrt() const { + return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32); + } + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powfx_u10sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const { + svbool_t mask = svcmpeq_f32(ptrue, values, other); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + + Vectorized operator!=(const Vectorized& other) const { + svbool_t mask = svcmpne_f32(ptrue, values, other); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + + Vectorized operator<(const Vectorized& other) const { + svbool_t mask = svcmplt_f32(ptrue, values, other); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + + Vectorized operator<=(const Vectorized& other) const { + svbool_t mask = svcmple_f32(ptrue, values, other); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + + Vectorized operator>(const Vectorized& other) const { + svbool_t mask = svcmpgt_f32(ptrue, values, other); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + + Vectorized operator>=(const Vectorized& other) const { + svbool_t mask = 
svcmpge_f32(ptrue, values, other); + return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return svadd_f32_x(ptrue, a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return svsub_f32_x(ptrue, a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return svmul_f32_x(ptrue, a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return svdiv_f32_x(ptrue, a, b); +} + +// frac. Implement this here so we can use subtraction +Vectorized inline Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return svmax_f32_x(ptrue, a, b); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return svmin_f32_x(ptrue, a, b); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return svmin_f32_x(ptrue, max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return svmax_f32_x(ptrue, min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const float* src, float* dst, int64_t n) { + const 
int64_t fraction = n % Vectorized::size(); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + svbool_t pg = svwhilelt_b32(i, n); + svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i)); + } +} + +template <> +inline void convert(const float* src, at::Half* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized::size()); + svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svfloat16_t src_vec = svuzp1_f16( + svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); + svst1_f16(pg_16, reinterpret_cast(dst) + i, src_vec); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_16 = svwhilelt_b16(i, n); + pg_32 = svwhilelt_b32(i, n); + svfloat16_t src_vec = svuzp1_f16( + svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); + svst1_f16(pg_16, reinterpret_cast(dst) + i, src_vec); + } +} + +template <> +inline void convert(const at::Half* src, float* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized::size()); + svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svfloat16_t src_vec = svzip1_f16( + svldnt1_f16(pg_16, reinterpret_cast(src) + i), + ZERO_F16); + svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_16 = svwhilelt_b16(i, n); + pg_32 = svwhilelt_b32(i, n); + svfloat16_t src_vec = svzip1_f16( + svldnt1_f16(pg_16, reinterpret_cast(src) + i), + ZERO_F16); + svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); + } +} + +template 
<> +inline void convert(const bool* src, float* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); + svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); + svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); + svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_8 = svwhilelt_b8(i, n); + pg_32 = svwhilelt_b32(i, n); + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); + svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); + svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32)); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svmad_f32_x(ptrue, a, b, c); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svmsb_f32_x(ptrue, a, b, c); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svnmsb_f32_x(ptrue, a, b, c); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return svnmad_f32_x(ptrue, a, b, c); +} + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_int.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_int.h new file mode 100644 index 0000000000000000000000000000000000000000..3dee484491f505993e1c523591b88747e782ede0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_int.h @@ -0,0 +1,504 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE) + +#define VEC_INT_SVE_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + private: \ + vls_int##bit##_t values; \ + \ + public: \ + using value_type = int##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() { \ + values = svdup_n_s##bit(0); \ + } \ + Vectorized(svint##bit##_t v) : values(v) {} \ + Vectorized(int##bit##_t val) { \ + values = svdup_n_s##bit(val); \ + } \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... 
vals) { \ + __at_align__ int##bit##_t buffer[size()] = {vals...}; \ + values = svld1_s##bit(ptrue, buffer); \ + } \ + operator svint##bit##_t() const { \ + return values; \ + } \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b) { \ + __at_align__ int##bit##_t flag_arr[size()]; \ + for (int i = 0; i < size(); ++i) { \ + flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0; \ + } \ + svbool_t blend_mask = svcmpne_n_s##bit( \ + svptrue_b##bit(), svld1_s##bit(svptrue_b##bit(), flag_arr), 0); \ + return Vectorized( \ + svsel_s##bit(blend_mask, b.values, a.values)); \ + } \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + svbool_t mask = svcmpeq_s##bit(ptrue, mask_, ALL_S##bit##_TRUE_MASK); \ + return svsel_s##bit(mask, b, a); \ + } \ + /* step sometimes requires a higher precision type (e.g., T=int, \ + * step_t=double) */ \ + template \ + static Vectorized arange( \ + int##bit##_t base = 0, \ + step_t step = static_cast(1)) { \ + __at_align__ int##bit##_t buffer[size()]; \ + for (int64_t i = 0; i < size(); i++) { \ + buffer[i] = base + i * step; \ + } \ + return svld1_s##bit(ptrue, buffer); \ + } \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + int##bit##_t count = size()) { \ + if (count == 0) { \ + return a; \ + } else if (count < size()) { \ + return svsel_s##bit(svwhilelt_b##bit(0ull, count), b, a); \ + } \ + return b; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + int64_t count = size()) { \ + if (count == size()) \ + return svld1_s##bit( \ + ptrue, reinterpret_cast(ptr)); \ + svbool_t pg = svwhilelt_b##bit(0ull, count); \ + return svld1_s##bit(pg, reinterpret_cast(ptr)); \ + } \ + void store(void* ptr, int64_t count = size()) const { \ + if (count == size()) { \ + svst1_s##bit(ptrue, reinterpret_cast(ptr), values); \ + } else { \ + svbool_t pg = svwhilelt_b##bit(0ull, count); \ + svst1_s##bit(pg, 
reinterpret_cast(ptr), values); \ + } \ + } \ + const int##bit##_t& operator[](int idx) const = delete; \ + int##bit##_t& operator[](int idx) = delete; \ + Vectorized abs() const { \ + return svabs_s##bit##_x(ptrue, values); \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return svdup_n_s##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized frac() const; \ + Vectorized neg() const { \ + return svneg_s##bit##_x(ptrue, values); \ + } \ + Vectorized operator==( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpeq_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpne_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator<( \ + const Vectorized& other) const { \ + svbool_t mask = svcmplt_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmple_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpgt_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpge_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized eq(const Vectorized& other) const; \ + Vectorized ne(const Vectorized& other) const; \ + Vectorized gt(const Vectorized& other) const; \ + Vectorized ge(const Vectorized& other) const; \ + 
Vectorized lt(const Vectorized& other) const; \ + Vectorized le(const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return svadd_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return svsub_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return svmul_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline maximum( \ + const Vectorized& a, const Vectorized& b) { \ + return svmax_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline minimum( \ + const Vectorized& a, const Vectorized& b) { \ + return svmin_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + return svmin_s##bit##_x(ptrue, max, svmax_s##bit##_x(ptrue, min, a)); \ + } \ + template <> \ + Vectorized inline clamp_max( \ + const Vectorized& a, \ + const Vectorized& max) { \ + return svmin_s##bit##_x(ptrue, max, a); \ + } \ + template <> \ + Vectorized inline clamp_min( \ + const Vectorized& a, \ + const Vectorized& min) { \ + return svmax_s##bit##_x(ptrue, min, a); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return svand_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return svorr_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return sveor_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + inline Vectorized operator~( \ + const Vectorized& a) { \ + return sveor_s##bit##_x(ptrue, a, svdup_n_s##bit(-1)); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return 
(*this == other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } + +VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int64_t), 64) +VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int32_t), 32) +VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int16_t), 16) +VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int8_t), 8) + +template +Vectorized inline intdiv_nosve( + const Vectorized& a, + const Vectorized& b) { + T values_a[Vectorized::size()]; + T values_b[Vectorized::size()]; + a.store(values_a); + b.store(values_b); + for (int i = 0; i != Vectorized::size(); i++) { + values_a[i] /= values_b[i]; + } + return Vectorized::loadu(values_a); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return svdiv_s64_x(ptrue, a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return svdiv_s32_x(ptrue, a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return intdiv_nosve(a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return intdiv_nosve(a, b); +} + +template <> +inline void convert(const int32_t* src, int64_t* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); + svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); 
+#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) + svst1_s64(pg_64, dst + i, svunpklo_s64(svldnt1_s32(pg_32, src + i))); +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_32 = svwhilelt_b32(i, n); + pg_64 = svwhilelt_b64(i, n); + svst1_s64(pg_64, dst + i, svunpklo_s64(svldnt1_s32(pg_32, src + i))); + } +} + +template <> +inline void convert(const int64_t* src, float* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); + svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i); + svfloat32_t src_vec_f32 = + svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); + svst1_f32(pg_32, dst + i, src_vec_f32); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_32 = svwhilelt_b32(i, n); + pg_64 = svwhilelt_b64(i, n); + svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i); + svfloat32_t src_vec_f32 = + svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); + svst1_f32(pg_32, dst + i, src_vec_f32); + } +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg = svwhilelt_b32(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svint32_t src_vec = svldnt1_s32(pg, src + i); + svst1_f32(pg, dst + i, svcvt_f32_s32_x(pg, src_vec)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg = svwhilelt_b32(i, n); + svint32_t src_vec = svldnt1_s32(pg, src + i); + svst1_f32(pg, dst + i, svcvt_f32_s32_x(pg, src_vec)); + } +} + +template <> +inline void convert(const bool* src, int64_t* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_8 = 
svwhilelt_b8(0ull, Vectorized::size()); + svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint64_t src_vec_u64 = + svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); + svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); + svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_8 = svwhilelt_b8(i, n); + pg_64 = svwhilelt_b64(i, n); + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint64_t src_vec_u64 = + svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); + svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); + svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); + } +} + +template <> +inline void convert(const bool* src, int32_t* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); + svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); + svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); + svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); + svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_8 = svwhilelt_b8(i, n); + pg_32 = svwhilelt_b32(i, n); + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); + svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); + svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); + } +} + +template <> +inline void convert(const uint8_t* src, bool* dst, int64_t n) { + 
const int64_t fraction = n % Vectorized::size(); + svbool_t pg = svwhilelt_b8(0ull, Vectorized::size()); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); + svst1_u8( + pg, + reinterpret_cast(dst) + i, + svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg = svwhilelt_b8(i, n); + svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); + svst1_u8( + pg, + reinterpret_cast(dst) + i, + svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); + } +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return svlsl_s64_x(ptrue, a, svreinterpret_u64_s64(b)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return svlsl_s32_x(ptrue, a, svreinterpret_u32_s32(b)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return svlsl_s16_x(ptrue, a, svreinterpret_u16_s16(b)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return svlsl_s8_x(ptrue, a, svreinterpret_u8_s8(b)); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return svasr_s64_x(ptrue, a, svreinterpret_u64_s64(b)); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return svasr_s32_x(ptrue, a, svreinterpret_u32_s32(b)); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return svasr_s16_x(ptrue, a, svreinterpret_u16_s16(b)); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b)); +} + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error 
"This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_qint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_qint.h new file mode 100644 index 0000000000000000000000000000000000000000..98d45ba0790f208cb165d29974d99ff1547999b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_qint.h @@ -0,0 +1,611 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with SVE] + +#include +#include +#include +#include +#include +#include + +#include + +// This file defines Vectorized<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vectorized, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vectorized -> 4x Vectorized +// Vectorized -> 4x Vectorized +// Vectorized -> 1x Vectorized +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
+ +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE) + +// NOTE: These are low-performance implementations that we fall back on +// if we are not building with SVE. This may not be an issue, because +// currently for quantization we assume the user has at least SVE +// installed, so these can simply act as a reference implementation. +// +// If in the future we relax this requirement (SVE+), we should probably +// revisit these implementations + +template < + typename T, + typename float_vec_return_type_, + typename int_vec_return_type_, + int size_> +struct VectorizedQuantizedConverter { + using size_type = int; + static constexpr size_type size() { + return size_; + } + + static constexpr int float_num_vecs() { + return size() / Vectorized::size(); + } + + static constexpr int int_num_vecs() { + return size() / Vectorized::size(); + } + + using float_vec_return_type = float_vec_return_type_; + using int_vec_return_type = int_vec_return_type_; + + using value_type = typename T::underlying; + std::array vals; + + VectorizedQuantizedConverter(T val) { + for (size_t i = 0; i < size(); ++i) { + vals[i] = val.val_; + } + } + + VectorizedQuantizedConverter(const void* ptr) { + memcpy(vals.data(), ptr, sizeof(value_type) * size()); + } + + void store(void* ptr, int count = size()) const { + memcpy(ptr, vals.data(), count * sizeof(value_type)); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + 
float_vec_return_type rv; + float tmp_scale[Vectorized::size()]; + float tmp_zero_point[Vectorized::size()]; + scale.store(tmp_scale); + zero_point.store(tmp_zero_point); + for (int i = 0; i < float_num_vecs(); ++i) { + float tmp_vals[Vectorized::size()]; + for (int j = 0; j < Vectorized::size(); ++j) { + tmp_vals[j] = at::native::dequantize_val( + tmp_scale[j], + tmp_zero_point[j], + T(vals[Vectorized::size() * i + j])); + } + rv[i] = Vectorized::loadu(tmp_vals); + } + return rv; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + float_vec_return_type rv; + float tmp_scale[Vectorized::size()]; + float tmp_zero_point[Vectorized::size()]; + scale.store(tmp_scale); + zero_point.store(tmp_zero_point); + for (int i = 0; i < float_num_vecs(); ++i) { + float tmp_vals[Vectorized::size()]; + for (int j = 0; j < Vectorized::size(); ++j) { + tmp_vals[j] = at::native::dequantize_val( + tmp_scale[j], + tmp_zero_point[j], + T(vals[Vectorized::size() * i + j])); + } + rv[i] = Vectorized::loadu(tmp_vals); + } + return rv; + } + + protected: + VectorizedQuantizedConverter() {} +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + VECTOR_WIDTH / 4> { + Vectorized() + : VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + VECTOR_WIDTH / 4>() {} + Vectorized(c10::qint32 val) + : VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + VECTOR_WIDTH / 4>(val) {} + Vectorized(const void* ptr) + : VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + VECTOR_WIDTH / 4>(ptr) {} +#if 1 + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the 
output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } +#else + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return svld1_s32(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b32(0ull, count); + return svld1_s32(pg, reinterpret_cast(ptr)); + } +#endif + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + std::array qvals; + std::array::size()> float_vals; + + for (int i = 0; i < float_num_vecs(); ++i) { + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::qint32*)qvals.data(), + Vectorized::size() * float_num_vecs()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + for (size_t i = 0; 
i < size(); ++i) { + retval[0].vals[i] = vals[i] - b.vals[i]; + } + return retval; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = + nearbyint(static_cast(inp[0].vals[i]) * multiplier) + + zero_point; + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + Vectorized retval; + for (size_t i = 0; i < std::decay_t::size(); ++i) { + retval.vals[i] = a.vals[i] * b.vals[i]; + } + return retval; +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + Vectorized retval; + for (size_t i = 0; i < std::decay_t::size(); ++i) { + retval.vals[i] = a.vals[i] + b.vals[i]; + } + return retval; +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH> { + Vectorized() + : VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH>() {} + Vectorized(c10::qint8 val) + : VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH>(val) {} + Vectorized(const void* ptr) + : VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH>(ptr) {} + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. 
We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + std::array qvals; + std::array::size()> float_vals; + + for (int i = 0; i < float_num_vecs(); ++i) { + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::qint8*)qvals.data(), + Vectorized::size() * float_num_vecs()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + constexpr int elem_per_int_vec = size() / int_num_vecs(); + for (size_t i = 0; i < int_num_vecs(); ++i) { + for (size_t j = 0; j < elem_per_int_vec; ++j) { + retval[i].vals[j] = + static_cast(vals[i * elem_per_int_vec + j]) - + static_cast(b.vals[i * elem_per_int_vec + j]); + } + } + return retval; + } + static Vectorized requantize_from_int( + const 
int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + constexpr int elem_per_int_vec = size() / int_num_vecs(); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + Vectorized retval; + for (size_t i = 0; i < int_num_vecs(); ++i) { + for (size_t j = 0; j < elem_per_int_vec; ++j) { + int32_t rounded = + nearbyint(static_cast(inp[i].vals[j]) * multiplier) + + zero_point; + retval.vals[i * elem_per_int_vec + j] = + std::min(std::max(rounded, min_val), max_val); + } + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH> { + Vectorized() + : VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH>() {} + Vectorized(c10::quint8 val) + : VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH>(val) {} + Vectorized(const void* ptr) + : VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH>(ptr) {} +#if 1 + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } +#else + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return svld1_u8(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b8(0ull, count); + return svld1_u8(pg, reinterpret_cast(ptr)); + } +#endif + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + std::array qvals; + std::array::size()> float_vals; + + for (int i = 0; i < float_num_vecs(); ++i) { + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::quint8*)qvals.data(), + Vectorized::size() * float_num_vecs()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (size_t i = 0; i < size(); ++i) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + constexpr int elem_per_int_vec = size() / int_num_vecs(); + for (size_t i = 0; i < int_num_vecs(); ++i) { + for (size_t j = 0; j < elem_per_int_vec; ++j) { + retval[i].vals[j] = + static_cast(vals[i * elem_per_int_vec + j]) - + static_cast(b.vals[i * elem_per_int_vec + 
j]); + } + } + return retval; + } + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + constexpr int elem_per_int_vec = size() / int_num_vecs(); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + Vectorized retval; + for (size_t i = 0; i < int_num_vecs(); ++i) { + for (size_t j = 0; j < elem_per_int_vec; ++j) { + int32_t rounded = + nearbyint(static_cast(inp[i].vals[j]) * multiplier) + + zero_point; + retval.vals[i * elem_per_int_vec + j] = + std::min(std::max(rounded, min_val), max_val); + } + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128.h new file mode 100644 index 0000000000000000000000000000000000000000..766f980da7088f7f7f830bf84299de836e361837 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// ARM NEON uses 128-bit vector registers. + +#include + +#ifdef __aarch64__ +#if !defined(CPU_CAPABILITY_SVE) +#include +#include +#include +#include +#include +#include +#endif + +#include +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..5ae7920fa4a90b434bfba8238c96926bcc522f96 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h @@ -0,0 +1,703 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] +#include +#include +#include +#include +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Following vec128_half_neon.h, we only support aarch64. +#if !defined(C10_MOBILE) && defined(__aarch64__) +#ifdef __BIG_ENDIAN__ +#error "Big endian is not supported." +#endif + +// GCC does not properly optimize bf16 operators +#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19) +#define BF16_ARITHMETIC_SUPPORTED() 1 +#else +#define BF16_ARITHMETIC_SUPPORTED() 0 +#endif + +// Unlike the float16_t family of types, bfloat16_t is not available +// when we're not targeting bfloat16 hardware support on some +// platforms (but not Mac, so we have to be careful not to shadow the +// definitions in case they are actually there!). (See +// https://godbolt.org/z/orv6e94n4 ) So, we need to handle it as +// uint16_t in that case. 
+#define IMPLEMENT_AT_BF16_SHIM(vec_suffix) \ + inline at_bfloat16x4_t at_vget_low_bf16(at_bfloat16x8_t a) { \ + return vget_low_##vec_suffix(a); \ + } \ + \ + inline at_bfloat16x4_t at_vget_high_bf16(at_bfloat16x8_t a) { \ + return vget_high_##vec_suffix(a); \ + } \ + \ + inline at_bfloat16x8_t at_vcombine_bf16( \ + at_bfloat16x4_t low, at_bfloat16x4_t high) { \ + return vcombine_##vec_suffix(low, high); \ + } \ + \ + inline at_bfloat16x8_t at_vdupq_n_bf16(at_bfloat16_t value) { \ + return vdupq_n_##vec_suffix(value); \ + } \ + \ + inline at_bfloat16x8_t at_vld1q_bf16(const at_bfloat16_t* ptr) { \ + return vld1q_##vec_suffix(ptr); \ + } \ + \ + inline void at_vst1q_bf16(at_bfloat16_t* ptr, at_bfloat16x8_t value) { \ + vst1q_##vec_suffix(ptr, value); \ + } \ + \ + template \ + inline at_bfloat16x8_t at_vreinterpretq_bf16_u16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpretq_bf16_u16(val); \ + } \ + } \ + template \ + inline at_bfloat16x4_t at_vreinterpret_bf16_u16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpret_bf16_u16(val); \ + } \ + } \ + template \ + inline uint16x8_t at_vreinterpretq_u16_bf16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpretq_u16_bf16(val); \ + } \ + } \ + template \ + inline uint16x4_t at_vreinterpret_u16_bf16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpret_u16_bf16(val); \ + } \ + } + +#ifdef __ARM_FEATURE_BF16 +using at_bfloat16x8_t = bfloat16x8_t; +using at_bfloat16x4_t = bfloat16x4_t; +using at_bfloat16_t = bfloat16_t; +IMPLEMENT_AT_BF16_SHIM(bf16) +#define at_vsetq_lane_bf16 vsetq_lane_bf16 +#define at_vgetq_lane_bf16 vgetq_lane_bf16 +#else +using at_bfloat16x8_t = uint16x8_t; +using at_bfloat16x4_t = uint16x4_t; +using at_bfloat16_t = uint16_t; +IMPLEMENT_AT_BF16_SHIM(u16) +#define at_vsetq_lane_bf16 vsetq_lane_u16 +#define at_vgetq_lane_bf16 
vgetq_lane_u16 +#endif // __ARM_FEATURE_BF16 + +template +struct BlendBFloat16Regs { + static at_bfloat16x8_t impl( + const at_bfloat16x8_t& a, + const at_bfloat16x8_t& b, + at_bfloat16x8_t& res); +}; + +template +struct BlendBFloat16Regs { + static at_bfloat16x8_t impl( + const at_bfloat16x8_t& a, + const at_bfloat16x8_t& b, + at_bfloat16x8_t& res) { + return at_vsetq_lane_bf16(at_vgetq_lane_bf16(b, index), res, index); + } +}; + +template +struct BlendBFloat16Regs { + static at_bfloat16x8_t impl( + const at_bfloat16x8_t& a, + const at_bfloat16x8_t& b, + at_bfloat16x8_t& res) { + return at_vsetq_lane_bf16(at_vgetq_lane_bf16(a, index), res, index); + } +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16< + at_bfloat16x8_t, + c10::BFloat16, + BlendBFloat16Regs, + Vectorized> { + using Base = Vectorized16< + at_bfloat16x8_t, + c10::BFloat16, + BlendBFloat16Regs, + Vectorized>; + friend Base; + friend std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a); + friend Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b); + + private: + Vectorized map2( + const Vectorized& second, + c10::BFloat16 (*const f)(c10::BFloat16, c10::BFloat16)) const { + __at_align__ c10::BFloat16 tmp_first[size()]; + __at_align__ c10::BFloat16 tmp_second[size()]; + store(tmp_first); // store this to tmp_first + second.store(tmp_second); + for (const auto i : c10::irange(size())) { + tmp_first[i] = f(tmp_first[i], tmp_second[i]); + } + return loadu(tmp_first); + } + + static float32x4_t convert_f32_bf16(at_bfloat16x4_t bf16) { +#ifdef __ARM_FEATURE_BF16 + return vcvt_f32_bf16(bf16); +#else + int32x4_t shift = vdupq_n_s32(16); + return vreinterpretq_f32_u32(vshlq_u32(vmovl_u16(bf16), shift)); +#endif // __ARM_FEATURE_BF16 + } + + static at_bfloat16x4_t convert_bf16_f32(const Vectorized& f32) { +#ifdef __ARM_FEATURE_BF16 + return vcvt_bf16_f32(f32); +#else + 
static_assert(std::is_same_v); + uint32x4_t as_uint32 = vreinterpretq_u32_f32(f32); + uint32x4_t rounding_bias = vaddq_u32( + vandq_u32(vshrq_n_u32(as_uint32, 16), vdupq_n_u32(1)), + vdupq_n_u32(0x7FFF)); + at_bfloat16x4_t rounded = + vshrn_n_u32(vaddq_u32(as_uint32, rounding_bias), 16); + const auto bf16_nan = vdup_n_u16(0x7FC0); + return vbsl_u16( + vmovn_u32(vreinterpretq_u32_f32(f32.isnan())), bf16_nan, rounded); +#endif // __ARM_FEATURE_BF16 + } + + Vectorized map_with_vec_float_method( + Vectorized (Vectorized::*m)() const) const { + float32x4_t v00 = convert_f32_bf16(at_vget_low_bf16(values)); + float32x4_t v01 = convert_f32_bf16(at_vget_high_bf16(values)); + Vectorized mv0 = (Vectorized(v00).*m)(); + Vectorized mv1 = (Vectorized(v01).*m)(); + at_bfloat16x4_t r00 = convert_bf16_f32(mv0); + at_bfloat16x4_t r01 = convert_bf16_f32(mv1); + return Vectorized(at_vcombine_bf16(r00, r01)); + } + + Vectorized map2_with_vec_float_method( + const Vectorized& second, + Vectorized (Vectorized::*m)(const Vectorized&) + const) const { + float32x4_t v00 = convert_f32_bf16(at_vget_low_bf16(values)); + float32x4_t v01 = convert_f32_bf16(at_vget_high_bf16(values)); + float32x4_t second_v00 = convert_f32_bf16(at_vget_low_bf16(second.values)); + float32x4_t second_v01 = convert_f32_bf16(at_vget_high_bf16(second.values)); + Vectorized mv0 = (Vectorized(v00).*m)(second_v00); + Vectorized mv1 = (Vectorized(v01).*m)(second_v01); + at_bfloat16x4_t r00 = convert_bf16_f32(mv0); + at_bfloat16x4_t r01 = convert_bf16_f32(mv1); + return Vectorized(at_vcombine_bf16(r00, r01)); + } + + Vectorized map2_bitmask_with_vec_float_method( + const Vectorized& second, + Vectorized (Vectorized::*m)(const Vectorized&) + const) const { + float32x4_t v00 = convert_f32_bf16(at_vget_low_bf16(values)); + float32x4_t v01 = convert_f32_bf16(at_vget_high_bf16(values)); + float32x4_t second_v00 = convert_f32_bf16(at_vget_low_bf16(second.values)); + float32x4_t second_v01 = 
convert_f32_bf16(at_vget_high_bf16(second.values)); + Vectorized mv0 = (Vectorized(v00).*m)(second_v00); + Vectorized mv1 = (Vectorized(v01).*m)(second_v01); + // Assume the operator returns a bitmask, not "real" floats, and + // just narrow the bits. All-ones is a NaN and will get mangled by + // conversion! + at_bfloat16x4_t r00 = + at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); + at_bfloat16x4_t r01 = + at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); + return Vectorized(at_vcombine_bf16(r00, r01)); + } + + public: + using Vectorized16::Vectorized16; + + Vectorized() = default; + + Vectorized(c10::BFloat16 val) + : Vectorized16(at_vdupq_n_bf16(c10::bit_cast(val.x))) {} + Vectorized(float val) : Vectorized(c10::BFloat16(val)) {} + Vectorized( + value_type val0, + value_type val1, + value_type val2, + value_type val3, + value_type val4, + value_type val5, + value_type val6, + value_type val7) + : Vectorized16(at_bfloat16x8_t{ + c10::bit_cast(val0.x), + c10::bit_cast(val1.x), + c10::bit_cast(val2.x), + c10::bit_cast(val3.x), + c10::bit_cast(val4.x), + c10::bit_cast(val5.x), + c10::bit_cast(val6.x), + c10::bit_cast(val7.x)}) {} + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // NOTE: blendv has the same problems as it does for Half; see comments in + // vec128_half_neon.h. 
+ Vectorized vec(mask.values); + vec.values = at_vreinterpretq_bf16_u16(vbslq_u16( + at_vreinterpretq_u16_bf16(vec.values), + at_vreinterpretq_u16_bf16(b.values), + at_vreinterpretq_u16_bf16(a.values))); + return vec; + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + uint16_t pre_mask[size()] = {0}; + for (int i = 0; i < count; i++) { + pre_mask[i] = 0xFFFF; + } + uint16x8_t mask = vld1q_u16(pre_mask); + + Vectorized vec(at_vreinterpretq_bf16_u16(vbslq_u16( + mask, + at_vreinterpretq_u16_bf16(b.values), + at_vreinterpretq_u16_bf16(a.values)))); + + return vec; + } + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) { + return at_vld1q_bf16(reinterpret_cast(ptr)); + } + __at_align__ at_bfloat16_t tmp_values[size()]; + std::memset(tmp_values, 0, sizeof(tmp_values)); + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(at_bfloat16_t)); + return at_vld1q_bf16(reinterpret_cast(tmp_values)); + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + at_vst1q_bf16(reinterpret_cast(ptr), values); + return; + } else { + at_bfloat16_t tmp_values[size()]; + at_vst1q_bf16(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(at_bfloat16_t)); + } + } + Vectorized isnan() const { + // NOTE: we could make this faster by doing vectorized checks of + // exponent/payload bits. 
+ __at_align__ c10::BFloat16 tmp[size()]; + __at_align__ c10::BFloat16 res[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i])) { + std::memset(static_cast(&res[i]), 0xFF, sizeof(c10::BFloat16)); + } else { + std::memset(static_cast(&res[i]), 0, sizeof(c10::BFloat16)); + } + } + return loadu(res); + } + bool has_inf_nan() const { + __at_align__ c10::BFloat16 tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } +#define DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(name) \ + Vectorized name() const { \ + return map_with_vec_float_method(&Vectorized::name); \ + } + +#define DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(name) \ + Vectorized name(const Vectorized& other) const { \ + return map2_bitmask_with_vec_float_method( \ + other, &Vectorized::name); \ + } + + Vectorized frac() const; + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt) + +#ifdef __ARM_FEATURE_BF16 + // Flip sign bit + Vectorized neg() const { + return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768)); + } + // Fast reciprocal is fine because we are truncating results + Vectorized reciprocal() const { + auto x = vcvtq_low_f32_bf16(values); + auto y = vcvtq_high_f32_bf16(values); + x = vrecpeq_f32(x); + y = vrecpeq_f32(y); + return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y); + } + // Clearing the sign bit + Vectorized abs() const { + return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF); + } +#else + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal) +#endif + +// These functions are optimized on clang-21+ +#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21) + Vectorized operator==( + const Vectorized& other) const { + return values == 
other.values; + } + + Vectorized operator!=( + const Vectorized& other) const { + return values != other.values; + } + + Vectorized operator<( + const Vectorized& other) const { + return values < other.values; + } + + Vectorized operator<=( + const Vectorized& other) const { + return values <= other.values; + } + + Vectorized operator>( + const Vectorized& other) const { + return values > other.values; + } + + Vectorized operator>=( + const Vectorized& other) const { + return values >= other.values; + } +#else + DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==) + DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=) + DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<) + DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=) + DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>) + DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=) +#endif + +#undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD +#undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; // Vectorized + +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + at_bfloat16x8_t x = a; + float32x4_t x1 = + Vectorized::convert_f32_bf16(at_vget_low_bf16(x)); + float32x4_t x2 = + Vectorized::convert_f32_bf16(at_vget_high_bf16(x)); + return {Vectorized(x1), Vectorized(x2)}; +} +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + at_bfloat16x4_t x1 = Vectorized::convert_bf16_f32(a); + at_bfloat16x4_t x2 = Vectorized::convert_bf16_f32(b); + return 
Vectorized(at_vcombine_bf16(x1, x2)); +} + +template +Vectorized binary_operator_via_float( + Op op, + const Vectorized& a, + const Vectorized& b) { + const auto [a_float_low, a_float_high] = convert_bfloat16_float(a); + const auto [b_float_low, b_float_high] = convert_bfloat16_float(b); + return convert_float_bfloat16( + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + return x + y; +#else + return binary_operator_via_float(std::plus>(), a, b); +#endif +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + return x - y; +#else + return binary_operator_via_float(std::minus>(), a, b); +#endif +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + return x * y; +#else + return binary_operator_via_float(std::multiplies>(), a, b); +#endif +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + return x / y; +#else + return binary_operator_via_float(std::divides>(), a, b); +#endif +} + +// frac. 
Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), + a, + b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), + a, + b); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(at_vreinterpretq_bf16_u16( + vandq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(at_vreinterpretq_bf16_u16( + vorrq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(at_vreinterpretq_bf16_u16( + veorq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized 
Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + bfloat16x8_t z = c; + return x * y + z; +#else + // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16! Also, + // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered + // elements, not the bottom and top half, so they don't seem + // particularly useful here. Ideally we would include dot product in + // the Vectorized interface... + return a * b + c; +#endif +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + bfloat16x8_t z = c; + return (-x) * y + z; +#else + // See NOTE [BF16 FMA] above. + return -a * b + c; +#endif +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + bfloat16x8_t z = c; + return x * y - z; +#else + // See NOTE [BF16 FMA] above. + return a * b - c; +#endif +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#if BF16_ARITHMETIC_SUPPORTED() + bfloat16x8_t x = a; + bfloat16x8_t y = b; + bfloat16x8_t z = c; + return (-x) * y - z; +#else + // See NOTE [BF16 FMA] above. 
+ return -a * b - c; +#endif +} + +#endif // !defined(C10_MOBILE) && defined(__aarch64__) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_convert.h new file mode 100644 index 0000000000000000000000000000000000000000..da9fb21eb24e3e9ad179fea82ad1ce6d242bc1a3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_convert.h @@ -0,0 +1,383 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { +#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) + +// Enable auto-vectorization for clang-17+ +// GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 +#if defined(__clang__) && (__clang_major__ >= 17) + +template +inline void convertImpl( + const from_type* __restrict src, + to_type* __restrict dst, + int64_t n) { + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + dst[i] = static_cast(src[i]); + } +} + +template +inline void convertFromBool( + const bool* __restrict src, + to_type* __restrict dst, + int64_t n) { + const uint8_t* srcPtr = reinterpret_cast(src); + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + dst[i] = srcPtr[i] != 0 ? 
static_cast(1) : static_cast(0); + } +} + +template +inline void convertToBool( + const from_type* __restrict src, + bool* __restrict dst, + int64_t n) { + uint8_t* dstPtr = reinterpret_cast(dst); + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + dstPtr[i] = src[i] != static_cast(0) ? 1 : 0; + } +} + +#define CONVERT_TEMPLATE(from_type, to_type) \ + template <> \ + inline void convert(const from_type* src, to_type* dst, int64_t n) { \ + return convertImpl(src, dst, n); \ + } + +#define CONVERT_FROM_BOOL_TEMPLATE(to_type) \ + inline void convert(const bool* src, to_type* dst, int64_t n) { \ + return convertFromBool(src, dst, n); \ + } + +#define CONVERT_TO_BOOL_TEMPLATE(from_type) \ + inline void convert(const from_type* src, bool* dst, int64_t n) { \ + return convertToBool(src, dst, n); \ + } + +CONVERT_TEMPLATE(uint8_t, uint8_t) +CONVERT_TEMPLATE(uint8_t, int8_t) +CONVERT_TEMPLATE(uint8_t, int16_t) +CONVERT_TEMPLATE(uint8_t, int32_t) +CONVERT_TEMPLATE(uint8_t, int64_t) +CONVERT_TEMPLATE(uint8_t, float) +CONVERT_TEMPLATE(uint8_t, double) +CONVERT_TO_BOOL_TEMPLATE(uint8_t) +CONVERT_TEMPLATE(int8_t, uint8_t) +CONVERT_TEMPLATE(int8_t, int8_t) +CONVERT_TEMPLATE(int8_t, int16_t) +CONVERT_TEMPLATE(int8_t, int32_t) +CONVERT_TEMPLATE(int8_t, int64_t) +CONVERT_TEMPLATE(int8_t, float) +CONVERT_TEMPLATE(int8_t, double) +CONVERT_TO_BOOL_TEMPLATE(int8_t) +CONVERT_TEMPLATE(int16_t, uint8_t) +CONVERT_TEMPLATE(int16_t, int8_t) +CONVERT_TEMPLATE(int16_t, int16_t) +CONVERT_TEMPLATE(int16_t, int32_t) +CONVERT_TEMPLATE(int16_t, int64_t) +CONVERT_TEMPLATE(int16_t, float) +CONVERT_TEMPLATE(int16_t, double) +CONVERT_TO_BOOL_TEMPLATE(int16_t) +CONVERT_TEMPLATE(int32_t, uint8_t) +CONVERT_TEMPLATE(int32_t, int8_t) +CONVERT_TEMPLATE(int32_t, int16_t) +CONVERT_TEMPLATE(int32_t, int32_t) +CONVERT_TEMPLATE(int32_t, int64_t) +CONVERT_TEMPLATE(int32_t, float) +CONVERT_TEMPLATE(int32_t, double) +CONVERT_TO_BOOL_TEMPLATE(int32_t) +CONVERT_TEMPLATE(int64_t, uint8_t) 
+CONVERT_TEMPLATE(int64_t, int8_t) +CONVERT_TEMPLATE(int64_t, int16_t) +CONVERT_TEMPLATE(int64_t, int32_t) +CONVERT_TEMPLATE(int64_t, int64_t) +CONVERT_TEMPLATE(int64_t, float) +CONVERT_TEMPLATE(int64_t, double) +CONVERT_TO_BOOL_TEMPLATE(int64_t) +CONVERT_TEMPLATE(float, uint8_t) +CONVERT_TEMPLATE(float, int8_t) +CONVERT_TEMPLATE(float, int16_t) +CONVERT_TEMPLATE(float, int32_t) +CONVERT_TEMPLATE(float, int64_t) +CONVERT_TEMPLATE(float, float) +CONVERT_TEMPLATE(float, double) +CONVERT_TO_BOOL_TEMPLATE(float) +CONVERT_TEMPLATE(double, uint8_t) +CONVERT_TEMPLATE(double, int8_t) +CONVERT_TEMPLATE(double, int16_t) +CONVERT_TEMPLATE(double, int32_t) +CONVERT_TEMPLATE(double, int64_t) +CONVERT_TEMPLATE(double, float) +CONVERT_TEMPLATE(double, double) +CONVERT_TO_BOOL_TEMPLATE(double) +CONVERT_FROM_BOOL_TEMPLATE(uint8_t) +CONVERT_FROM_BOOL_TEMPLATE(int8_t) +CONVERT_FROM_BOOL_TEMPLATE(int16_t) +CONVERT_FROM_BOOL_TEMPLATE(int32_t) +CONVERT_FROM_BOOL_TEMPLATE(int64_t) +CONVERT_FROM_BOOL_TEMPLATE(float) +CONVERT_FROM_BOOL_TEMPLATE(double) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#define CONVERT_FROM_FP16_TEMPLATE(to_type) \ + template <> \ + inline void convert(const at::Half* src, to_type* dst, int64_t n) { \ + const float16_t* srcPtr = reinterpret_cast(src); \ + return convertImpl(srcPtr, dst, n); \ + } + +#define CONVERT_TO_FP16_TEMPLATE(from_type) \ + template <> \ + inline void convert(const from_type* src, at::Half* dst, int64_t n) { \ + float16_t* dstPtr = reinterpret_cast(dst); \ + return convertImpl(src, dstPtr, n); \ + } + +CONVERT_FROM_FP16_TEMPLATE(uint8_t) +CONVERT_FROM_FP16_TEMPLATE(int8_t) +CONVERT_FROM_FP16_TEMPLATE(int16_t) +CONVERT_FROM_FP16_TEMPLATE(int32_t) +CONVERT_FROM_FP16_TEMPLATE(int64_t) +CONVERT_FROM_FP16_TEMPLATE(float16_t) +CONVERT_FROM_FP16_TEMPLATE(float) +CONVERT_FROM_FP16_TEMPLATE(double) +CONVERT_TO_FP16_TEMPLATE(uint8_t) +CONVERT_TO_FP16_TEMPLATE(int8_t) +CONVERT_TO_FP16_TEMPLATE(int16_t) +CONVERT_TO_FP16_TEMPLATE(int32_t) 
+CONVERT_TO_FP16_TEMPLATE(int64_t) +CONVERT_TO_FP16_TEMPLATE(float) +CONVERT_TO_FP16_TEMPLATE(double) + +inline void convertBoolToFp16Impl( + const bool* __restrict src, + at::Half* __restrict dst, + int64_t n) { + const uint8_t* srcPtr = reinterpret_cast(src); + float16_t* dstPtr = reinterpret_cast(dst); + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0; + } +} + +template <> +inline void convert(const bool* src, at::Half* dst, int64_t n) { + return convertBoolToFp16Impl(src, dst, n); +} + +inline void convertFp16ToBoolImpl( + const at::Half* __restrict src, + bool* __restrict dst, + int64_t n) { + const float16_t* srcPtr = reinterpret_cast(src); + uint8_t* dstPtr = reinterpret_cast(dst); + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0; + } +} + +template <> +inline void convert(const at::Half* src, bool* dst, int64_t n) { + return convertFp16ToBoolImpl(src, dst, n); +} + +#endif + +template +inline void convertFromBf16Impl( + const c10::BFloat16* __restrict src, + to_type* __restrict dst, + int64_t n) { + const uint16_t* srcPtr = reinterpret_cast(src); + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + uint32_t tmp = static_cast(srcPtr[i]) << 16; + float tmpF; + __builtin_memcpy(&tmpF, &tmp, sizeof(float)); + dst[i] = static_cast(tmpF); + } +} +#define CONVERT_FROM_BF16_TEMPLATE(to_type) \ + template <> \ + inline void convert(const c10::BFloat16* src, to_type* dst, int64_t n) { \ + return convertFromBf16Impl(src, dst, n); \ + } + +CONVERT_FROM_BF16_TEMPLATE(uint8_t) +CONVERT_FROM_BF16_TEMPLATE(int8_t) +CONVERT_FROM_BF16_TEMPLATE(int16_t) +CONVERT_FROM_BF16_TEMPLATE(int32_t) +CONVERT_FROM_BF16_TEMPLATE(int64_t) +CONVERT_FROM_BF16_TEMPLATE(float) +CONVERT_FROM_BF16_TEMPLATE(double) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +CONVERT_FROM_BF16_TEMPLATE(float16_t) +#endif + +#ifdef __ARM_FEATURE_BF16 + +// clang-[17, 
20] crashes when autovectorizing static cast to bf16 +// Below is a workaround to have some vectorization +// Works decently well for smaller int types +template +inline void convertToBf16Impl( + const from_type* __restrict src, + c10::BFloat16* __restrict dst, + uint64_t n) { + bfloat16_t* dstPtr = reinterpret_cast(dst); + uint64_t loopBound = n - (n % 16); + uint64_t i = 0; + for (; i < loopBound; i += 16) { + float32x4_t a, b, c, d; + a[0] = static_cast(src[i]); + a[1] = static_cast(src[i + 1]); + a[2] = static_cast(src[i + 2]); + a[3] = static_cast(src[i + 3]); + b[0] = static_cast(src[i + 4]); + b[1] = static_cast(src[i + 5]); + b[2] = static_cast(src[i + 6]); + b[3] = static_cast(src[i + 7]); + c[0] = static_cast(src[i + 8]); + c[1] = static_cast(src[i + 9]); + c[2] = static_cast(src[i + 10]); + c[3] = static_cast(src[i + 11]); + d[0] = static_cast(src[i + 12]); + d[1] = static_cast(src[i + 13]); + d[2] = static_cast(src[i + 14]); + d[3] = static_cast(src[i + 15]); + + vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b)); + vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d)); + } + +#pragma clang loop vectorize(disable) interleave(disable) unroll(disable) + for (; i < n; i++) { + float a = static_cast(src[i]); + dstPtr[i] = vcvth_bf16_f32(a); + } +} + +#define CONVERT_TO_BF16_TEMPLATE(from_type) \ + template <> \ + inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \ + return convertToBf16Impl(src, dst, n); \ + } + +CONVERT_TO_BF16_TEMPLATE(uint8_t) +CONVERT_TO_BF16_TEMPLATE(int8_t) +CONVERT_TO_BF16_TEMPLATE(int16_t) +CONVERT_TO_BF16_TEMPLATE(int32_t) + +#endif + +inline void convertBoolToBfloat16Impl( + const bool* __restrict src, + c10::BFloat16* __restrict dst, + int64_t n) { + const uint8_t* srcPtr = reinterpret_cast(src); + uint16_t* dstPtr = reinterpret_cast(dst); + uint64_t len = static_cast(n); + constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16 + for (uint64_t i = 0; i < len; i++) 
{ + dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0; + } +} + +template <> +inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) { + return convertBoolToBfloat16Impl(src, dst, n); +} + +inline void convertBfloat16ToBoolImpl( + const c10::BFloat16* __restrict src, + bool* __restrict dst, + int64_t n) { + uint8_t* dstPtr = reinterpret_cast(dst); + const uint16_t* srcPtr = reinterpret_cast(src); + uint64_t len = static_cast(n); + for (uint64_t i = 0; i < len; i++) { + // Check if all non-sign bits are 0 + bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0; + dstPtr[i] = isBf16Zero ? 0 : 1; + } +} + +template <> +inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) { + return convertBfloat16ToBoolImpl(src, dst, n); +} + +#endif + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_half_register_to_float(src[0]); + } +}; +template +struct VecConvert< + float, + 2, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + const auto [v0, v1] = convert_int8_to_float(src[0]); + return VectorizedN(v0, v1); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + uint16x8_t u16_8 = vld1q_u16(reinterpret_cast(&src[0])); + auto u16_low1 = vget_low_u16(u16_8); + auto u16_high1 = vget_high_u16(u16_8); + float32x4_t f32x4_0 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16)); + float32x4_t f32x4_1 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16)); + result[0] = f32x4_0; + result[1] = f32x4_1; + return result; + } +}; +// Half register to full register. 
+template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + uint16x4_t u16_8 = vld1_u16(reinterpret_cast(&src[0])); + float32x4_t f32x4_0 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16)); + result[0] = f32x4_0; + return result; + } +}; + +#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_double_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_double_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..f27f9b272224af260be8b9d25ce1b0f2d2f7be90 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_double_neon.h @@ -0,0 +1,591 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. 
+inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + float64x2_t values; + + public: + using value_type = double; + using size_type = int; + static constexpr size_type size() { + return 2; + } + Vectorized() { + values = vdupq_n_f64(0.0); + } + Vectorized(float64x2_t v) : values(v) {} + Vectorized(double val) { + values = vdupq_n_f64(val); + } + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... vals) { + __at_align__ double buffer[size()] = {vals...}; + values = vld1q_f64(buffer); + } + operator float64x2_t() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint64x2_t maskArray = { + (mask & 1ULL) ? 0xFFFFFFFFFFFFFFFF : 0, + (mask & 2ULL) ? 0xFFFFFFFFFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_f64(maskArray, b.values, a.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + return vbslq_f64(vreinterpretq_u64_f64(mask_.values), b.values, a.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return {base, base + static_cast(step)}; + } + static inline Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + if (count == 0) { + return a; + } else if (count >= 2) { + return b; + } else { + float64x2_t c = {b.values[0], a.values[1]}; + return c; + } + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) { + return vld1q_f64(reinterpret_cast(ptr)); + } else if (count == 1) { + float64x1_t x = vld1_f64(reinterpret_cast(ptr)); + float64x1_t z = {0.0}; + return vcombine_f64(x, z); + } 
else { + return vdupq_n_f64(0.0); + } + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + vst1q_f64(reinterpret_cast(ptr), values); + } else if (count == 1) { + vst1_f64(reinterpret_cast(ptr), vget_low_f64(values)); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + uint64x2_t cmpReg = vceqzq_f64(values); + uint64x2_t mask = {1, 2}; + uint64x2_t res = vandq_u64(cmpReg, mask); + return res[0] | res[1]; + } + Vectorized isnan() const { + // NaN check + return vreinterpretq_f64_u32( + vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(values, values)))); + } + bool has_inf_nan() const { + Vectorized x = vsubq_f64(values, values); + float64x2_t r = x.isnan(); + uint64x2_t u = vreinterpretq_u64_f64(r); + return u[0] | u[1]; + } + Vectorized map(double (*f)(double)) const { + float64x2_t result; + result[0] = f(values[0]); + result[1] = f(values[1]); + return result; + } + Vectorized map2( + const Vectorized& second, + double (*const f)(double, double)) const { + float64x2_t result; + result[0] = f(values[0], second.values[0]); + result[1] = f(values[1], second.values[1]); + return result; + } + Vectorized abs() const { + return vabsq_f64(values); + } + Vectorized angle() const { + auto zero = Vectorized(0.0); + auto pi = Vectorized(c10::pi); + auto tmp = blendv(zero, pi, vreinterpretq_f64_u64(vcltzq_f64(values))); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized(0.0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return USE_SLEEF( + Vectorized(Sleef_acosd2_u10(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshd2_u10(values)), map(std::acosh)); + } + Vectorized asin() const { 
+ return USE_SLEEF( + Vectorized(Sleef_asind2_u10(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhd2_u10(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atand2_u10(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhd2_u10(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2d2_u10(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::atan2(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized copysign(const Vectorized& sign) const { + USE_SLEEF( + { return Vectorized(Sleef_copysignd2(values, sign)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erfd2_u10(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcd2_u15(values)), map(std::erfc)); + } + Vectorized exp() const { + return USE_SLEEF( + Vectorized(Sleef_expd2_u10(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2d2_u10(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1d2_u10(values)), map(std::expm1)); + } + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmodd2(values, q)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_q[size()]; + store(tmp); + q.store(tmp_q); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::fmod(tmp[i], tmp_q[i]); + } + return loadu(tmp); + })} Vectorized hypot(const 
Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotd2_u05(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { + return map(calc_i0); + } + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterd2(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::nextafter(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logd2_u10(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2d2_u10(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10d2_u10(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pd2_u10(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sind2_u10(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhd2_u10(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosd2_u10(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshd2_u10(values)), map(std::cosh)); + } + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powd2_u10(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. 
+ // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized tan() const { + return USE_SLEEF( + Vectorized(Sleef_tand2_u10(values)), map(std::tan)); + } + Vectorized tanh() const { + return USE_SLEEF( + Vectorized(Sleef_tanhd2_u10(values)), map(std::tanh)); + } + Vectorized lgamma() const { + return USE_SLEEF( + Vectorized(Sleef_lgammad2_u10(values)), map(std::lgamma)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized ceil() const { + return vrndpq_f64(values); + } + Vectorized floor() const { + return vrndmq_f64(values); + } + Vectorized neg() const { + return vnegq_f64(values); + } + Vectorized round() const { + return vrndiq_f64(values); + } + Vectorized trunc() const { + return vrndq_f64(values); + } + Vectorized sqrt() const { + return vsqrtq_f64(values); + } + Vectorized reciprocal() const { + return vdivq_f64(vdupq_n_f64(1.0), values); + } + Vectorized rsqrt() const { + return vdivq_f64(vdupq_n_f64(1.0), vsqrtq_f64(values)); + } + double reduce_add() const { + return vaddvq_f64(values); + } + double reduce_max() const { + return vmaxvq_f64(values); + } + Vectorized operator==(const Vectorized& other) const { + return Vectorized( + 
vreinterpretq_f64_u64(vceqq_f64(values, other.values))); + } + + Vectorized operator!=(const Vectorized& other) const { + float64x2_t r0 = vreinterpretq_f64_u32( + vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(values, other.values)))); + return Vectorized(r0); + } + + Vectorized operator<(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f64_u64(vcltq_f64(values, other.values))); + } + + Vectorized operator<=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f64_u64(vcleq_f64(values, other.values))); + } + + Vectorized operator>(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f64_u64(vcgtq_f64(values, other.values))); + } + + Vectorized operator>=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f64_u64(vcgeq_f64(values, other.values))); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return vaddq_f64(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return vsubq_f64(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_f64(a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return vdivq_f64(a, b); +} + +// frac. Implement this here so we can use subtraction +Vectorized inline Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_f64(a, b); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_f64(a, b); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return vminq_f64(max, vmaxq_f64(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return vminq_f64(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return vmaxq_f64(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return vreinterpretq_f64_u64( + vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return vreinterpretq_f64_u64( + vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return vreinterpretq_f64_u64( + veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b))); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& 
other) const { + return (*this <= other) & Vectorized(1.0); +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return vfmaq_f64(c, a, b); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return vfmsq_f64(c, a, b); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return vfmaq_f64(vnegq_f64(c), a, b); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return vfmsq_f64(vnegq_f64(c), a, b); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_float_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_float_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..c6f047f86fc4f62fc82e24506f688e7d39a92214 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -0,0 +1,661 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include + +#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) +#include +#endif + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + +// Sleef offers vectorized versions of some transcedentals +// such as sin, cos, tan etc.. +// However for now opting for STL, since we are not building +// with Sleef for mobile yet. 
+ +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Right now contains only aarch64 implementation. +// Due to follow two reasons aarch32 is not currently supported. +// 1. Due to difference in ISA been aarch32 and aarch64, intrinsics +// that work for aarch64 dont work for aarch32. +// 2. Android NDK r21 has problems with compiling aarch32. +// Clang seg faults. +// https://github.com/android/ndk/issues/1248 +// https://bugs.llvm.org/show_bug.cgi?id=45824 +// Most likely we will do aarch32 support with inline asm. +#if defined(__aarch64__) + +#ifdef __BIG_ENDIAN__ +#error "Big endian is not supported." +#endif + +#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) +#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code +#else +#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code +#endif + +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res); +}; + +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res) { + return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index); + } +}; + +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res) { + return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index); + } +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + float32x4_t values; + + public: + using value_type = float; + using size_type = int; + static constexpr size_type size() { + return 4; + } + Vectorized() { + values = vmovq_n_f32(0); + } + Vectorized(float32x4_t v) : values(v) {} + Vectorized(float val) : values{vdupq_n_f32(val)} {} + Vectorized(float val0, float val1, float val2, float val3) + : values{val0, val1, val2, val3} {} + Vectorized(float (&arr)[4]) : Vectorized(arr[0], arr[1], arr[2], arr[3]) {} + operator float32x4_t() const { + 
return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + Vectorized vec; + vec.values = BlendRegs < 0, + (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 1, + (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 2, + (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 3, + (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values); + return vec; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // TODO + // NB: This requires that each value, i.e., each uint value, + // of the mask either all be zeros or all be 1s. + // We perhaps need some kind of an assert? + // But that will affect performance. + Vectorized vec(mask.values); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const Vectorized step_sizes(0, 1, 2, 3); + return fmadd(step_sizes, step_vec, base_vec); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + case 2: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + case 3: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + 
vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) { + return vld1q_f32(reinterpret_cast(ptr)); + } else { + __at_align__ float tmp_values[size()]; + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(float)); + return vld1q_f32(reinterpret_cast(tmp_values)); + } + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + vst1q_f32(reinterpret_cast(ptr), values); + } else { + float tmp_values[size()]; + vst1q_f32(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(float)); + } + } + // Very slow implementation of indexing. + // Only required because vec256_qint refers to this. + // Once we specialize that implementation for ARM + // this should be removed. TODO (kimishpatel) + float operator[](int idx) const { + __at_align__ float tmp[size()]; + store(tmp); + return tmp[idx]; + } + float operator[](int idx) { + __at_align__ float tmp[size()]; + store(tmp); + return tmp[idx]; + } + int zero_mask() const { + uint32x4_t is_zero_vec = vceqzq_f32(values); + const int32x4_t shift = vcombine_s32( + vcreate_s32(0x0 | (int64_t(0x1) << 32)), + vcreate_s32(0x2 | (int64_t(0x3) << 32))); + uint32x4_t bits_vec = + vshlq_u32(vandq_u32(is_zero_vec, vdupq_n_u32(1)), shift); + return vaddvq_u32(bits_vec); + } + Vectorized isnan() const { + return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, values))); + } + bool has_inf_nan() const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } + Vectorized map(float (*const f)(float)) const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } 
+ return loadu(tmp); + } + Vectorized map2( + const Vectorized& second, + float (*const f)(float, float)) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_second[size()]; + store(tmp); + second.store(tmp_second); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i], tmp_second[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + return Vectorized(vabsq_f32(values)); + } + Vectorized angle() const { + auto zero = Vectorized(0); + auto pi = Vectorized(c10::pi); + auto tmp = blendv(zero, pi, *this < zero); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized(0.f); + } + Vectorized conj() const { + return *this; + } +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, sleef_name) \ + Vectorized name() const { \ + return USE_SLEEF(Vectorized(sleef_name(values)), map(std::name)); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, Sleef_##name##f4_u10) + + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acos) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acosh) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(asin) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(asinh) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atan) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atanh) + +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, sleef_name) \ + Vectorized name(const Vectorized& arg) const { \ + return USE_SLEEF( \ + Vectorized(sleef_name(values, arg.values)), \ + map2(arg, std::name)); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, Sleef_##name##f4_u10) + + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(atan2) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + 
copysign, + Sleef_copysignf4) + Vectorized erf() const; + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + erfc, + Sleef_erfcf4_u15) + Vectorized erfinv() const { + return map(calc_erfinv); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1) + // Implementation copied from Arm Optimized Routine + // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c + inline Vectorized vexpq_f32_u20() const { + // bail out to sleef if it's a special case: + // i.e. there's an input s.t. |input| > 87.3.... + const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); + uint32x4_t cmp = vcagtq_f32(values, special_bound); + if (vpaddd_u64(vreinterpretq_u64_u32(cmp)) != 0) { + return exp(); + } + + const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); + const float ln2_hi = 0x1.62e4p-1f; + const float ln2_lo = 0x1.7f7d1cp-20f; + const float c0 = 0x1.0e4020p-7f; + const float c2 = 0x1.555e66p-3f; + const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; + + const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); + const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); + const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); + const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ + + float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); + float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); + r = vfmsq_laneq_f32(r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); + float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); + + float32x4_t r2 = vmulq_f32(r, r); + float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); + q = vfmaq_f32(q, p, r2); + p = vmulq_f32(c4, r); + float32x4_t poly = vfmaq_f32(p, q, r2); + + return vfmaq_f32(scale, poly, scale); + } + Vectorized exp_u20() const { + return vexpq_f32_u20(); + } + Vectorized fexp_u20() const { + return exp_u20(); + } + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + fmod, + Sleef_fmodf4) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + hypot, + Sleef_hypotf4_u05) + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + return map2(x, calc_igamma); + } + Vectorized igammac(const Vectorized& x) const { + return map2(x, calc_igammac); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log10) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log1p) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log2) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + nextafter, + Sleef_nextafterf4) + Vectorized frac() const; + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sin) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sinh) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(cos) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(cosh) + Vectorized ceil() const { + return map(at::native::ceil_impl); + } + Vectorized floor() const { + return map(at::native::floor_impl); + } + Vectorized neg() const { + return 
Vectorized(vnegq_f32(values)); + } + Vectorized round() const { + // We do not use std::round because we would like to round midway numbers to + // the nearest even integer. + return map(at::native::round_impl); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tan) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tanh) + Vectorized trunc() const { + return Vectorized(vrndq_f32(values)); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(lgamma) + Vectorized sqrt() const { + return Vectorized(vsqrtq_f32(values)); + } + Vectorized reciprocal() const { + return Vectorized(vdivq_f32(vdupq_n_f32(1.0f), values)); + } + Vectorized rsqrt() const { + return this->sqrt().reciprocal(); + } + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(pow) + Vectorized operator==(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vceqq_f32(values, other.values))); + } + + Vectorized operator!=(const Vectorized& other) const { + float32x4_t r0 = + vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, other.values))); + return Vectorized(r0); + } + + Vectorized operator<(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcltq_f32(values, other.values))); + } + + Vectorized operator<=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcleq_f32(values, other.values))); + } + + Vectorized operator>(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcgtq_f32(values, other.values))); + } + + Vectorized operator>=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcgeq_f32(values, other.values))); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + 
const Vectorized& b) { + return Vectorized(vaddq_f32(a, b)); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vsubq_f32(a, b)); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vmulq_f32(a, b)); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vdivq_f32(a, b)); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vmaxq_f32(a, b)); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vminq_f32(a, b)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} 
+ +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized(vfmaq_f32(c, a, b)); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized(vfmsq_f32(c, a, b)); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized(vnegq_f32(vfmaq_f32(c, a, b))); +} + +inline Vectorized Vectorized::erf() const { + // constants + const Vectorized neg_zero_vec(-0.f); + const Vectorized one_vec(1.0f); + const Vectorized p(0.3275911f); + const Vectorized p1(0.254829592f); + const Vectorized p2(-0.284496736f); + const Vectorized p3(1.421413741f); + const Vectorized p4(-1.453152027f); + const Vectorized p5(1.061405429f); + // sign(x) + auto sign_mask = neg_zero_vec & *this; + auto abs_vec = this->abs(); + // t = 1 / (p * abs(x) + 1) + auto tmp0 = fmadd(p, abs_vec, one_vec); + auto t = one_vec / tmp0; + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = fmadd(p5, t, 
p4); + auto tmp2 = fmadd(tmp1, t, p3); + auto tmp3 = fmadd(tmp2, t, p2); + auto r = fmadd(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = (*this) * (*this); + auto neg_pow_2 = pow_2 ^ neg_zero_vec; + auto tmp4 = neg_pow_2.vexpq_f32_u20(); + auto tmp5 = tmp4 ^ neg_zero_vec; + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = t * tmp5; + auto tmp7 = fmadd(tmp6, r, one_vec); + return tmp7 ^ sign_mask; +} +#undef DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC +#undef DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC +#endif /* defined(aarch64) */ + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_int_aarch64.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_int_aarch64.h new file mode 100644 index 0000000000000000000000000000000000000000..7d5a95e2fc54ae704bb019f50ae8347a6be93938 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_int_aarch64.h @@ -0,0 +1,799 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. 
+inline namespace CPU_CAPABILITY { + +#define VEC_INT_NEON_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + using neon_type = int##bit##x##vl##_t; \ + \ + private: \ + neon_type values; \ + \ + public: \ + using value_type = int##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() { \ + values = vdupq_n_s##bit(0); \ + } \ + Vectorized(neon_type v) : values(v) {} \ + Vectorized(int##bit##_t val); \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... vals) { \ + __at_align__ int##bit##_t buffer[size()] = {vals...}; \ + values = vld1q_s##bit(buffer); \ + } \ + operator neon_type() const { \ + return values; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + int64_t count = size()); \ + void store(void* ptr, int64_t count = size()) const; \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b); \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + return vbslq_s##bit(vreinterpretq_u##bit##_s##bit(mask_.values), b, a); \ + } \ + template \ + static Vectorized arange( \ + value_type base = 0, \ + step_t step = static_cast(1)); \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + int64_t count = size()); \ + const int##bit##_t& operator[](int idx) const = delete; \ + int##bit##_t& operator[](int idx) = delete; \ + Vectorized abs() const { \ + return vabsq_s##bit(values); \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return vdupq_n_s##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized neg() const { \ + return vnegq_s##bit(values); \ + } \ + int##bit##_t reduce_add() const { \ + return vaddvq_s##bit(values); \ + } \ + int##bit##_t reduce_max() 
const; \ + Vectorized operator==( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vceqq_s##bit(values, other.values))); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const; \ + Vectorized operator<( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcltq_s##bit(values, other.values))); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcleq_s##bit(values, other.values))); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcgtq_s##bit(values, other.values))); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcgeq_s##bit(values, other.values))); \ + } \ + Vectorized eq(const Vectorized& other) const; \ + Vectorized ne(const Vectorized& other) const; \ + Vectorized gt(const Vectorized& other) const; \ + Vectorized ge(const Vectorized& other) const; \ + Vectorized lt(const Vectorized& other) const; \ + Vectorized le(const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return vaddq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return vsubq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return vandq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return vorrq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return veorq_s##bit(a, b); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return (*this == other) & Vectorized(1); \ + } \ + 
Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } + +VEC_INT_NEON_TEMPLATE(2, 64) +VEC_INT_NEON_TEMPLATE(4, 32) +VEC_INT_NEON_TEMPLATE(8, 16) +VEC_INT_NEON_TEMPLATE(16, 8) + +inline int32_t Vectorized::reduce_max() const { + return vmaxvq_s32(values); +} + +inline int16_t Vectorized::reduce_max() const { + return vmaxvq_s16(values); +} + +inline int8_t Vectorized::reduce_max() const { + return vmaxvq_s8(values); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s32(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s16(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s8(a, b); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + int64x2_t val = a; + return ~val; +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s32(a); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s16(a); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s8(a); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized 
Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s32(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s16(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s32(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s16(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s8(a, b); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint64x2_t maskArray = { + (mask & 1LL) ? 0xFFFFFFFFFFFFFFFF : 0, + (mask & 2LL) ? 0xFFFFFFFFFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s64(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint32x4_t maskArray = { + (mask & 1LL) ? 0xFFFFFFFF : 0, + (mask & 2LL) ? 0xFFFFFFFF : 0, + (mask & 4LL) ? 0xFFFFFFFF : 0, + (mask & 8LL) ? 
0xFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s32(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint16x8_t maskArray = { + (mask & 1LL) ? 0xFFFF : 0, + (mask & 2LL) ? 0xFFFF : 0, + (mask & 4LL) ? 0xFFFF : 0, + (mask & 8LL) ? 0xFFFF : 0, + (mask & 16LL) ? 0xFFFF : 0, + (mask & 32LL) ? 0xFFFF : 0, + (mask & 64LL) ? 0xFFFF : 0, + (mask & 128LL) ? 0xFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s16(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + (mask & 1LL) ? 0xFF : 0, + (mask & 2LL) ? 0xFF : 0, + (mask & 4LL) ? 0xFF : 0, + (mask & 8LL) ? 0xFF : 0, + (mask & 16LL) ? 0xFF : 0, + (mask & 32LL) ? 0xFF : 0, + (mask & 64LL) ? 0xFF : 0, + (mask & 128LL) ? 0xFF : 0, + (mask & 256LL) ? 0xFF : 0, + (mask & 512LL) ? 0xFF : 0, + (mask & 1024LL) ? 0xFF : 0, + (mask & 2048LL) ? 0xFF : 0, + (mask & 4096LL) ? 0xFF : 0, + (mask & 8192LL) ? 0xFF : 0, + (mask & 16384LL) ? 0xFF : 0, + (mask & 32768LL) ? 
0xFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s8(maskArray, b.values, a.values); +} + +#define VEC_INT_NEON_OPS(vl, bit) \ + inline Vectorized::Vectorized(int##bit##_t val) { \ + values = vdupq_n_s##bit(val); \ + } \ + inline Vectorized Vectorized::loadu( \ + const void* ptr, int64_t count) { \ + if (count == size()) { \ + return vld1q_s##bit(reinterpret_cast(ptr)); \ + } else { \ + __at_align__ int##bit##_t tmp_values[size()]; \ + for (const auto i : c10::irange(size())) { \ + tmp_values[i] = 0; \ + } \ + std::memcpy( \ + tmp_values, \ + reinterpret_cast(ptr), \ + count * sizeof(int##bit##_t)); \ + return vld1q_s##bit(reinterpret_cast(tmp_values)); \ + } \ + } \ + inline void Vectorized::store(void* ptr, int64_t count) \ + const { \ + if (count == size()) { \ + vst1q_s##bit(reinterpret_cast(ptr), values); \ + } else { \ + int##bit##_t tmp_values[size()]; \ + vst1q_s##bit(reinterpret_cast(tmp_values), values); \ + std::memcpy(ptr, tmp_values, count * sizeof(int##bit##_t)); \ + } \ + } + +VEC_INT_NEON_OPS(2, 64) +VEC_INT_NEON_OPS(4, 32) +VEC_INT_NEON_OPS(8, 16) +VEC_INT_NEON_OPS(16, 8) + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return x * y; +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return x / y; +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + int32x4_t x = a; + int32x4_t y = b; + return x / y; +} + +inline int64_t Vectorized::reduce_max() const { + return std::max(values[0], values[1]); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return {std::min(x[0], y[0]), std::min(x[1], y[1])}; +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + 
int64x2_t y = b; + return {std::max(x[0], y[0]), std::max(x[1], y[1])}; +} + +template +inline Vectorized Vectorized::arange( + int64_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int64x2_t step_sizes = {0, 1}; + return base_vec.values + step_sizes * step_vec.values; +} + +template +inline Vectorized Vectorized::arange( + int32_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int32x4_t step_sizes = {0, 1, 2, 3}; + return vmlaq_s32(base_vec, step_sizes, step_vec); +} + +template +inline Vectorized Vectorized::arange( + int16_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int16x8_t step_sizes = {0, 1, 2, 3, 4, 5, 6, 7}; + return vmlaq_s16(base_vec, step_sizes, step_vec); +} + +template +inline Vectorized Vectorized::arange(int8_t base, step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int8x16_t step_sizes = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + return vmlaq_s8(base_vec, step_sizes, step_vec); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + uint64x2_t u = vreinterpretq_u64_s64(y); + uint64x2_t z = {std::min(u[0], (uint64_t)63), std::min(u[1], (uint64_t)63)}; + return x >> vreinterpretq_s64_u64(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int32x4_t x = a; + int32x4_t y = b; + uint32x4_t bound = vdupq_n_u32(31); + uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound); + return x >> vreinterpretq_s32_u32(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int16x8_t x = a; + int16x8_t y = b; + uint16x8_t bound = vdupq_n_u16(15); + uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound); + return x >> vreinterpretq_s16_u16(z); +} + +template <> 
+Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int8x16_t x = a; + int8x16_t y = b; + uint8x16_t bound = vdupq_n_u8(7); + int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound)); + return x >> z; +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int64x2_t y = b; + uint64x2_t u = vreinterpretq_u64_s64(y); + uint64x2_t z = {std::min(u[0], (uint64_t)64), std::min(u[1], (uint64_t)64)}; + return vshlq_s64(a, vreinterpretq_s64_u64(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int32x4_t y = b; + uint32x4_t bound = vdupq_n_u32(32); + uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound); + return vshlq_s32(a, vreinterpretq_s32_u32(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int16x8_t y = b; + uint16x8_t bound = vdupq_n_u16(16); + uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound); + return vshlq_s16(a, vreinterpretq_s16_u16(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int8x16_t y = b; + uint8x16_t bound = vdupq_n_u8(8); + int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound)); + return vshlq_s8(a, z); +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 2) { + return b; + } else { + int64x2_t c = {b.values[0], a.values[1]}; + return c; + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 4) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint32x4_t maskArray = { + (count >= 1LL) ? 0xFFFFFFFF : 0, + (count >= 2LL) ? 0xFFFFFFFF : 0, + (count >= 3LL) ? 
0xFFFFFFFF : 0, + 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s32(maskArray, b.values, a.values); + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 8) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint16x8_t maskArray = { + static_cast((count >= 1LL) ? 0xFFFF : 0), + static_cast((count >= 2LL) ? 0xFFFF : 0), + static_cast((count >= 3LL) ? 0xFFFF : 0), + static_cast((count >= 4LL) ? 0xFFFF : 0), + static_cast((count >= 5LL) ? 0xFFFF : 0), + static_cast((count >= 6LL) ? 0xFFFF : 0), + static_cast((count >= 7LL) ? 0xFFFF : 0), + 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s16(maskArray, b.values, a.values); + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 16) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + static_cast((count >= 1LL) ? 0xFF : 0), + static_cast((count >= 2LL) ? 0xFF : 0), + static_cast((count >= 3LL) ? 0xFF : 0), + static_cast((count >= 4LL) ? 0xFF : 0), + static_cast((count >= 5LL) ? 0xFF : 0), + static_cast((count >= 6LL) ? 0xFF : 0), + static_cast((count >= 7LL) ? 0xFF : 0), + static_cast((count >= 8LL) ? 0xFF : 0), + static_cast((count >= 9LL) ? 0xFF : 0), + static_cast((count >= 10LL) ? 0xFF : 0), + static_cast((count >= 11LL) ? 0xFF : 0), + static_cast((count >= 12LL) ? 0xFF : 0), + static_cast((count >= 13LL) ? 0xFF : 0), + static_cast((count >= 14LL) ? 0xFF : 0), + static_cast((count >= 15LL) ? 
0xFF : 0), + 0}; + + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s8(maskArray, b.values, a.values); + } +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + Vectorized highBitsA = vmovl_high_s16(a); + Vectorized highBitsB = vmovl_high_s16(b); + Vectorized lowBitsA = vmovl_s16(vget_low_s16(a)); + Vectorized lowBitsB = vmovl_s16(vget_low_s16(b)); + int32x4_t highBitsResult = highBitsA / highBitsB; + int32x4_t lowBitsResult = lowBitsA / lowBitsB; + return vuzp1q_s16( + vreinterpretq_s16_s32(lowBitsResult), + vreinterpretq_s16_s32(highBitsResult)); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + Vectorized highBitsA = vmovl_high_s8(a); + Vectorized highBitsB = vmovl_high_s8(b); + Vectorized lowBitsA = vmovl_s8(vget_low_s8(a)); + Vectorized lowBitsB = vmovl_s8(vget_low_s8(b)); + int16x8_t highBitsResult = highBitsA / highBitsB; + int16x8_t lowBitsResult = lowBitsA / lowBitsB; + return vuzp1q_s8( + vreinterpretq_s8_s16(lowBitsResult), + vreinterpretq_s8_s16(highBitsResult)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, 
a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_uint_aarch64.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_uint_aarch64.h new file mode 100644 index 0000000000000000000000000000000000000000..f8c811704314cceb401a0ed793a219332977fded --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_uint_aarch64.h @@ -0,0 +1,383 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. 
We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#define VEC_UINT_NEON_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + using neon_type = uint##bit##x##vl##_t; \ + \ + private: \ + neon_type values; \ + \ + public: \ + using value_type = uint##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() { \ + values = vdupq_n_u##bit(0); \ + } \ + Vectorized(neon_type v) : values(v) {} \ + Vectorized(uint##bit##_t val); \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... vals) { \ + __at_align__ uint##bit##_t buffer[size()] = {vals...}; \ + values = vld1q_u##bit(buffer); \ + } \ + operator neon_type() const { \ + return values; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + uint64_t count = size()); \ + void store(void* ptr, uint64_t count = size()) const; \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b); \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + return vbslq_u##bit(mask_.values, b, a); \ + } \ + template \ + static Vectorized arange( \ + value_type base = 0, \ + step_t step = static_cast(1)); \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + uint64_t count = size()); \ + const uint##bit##_t& operator[](uint idx) const = delete; \ + uint##bit##_t& operator[](uint idx) = delete; \ + Vectorized abs() const { \ + return values; \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return vdupq_n_u##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized neg() const { \ + return vreinterpretq_u##bit##_s##bit( \ 
+ vnegq_s##bit(vreinterpretq_s##bit##_u##bit(values))); \ + } \ + uint##bit##_t reduce_add() const { \ + return vaddvq_u##bit(values); \ + } \ + uint##bit##_t reduce_max() const; \ + Vectorized operator==( \ + const Vectorized& other) const { \ + return Vectorized(vceqq_u##bit(values, other.values)); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const; \ + Vectorized operator<( \ + const Vectorized& other) const { \ + return Vectorized(vcltq_u##bit(values, other.values)); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + return Vectorized(vcleq_u##bit(values, other.values)); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + return Vectorized(vcgtq_u##bit(values, other.values)); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + return Vectorized(vcgeq_u##bit(values, other.values)); \ + } \ + Vectorized eq( \ + const Vectorized& other) const; \ + Vectorized ne( \ + const Vectorized& other) const; \ + Vectorized gt( \ + const Vectorized& other) const; \ + Vectorized ge( \ + const Vectorized& other) const; \ + Vectorized lt( \ + const Vectorized& other) const; \ + Vectorized le( \ + const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, \ + const Vectorized& b) { \ + return vaddq_u##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, \ + const Vectorized& b) { \ + return vsubq_u##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, \ + const Vectorized& b) { \ + return vandq_u##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, \ + const Vectorized& b) { \ + return vorrq_u##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, \ + const Vectorized& b) { \ + return veorq_u##bit(a, b); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return 
(*this == other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } + +VEC_UINT_NEON_TEMPLATE(16, 8) + +inline uint8_t Vectorized::reduce_max() const { + return vmaxvq_u8(values); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_u8(a, b); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_u8(a); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_u8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_u8(a, b); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + (mask & 1LL) ? 0xFF : 0, + (mask & 2LL) ? 0xFF : 0, + (mask & 4LL) ? 0xFF : 0, + (mask & 8LL) ? 0xFF : 0, + (mask & 16LL) ? 0xFF : 0, + (mask & 32LL) ? 0xFF : 0, + (mask & 64LL) ? 0xFF : 0, + (mask & 128LL) ? 0xFF : 0, + (mask & 256LL) ? 0xFF : 0, + (mask & 512LL) ? 0xFF : 0, + (mask & 1024LL) ? 0xFF : 0, + (mask & 2048LL) ? 0xFF : 0, + (mask & 4096LL) ? 0xFF : 0, + (mask & 8192LL) ? 0xFF : 0, + (mask & 16384LL) ? 
0xFF : 0, + (mask & 32768LL) ? 0xFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_u8(maskArray, b.values, a.values); +} + +#define VEC_UINT_NEON_OPS(vl, bit) \ + inline Vectorized::Vectorized(uint##bit##_t val) { \ + values = vdupq_n_u##bit(val); \ + } \ + inline Vectorized Vectorized::loadu( \ + const void* ptr, uint64_t count) { \ + if (count == size()) { \ + return vld1q_u##bit(reinterpret_cast(ptr)); \ + } else { \ + __at_align__ uint##bit##_t tmp_values[size()]; \ + for (const auto i : c10::irange(size())) { \ + tmp_values[i] = 0; \ + } \ + std::memcpy( \ + tmp_values, \ + reinterpret_cast(ptr), \ + count * sizeof(uint##bit##_t)); \ + return vld1q_u##bit(reinterpret_cast(tmp_values)); \ + } \ + } \ + inline void Vectorized::store(void* ptr, uint64_t count) \ + const { \ + if (count == size()) { \ + vst1q_u##bit(reinterpret_cast(ptr), values); \ + } else { \ + uint##bit##_t tmp_values[size()]; \ + vst1q_u##bit(reinterpret_cast(tmp_values), values); \ + std::memcpy(ptr, tmp_values, count * sizeof(uint##bit##_t)); \ + } \ + } + +VEC_UINT_NEON_OPS(16, 8) + +template +inline Vectorized Vectorized::arange( + uint8_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const uint8x16_t step_sizes = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + return vmlaq_u8(base_vec, step_sizes, step_vec); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + uint8x16_t x = a; + uint8x16_t bound = vdupq_n_u8(8); + uint8x16_t z = vminq_u8(b, bound); + return x >> z; +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + uint8x16_t bound = vdupq_n_u8(8); + uint8x16_t z = vminq_u8(b, bound); + return vshlq_u8(a, vreinterpretq_s8_u8(z)); +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + uint64_t count) { + if (count == 0) { + return a; + } else if (count >= 
16) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + static_cast((count >= 1LL) ? 0xFF : 0), + static_cast((count >= 2LL) ? 0xFF : 0), + static_cast((count >= 3LL) ? 0xFF : 0), + static_cast((count >= 4LL) ? 0xFF : 0), + static_cast((count >= 5LL) ? 0xFF : 0), + static_cast((count >= 6LL) ? 0xFF : 0), + static_cast((count >= 7LL) ? 0xFF : 0), + static_cast((count >= 8LL) ? 0xFF : 0), + static_cast((count >= 9LL) ? 0xFF : 0), + static_cast((count >= 10LL) ? 0xFF : 0), + static_cast((count >= 11LL) ? 0xFF : 0), + static_cast((count >= 12LL) ? 0xFF : 0), + static_cast((count >= 13LL) ? 0xFF : 0), + static_cast((count >= 14LL) ? 0xFF : 0), + static_cast((count >= 15LL) ? 0xFF : 0), + 0}; + + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_u8(maskArray, b.values, a.values); + } +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + uint8x16_t x = a; + uint8x16_t y = b; + return x / y; +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h new file mode 100644 index 0000000000000000000000000000000000000000..6745dd7eb2a1f371b45d5e21fe2f52276cf864db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h @@ -0,0 +1,435 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include + +#include +#if !( \ + defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \ + defined(CPU_CAPABILITY_ZVECTOR)) +#if defined(CPU_CAPABILITY_SVE256) +#include +#else +// clang-format off +#include +#include +#include +#include +#endif +#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16) +#include +#endif +#include +#include +#include +// clang-format on +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +#include +#else +// clang-format off +#include +#include +#include +// clang-format on +#endif + +#include +#include + +#include +#include +#include +#include +#include + +namespace at::vec { + +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. 
+inline namespace CPU_CAPABILITY { + +inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) { + stream << val.val_; + return stream; +} +inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) { + stream << static_cast(val.val_); + return stream; +} +inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) { + stream << static_cast(val.val_); + return stream; +} + +template +std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { + T buf[Vectorized::size()]; + vec.store(buf); + stream << "vec["; + for (int i = 0; i != Vectorized::size(); i++) { + if (i != 0) { + stream << ", "; + } + stream << buf[i]; + } + stream << ']'; + return stream; +} + +#if defined(CPU_CAPABILITY_AVX2) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +inline Vectorized cast(const Vectorized& src) { + return _mm256_castpd_ps(src); +} + +template <> +inline Vectorized cast(const Vectorized& src) { + return _mm256_castps_pd(src); +} + +template <> +inline Vectorized cast(const Vectorized& src) { + return _mm256_castsi256_ps(src); +} + +template <> +inline Vectorized cast( + const Vectorized& src) { + return _mm256_castsi256_pd(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. 
+template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); +} +#endif +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex, + Vectorized& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex, + Vectorized& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); +} +#endif +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm256_cvttps_epi32(src); +} + +// From: https://stackoverflow.com/a/41148578 +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 */ + __m256i magic_i_hi32 = + _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 */ + __m256i 
magic_i_all = + _mm256_set1_epi64x(0x4530000080100000); /* 2^84 + 2^63 + 2^52 */ + __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all); + + __m256i v_lo = _mm256_blend_epi32( + magic_i_lo, src, 0b01010101); /* v_low = low32 + 2^52 */ + __m256i v_hi = _mm256_srli_epi64(src, 32); + v_hi = _mm256_xor_si256( + v_hi, magic_i_hi32); /* v_hi = high32*2^32 + 2^84 + 2^63 */ + /* int64 = low32 + high32*2^32 = v_hi + v_lo - 2^52 - 2^63 - 2^84 */ + __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); + __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); + return result; +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + return _mm256_cvtepi32_ps(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3} + // b = {b0, b1, b2, b3} + + // swap lanes: + // a_swapped = {a0, a1, b0, b1} + // b_swapped = {a2, a3, b2, b3} + auto a_swapped = + _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2. 4 bits apart + auto b_swapped = + _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3. 4 bits apart + + // group cols crossing lanes: + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + return std::make_pair( + _mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3 + _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3 +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + + // swap lanes: + // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + auto a_swapped = + _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2. 
4 bits apart + auto b_swapped = + _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3. 4 bits apart + + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + return std::make_pair( + _mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + + // group cols crossing lanes: + // a_grouped = {a0, a1, b0, b1} + // b_grouped = {a2, a3, b2, b3} + auto a_grouped = _mm256_permute4x64_pd(a, 0b11011000); // 0, 2, 1, 3 + auto b_grouped = _mm256_permute4x64_pd(b, 0b11011000); // 0, 2, 1, 3 + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + return std::make_pair( + _mm256_permute2f128_pd( + a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart + _mm256_permute2f128_pd( + a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + + // group cols crossing lanes: + // a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl); + auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + return std::make_pair( + _mm256_permute2f128_ps( + a_grouped, b_grouped, 0b0100000), // 0, 2. 
4 bits apart + _mm256_permute2f128_ps( + a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + return _mm256_permutevar8x32_ps(v, mask_float); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return _mm256_permute4x64_epi64(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + return _mm256_permutevar8x32_epi32(v, mask_int32); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m256i mask = _mm256_set_epi8( + 1, + 0, + 3, + 2, + 5, + 4, + 7, + 6, + 9, + 8, + 11, + 10, + 13, + 12, + 15, + 14, + 1, + 0, + 3, + 2, + 5, + 4, + 7, + 6, + 9, + 8, + 11, + 10, + 13, + 12, + 15, + 14); + auto reversed = _mm256_shuffle_epi8(v, mask); + return _mm256_permute2x128_si256(reversed, reversed, 1); +} + +inline __m256i flip8(const __m256i& v) { + const __m256i mask_int8 = _mm256_set_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); + auto reversed = _mm256_shuffle_epi8(v, mask_int8); + return _mm256_permute2x128_si256(reversed, reversed, 1); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +inline Vectorized operator&&( + const Vectorized& self, + const Vectorized& other) { + const __m256i* self_ = reinterpret_cast(self.as_bytes()); + const __m256i* other_ = reinterpret_cast(other.as_bytes()); + __m256i out = 
_mm256_and_si256(*self_, *other_); + Vectorized ret; + std::memcpy(ret, &out, ret.size() * sizeof(bool)); + return ret; +} + +#endif // (defined(CPU_CAPABILITY_AVX2) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_16bit_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_16bit_float.h new file mode 100644 index 0000000000000000000000000000000000000000..2a585884e36ebdb20ef32ef8dc0e9f82d02895ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_16bit_float.h @@ -0,0 +1,837 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +// Used for shared functions and classes for vec256_bfloat16.h and +// vec256_half.h. Any functions/classes that are common between those two files +// should be defined here. Any non-shared functions/classes should be defined in +// the respective files. 
+ +#include +#include + +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +#ifndef SLEEF_CONST +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define SLEEF_CONST const +#else +#define SLEEF_CONST +#endif +#define SLEEF_CONST_OLD SLEEF_CONST +#else +#define SLEEF_CONST_OLD +#endif + +// bfloat16 conversion +static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { + o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16)); +} + +static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) { + __m128i lo = _mm256_extractf128_si256(a, 0); + __m128i hi = _mm256_extractf128_si256(a, 1); + cvtbf16_fp32(lo, o1); + cvtbf16_fp32(hi, o2); +} + +static inline __m128i cvtfp32_bf16(const __m256& src) { + __m256i value = _mm256_castps_si256(src); + __m256i nan = _mm256_set1_epi32(0xffff); + __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(src, src, _CMP_ORD_Q)); + __m256i ones = _mm256_set1_epi32(0x1); + __m256i vec_bias = _mm256_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm256_and_si256(_mm256_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm256_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm256_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm256_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm256_blendv_epi8(nan, t_value, mask); + t_value = + _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] + t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11 01 10 00 + return _mm256_castsi256_si128(t_value); +} + +static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { + __m256i lo = _mm256_castps_si256(a); + __m256i hi = _mm256_castps_si256(b); + __m256i nan = 
_mm256_set1_epi32(0xffff); + __m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q)); + __m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q)); + __m256i ones = _mm256_set1_epi32(0x1); + __m256i vec_bias = _mm256_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_lo = _mm256_and_si256(_mm256_srli_epi32(lo, 16), ones); + auto t_hi = _mm256_and_si256(_mm256_srli_epi32(hi, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_lo = _mm256_add_epi32(t_lo, vec_bias); + t_hi = _mm256_add_epi32(t_hi, vec_bias); + // input += rounding_bias; + t_lo = _mm256_add_epi32(t_lo, lo); + t_hi = _mm256_add_epi32(t_hi, hi); + // input = input >> 16; + t_lo = _mm256_srli_epi32(t_lo, 16); + t_hi = _mm256_srli_epi32(t_hi, 16); + // Check NaN before converting back to bf16 + t_lo = _mm256_blendv_epi8(nan, t_lo, mask_lo); + t_hi = _mm256_blendv_epi8(nan, t_hi, mask_hi); + + t_lo = _mm256_packus_epi32( + t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] + return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11 01 10 00 +} + +static inline __m256i merge_compare_result(const __m256& a, const __m256& b) { + __m256i lo = _mm256_castps_si256(a); + __m256i hi = _mm256_castps_si256(b); + lo = _mm256_srli_epi32(lo, 16); + hi = _mm256_srli_epi32(hi, 16); + auto out = _mm256_packus_epi32(lo, hi); + return _mm256_permute4x64_epi64(out, 0xd8); +} + +// float16 conversion +static inline void cvtfp16_fp32(const __m128i& a, __m256& o) { + o = _mm256_cvtph_ps(a); +} + +static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) { + __m128i lo = _mm256_extractf128_si256(a, 0); + __m128i hi = _mm256_extractf128_si256(a, 1); + cvtfp16_fp32(lo, o1); + cvtfp16_fp32(hi, o2); +} + +static inline __m128i cvtfp32_fp16(const __m256& src) { + return _mm256_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { + __m128i lo = + _mm256_cvtps_ph(a, 
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m128i hi = + _mm256_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +// dtype conversion between float16/bfloat16 and float32 +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m128i& a, __m256& o); +template <> +inline void cvt_to_fp32(const __m128i& a, __m256& o) { + cvtbf16_fp32(a, o); +} +template <> +inline void cvt_to_fp32(const __m128i& a, __m256& o) { + cvtfp16_fp32(a, o); +} + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2); +template <> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2) { + cvtbf16_fp32(a, o1, o2); +} +template <> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2) { + cvtfp16_fp32(a, o1, o2); +} + +template < + typename T, + bool is_compare_op = false, + typename std::enable_if_t, int> = 0> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b); +template <> +inline __m256i cvt_from_fp32( + const __m256& a, + const __m256& b) { + return cvtfp32_bf16(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return merge_compare_result(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return cvtfp32_fp16(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return cvtfp32_fp16(a, b); +} + +template +class Vectorized16 { + static_assert( + is_reduced_floating_point_v, + "Support only float16 and bfloat16."); + + protected: + __m256i values; + + public: + using value_type = uint16_t; + using size_type = int; + static constexpr size_type size() { + return 16; + } + Vectorized16() {} + Vectorized16(__m256i v) : values(v) {} + Vectorized16(T val) { + value_type uw = val.x; + values = _mm256_set1_epi16(uw); + } + Vectorized16( + 
T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16) { + values = _mm256_setr_epi16( + val1.x, + val2.x, + val3.x, + val4.x, + val5.x, + val6.x, + val7.x, + val8.x, + val9.x, + val10.x, + val11.x, + val12.x, + val13.x, + val14.x, + val15.x, + val16.x); + } + operator __m256i() const { + return values; + } + T& operator[](int idx) = delete; + const T& operator[](int idx) const = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + __m256i cmp = _mm256_cmpeq_epi16(values, _mm256_set1_epi16(0)); + return _mm256_movemask_epi8(cmp); + } + static Vectorized loadu(const void* ptr, int16_t count = size()) { + if (count == size()) + return _mm256_loadu_si256(reinterpret_cast(ptr)); + + __at_align__ int16_t tmp_values[size()]; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (const auto i : c10::irange(count, size())) { + tmp_values[i] = 0; + } + std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); + return _mm256_loadu_si256(reinterpret_cast(tmp_values)); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else if (count > 0) { + __at_align__ int16_t tmp_values[size()]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); + } + } + template + static Vectorized blend(const Vectorized& a, const Vectorized& b) { + __at_align__ int16_t tmp_values[size()]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi16(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi16(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi16(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi16(b.values, 3); + if (mask & 0x10) + tmp_values[4] = 
_mm256_extract_epi16(b.values, 4); + if (mask & 0x20) + tmp_values[5] = _mm256_extract_epi16(b.values, 5); + if (mask & 0x40) + tmp_values[6] = _mm256_extract_epi16(b.values, 6); + if (mask & 0x80) + tmp_values[7] = _mm256_extract_epi16(b.values, 7); + if (mask & 0x100) + tmp_values[8] = _mm256_extract_epi16(b.values, 8); + if (mask & 0x200) + tmp_values[9] = _mm256_extract_epi16(b.values, 9); + if (mask & 0x400) + tmp_values[10] = _mm256_extract_epi16(b.values, 10); + if (mask & 0x800) + tmp_values[11] = _mm256_extract_epi16(b.values, 11); + if (mask & 0x1000) + tmp_values[12] = _mm256_extract_epi16(b.values, 12); + if (mask & 0x2000) + tmp_values[13] = _mm256_extract_epi16(b.values, 13); + if (mask & 0x4000) + tmp_values[14] = _mm256_extract_epi16(b.values, 14); + if (mask & 0x8000) + tmp_values[15] = _mm256_extract_epi16(b.values, 15); + return loadu(tmp_values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + T base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, 
b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + + // 'const' type qualifier on return type has no effect, but sleef defines this + // this way For example `Sleef_exp2f8_u10` signature is `const __m256 + // (__m256)` + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") + Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + const auto o1 = vop(lo); + const auto o2 = vop(hi); + return cvt_from_fp32(o1, o2); + } + C10_DIAGNOSTIC_POP() + Vectorized isnan() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + lo = _mm256_cmp_ps(lo, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); + hi = _mm256_cmp_ps(hi, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); + return merge_compare_result(lo, hi); + } + Vectorized abs() const { + return _mm256_andnot_si256(_mm256_set1_epi16(0x8000), values); + } + Vectorized angle() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto angle_lambda = [](__m256 values_2) { + const auto zero_vec = _mm256_set1_ps(0.f); + const auto nan_vec = _mm256_set1_ps(NAN); + const auto not_nan_mask = _mm256_cmp_ps(values_2, values_2, _CMP_EQ_OQ); + const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); + const auto pi = _mm256_set1_ps(c10::pi); + + const auto neg_mask = _mm256_cmp_ps(values_2, zero_vec, _CMP_LT_OQ); + auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); + angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); + return angle; + }; + auto o1 = angle_lambda(lo); + auto o2 = angle_lambda(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm256_set1_epi16(0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return map(Sleef_acosf8_u10); + } + Vectorized acosh() const { + return map(Sleef_acoshf8_u10); + } + 
Vectorized asin() const { + return map(Sleef_asinf8_u10); + } + Vectorized atan() const { + return map(Sleef_atanf8_u10); + } + Vectorized atanh() const { + return map(Sleef_atanhf8_u10); + } + Vectorized atan2(const Vectorized& b) const { + __m256 lo, hi; + __m256 b1, b2; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(b.values, b1, b2); + auto o1 = Sleef_atan2f8_u10(lo, b1); + auto o2 = Sleef_atan2f8_u10(hi, b2); + return cvt_from_fp32(o1, o2); + } + Vectorized copysign(const Vectorized& sign) const { + // copy sign bit (0x8000) from sign and remaining bits from values + __m256i mask_value = _mm256_set1_epi32(~0x80008000); + __m256i mask_signbit = _mm256_set1_epi32(0x80008000); + return Vectorized(_mm256_or_si256( + _mm256_and_si256(values, mask_value), + _mm256_and_si256(sign, mask_signbit))); + } + Vectorized erf() const { + return map(Sleef_erff8_u10); + } + Vectorized erfc() const { + return map(Sleef_erfcf8_u15); + } + Vectorized erfinv() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + for (int64_t i = 0; i < size() / 2; i++) { + tmp1[i] = calc_erfinv(tmp1[i]); + tmp2[i] = calc_erfinv(tmp2[i]); + } + auto o1 = _mm256_loadu_ps(tmp1); + auto o2 = _mm256_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized exp() const { + return map(Sleef_expf8_u10); + } + Vectorized exp2() const { + return map(Sleef_exp2f8_u10); + } + Vectorized expm1() const { + return map(Sleef_expm1f8_u10); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const { + __m256 x_lo, x_hi; + cvt_to_fp32(values, x_lo, x_hi); + __m256 q_lo, q_hi; + cvt_to_fp32(q.values, q_lo, q_hi); + auto o1 = Sleef_fmodf8(x_lo, q_lo); + auto o2 = Sleef_fmodf8(x_hi, q_hi); + return cvt_from_fp32(o1, o2); + } + Vectorized hypot(const Vectorized& b) 
const { + __m256 lo, hi; + __m256 b1, b2; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(b.values, b1, b2); + auto o1 = Sleef_hypotf8_u05(lo, b1); + auto o2 = Sleef_hypotf8_u05(hi, b2); + return cvt_from_fp32(o1, o2); + } + Vectorized i0() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + for (int64_t i = 0; i < size() / 2; i++) { + tmp1[i] = calc_i0(tmp1[i]); + tmp2[i] = calc_i0(tmp2[i]); + } + auto o1 = _mm256_loadu_ps(tmp1); + auto o2 = _mm256_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized i0e() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + constexpr auto sz = size(); + __at_align__ float tmp1[sz / 2], tmp2[sz / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + + for (auto i = decltype(sz){0}; i < sz / 2; i++) { + tmp1[i] = calc_i0e(tmp1[i]); + tmp2[i] = calc_i0e(tmp2[i]); + } + const auto o1 = _mm256_loadu_ps(tmp1); + const auto o2 = _mm256_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized digamma() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + constexpr auto sz = size(); + __at_align__ float tmp1[sz / 2], tmp2[sz / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + + for (auto i = decltype(sz){0}; i < sz / 2; i++) { + tmp1[i] = calc_digamma(tmp1[i]); + tmp2[i] = calc_digamma(tmp2[i]); + } + const auto o1 = _mm256_loadu_ps(tmp1); + const auto o2 = _mm256_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized igamma(const Vectorized& x) const { + __m256 lo, hi; + __m256 xlo, xhi; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(x.values, xlo, xhi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + __at_align__ float tmpx1[size() / 2], 
tmpx2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmpx1), xlo); + _mm256_storeu_ps(reinterpret_cast(tmpx2), xhi); + for (int64_t i = 0; i < size() / 2; ++i) { + tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]); + tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]); + } + auto o1 = _mm256_loadu_ps(tmp1); + auto o2 = _mm256_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + + Vectorized igammac(const Vectorized& x) const { + __m256 lo, hi; + __m256 xlo, xhi; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(x.values, xlo, xhi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmp1), lo); + _mm256_storeu_ps(reinterpret_cast(tmp2), hi); + __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2]; + _mm256_storeu_ps(reinterpret_cast(tmpx1), xlo); + _mm256_storeu_ps(reinterpret_cast(tmpx2), xhi); + for (int64_t i = 0; i < size() / 2; ++i) { + tmp1[i] = calc_igammac(tmp1[i], tmpx1[i]); + tmp2[i] = calc_igammac(tmp2[i], tmpx2[i]); + } + auto o1 = _mm256_loadu_ps(tmp1); + auto o2 = _mm256_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized log() const { + return map(Sleef_logf8_u10); + } + Vectorized log2() const { + return map(Sleef_log2f8_u10); + } + Vectorized log10() const { + return map(Sleef_log10f8_u10); + } + Vectorized log1p() const { + return map(Sleef_log1pf8_u10); + } + Vectorized sin() const { + return map(Sleef_sinf8_u10); + } + Vectorized sinh() const { + return map(Sleef_sinhf8_u10); + } + Vectorized cos() const { + return map(Sleef_cosf8_u10); + } + Vectorized cosh() const { + return map(Sleef_coshf8_u10); + } + Vectorized ceil() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm256_ceil_ps(lo); + auto o2 = _mm256_ceil_ps(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized floor() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm256_floor_ps(lo); + auto o2 = _mm256_floor_ps(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized neg() const { + return _mm256_xor_si256(values, 
_mm256_set1_epi16(0x8000)); + } + Vectorized round() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = + _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + auto o2 = + _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return cvt_from_fp32(o1, o2); + } + Vectorized tan() const { + return map(Sleef_tanf8_u10); + } + Vectorized tanh() const { + return map(Sleef_tanhf8_u10); + } + Vectorized trunc() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm256_round_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + auto o2 = _mm256_round_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + return cvt_from_fp32(o1, o2); + } + Vectorized lgamma() const { + return map(Sleef_lgammaf8_u10); + } + Vectorized sqrt() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm256_sqrt_ps(lo); + auto o2 = _mm256_sqrt_ps(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized reciprocal() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto ones = _mm256_set1_ps(1); + auto o1 = _mm256_div_ps(ones, lo); + auto o2 = _mm256_div_ps(ones, hi); + return cvt_from_fp32(o1, o2); + } + Vectorized rsqrt() const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + auto ones = _mm256_set1_ps(1); + auto o1 = _mm256_div_ps(ones, _mm256_sqrt_ps(lo)); + auto o2 = _mm256_div_ps(ones, _mm256_sqrt_ps(hi)); + return cvt_from_fp32(o1, o2); + } + Vectorized pow(const Vectorized& b) const { + __m256 lo, hi; + __m256 b1, b2; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(b.values, b1, b2); + auto o1 = Sleef_powf8_u10(lo, b1); + auto o2 = Sleef_powf8_u10(hi, b2); + return cvt_from_fp32(o1, o2); + } + + private: + template + Vectorized inline binary_compare(const VectorizedType& b, Op op) const { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + cvt_to_fp32(values, a_lo, a_hi); + cvt_to_fp32(b.values, b_lo, b_hi); + auto o1 = op(a_lo, b_lo); + auto o2 = op(a_hi, b_hi); + return cvt_from_fp32(o1, o2); + } + + public: + 
Vectorized inline operator>(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_GT_OQ); + }); + } + Vectorized inline operator<(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_LT_OQ); + }); + } + Vectorized inline operator>=(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_GE_OQ); + }); + } + Vectorized inline operator<=(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_LE_OQ); + }); + } + Vectorized inline operator==(const Vectorized16& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); + }); + } + Vectorized inline operator!=(const Vectorized16& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ); + }); + } +}; + +template +static inline Vectorized binary_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + cvt_to_fp32(__m256i(a), a_lo, a_hi); + cvt_to_fp32(__m256i(b), b_lo, b_hi); + auto o1 = op(a_lo, b_lo); + auto o2 = op(a_hi, b_hi); + return cvt_from_fp32(o1, o2); +} + +#define CONVERT_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + __m256 o1, o2; \ + cvt_to_fp32(__m256i(a), o1, o2); \ + return std::make_tuple(o1, o2); \ + } \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + return cvt_from_fp32(__m256(a), __m256(b)); \ + } + +#define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + auto values = _mm_loadu_si128(reinterpret_cast(data)); \ + __m256 out_values; \ + cvt_to_fp32(values, out_values); \ + out = 
out_values; \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + auto vec = Vectorized::loadu(data); \ + __m256 out1_values, out2_values; \ + cvt_to_fp32(vec, out1_values, out2_values); \ + out1 = out1_values; \ + out2 = out2_values; \ + } + +#else // CPU_CAPABILITY_AVX2 + +#define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr2); \ + convert(arr2, arr, K); \ + return std::make_tuple( \ + Vectorized::loadu(arr), \ + Vectorized::loadu(arr + Vectorized::size())); \ + } \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr); \ + b.store(arr + Vectorized::size()); \ + convert(arr, arr2, K); \ + return Vectorized::loadu(arr2); \ + } + +#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ + for (const auto k : c10::irange(Vectorized::size())) { \ + values[k] = data[k]; \ + } \ + out = Vectorized::loadu(values); \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + load_fp32_from_##name(data, out1); \ + data += Vectorized::size(); \ + load_fp32_from_##name(data, out2); \ + } + +#endif // CPU_CAPABILITY_AVX2 +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..6fec6b9b7b59a2ba50b720c71b4146992b665084 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -0,0 +1,285 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: + using Vectorized16::Vectorized16; + + using value_type = BFloat16; + + Vectorized frac() const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_mul_ps(x, y); + 
}); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_si256(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_si256(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_si256(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + cvtbf16_fp32(__m256i(a), a_lo, a_hi); + cvtbf16_fp32(__m256i(b), b_lo, b_hi); + auto max_lo = _mm256_max_ps(a_lo, b_lo); + auto max_hi = _mm256_max_ps(a_hi, b_hi); + auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. 
+ auto o1 = _mm256_or_ps(max_lo, nan_lo); + auto o2 = _mm256_or_ps(max_hi, nan_hi); + return cvtfp32_bf16(o1, o2); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + cvtbf16_fp32(__m256i(a), a_lo, a_hi); + cvtbf16_fp32(__m256i(b), b_lo, b_hi); + auto min_lo = _mm256_min_ps(a_lo, b_lo); + auto min_hi = _mm256_min_ps(a_hi, b_hi); + auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. + auto o1 = _mm256_or_ps(min_lo, nan_lo); + auto o2 = _mm256_or_ps(min_hi, nan_hi); + return cvtfp32_bf16(o1, o2); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + __m256 a_lo, a_hi; + __m256 min_lo, min_hi; + __m256 max_lo, max_hi; + cvtbf16_fp32(__m256i(a), a_lo, a_hi); + cvtbf16_fp32(__m256i(min), min_lo, min_hi); + cvtbf16_fp32(__m256i(max), max_lo, max_hi); + auto o1 = _mm256_min_ps(max_lo, _mm256_max_ps(min_lo, a_lo)); + auto o2 = _mm256_min_ps(max_hi, _mm256_max_ps(min_hi, a_hi)); + return cvtfp32_bf16(o1, o2); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + __m256 a_lo, a_hi; + __m256 max_lo, max_hi; + cvtbf16_fp32(__m256i(a), a_lo, a_hi); + cvtbf16_fp32(__m256i(max), max_lo, max_hi); + auto o1 = _mm256_min_ps(max_lo, a_lo); + auto o2 = _mm256_min_ps(max_hi, a_hi); + return cvtfp32_bf16(o1, o2); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + __m256 a_lo, a_hi; + __m256 min_lo, min_hi; + cvtbf16_fp32(__m256i(a), a_lo, a_hi); + cvtbf16_fp32(__m256i(min), min_lo, min_hi); + auto o1 = _mm256_max_ps(min_lo, a_lo); + auto o2 = _mm256_max_ps(min_hi, a_hi); + return cvtfp32_bf16(o1, o2); +} + +template <> +inline void 
convert(const BFloat16* src, BFloat16* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); + _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +inline void convert(const float* src, BFloat16* dst, int64_t n) { + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m256 a = _mm256_loadu_ps(&src[i]); + __m256 b = _mm256_loadu_ps(&src[i + 8]); + + __m256i bf = cvtfp32_bf16(a, b); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), bf); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +inline void convert(const double* src, BFloat16* dst, int64_t n) { + auto load_float = [](const double* src) -> __m256 { + // Load one float vector from an array of doubles + __m128 a = _mm256_cvtpd_ps(_mm256_loadu_pd(src)); + __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); + return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1); + }; + + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m256 a = load_float(&src[i]); + __m256 b = load_float(&src[i + 8]); + + __m256i bf = cvtfp32_bf16(a, b); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), bf); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + __m256 c_lo, c_hi; + cvtbf16_fp32(__m256i(a), a_lo, a_hi); + cvtbf16_fp32(__m256i(b), b_lo, b_hi); + cvtbf16_fp32(__m256i(c), c_lo, c_hi); + auto o1 = _mm256_fmadd_ps(a_lo, b_lo, c_lo); + auto o2 = _mm256_fmadd_ps(a_hi, b_hi, c_hi); + return cvtfp32_bf16(o1, o2); +} + 
+CONVERT_VECTORIZED_INIT(BFloat16, bfloat16) +LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) + +#else // defined(CPU_CAPABILITY_AVX2) + +#if !( \ + defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE256)) +CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) +#endif + +LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) +#endif // defined(CPU_CAPABILITY_AVX2) +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h new file mode 100644 index 0000000000000000000000000000000000000000..a8b68fdfc60003e8bf42dcaec98fdc02219bda15 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -0,0 +1,543 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include + +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m256d values; + + public: + using value_type = c10::complex; + using size_type = int; + static constexpr size_type size() { + return 2; + } + Vectorized() { + values = _mm256_setzero_pd(); + } + Vectorized(__m256d v) : values(v) {} + Vectorized(c10::complex val) { + double real_value = val.real(); + double imag_value = val.imag(); + values = _mm256_setr_pd(real_value, imag_value, real_value, imag_value); + } + Vectorized(c10::complex val1, c10::complex val2) { + values = _mm256_setr_pd(val1.real(), val1.imag(), val2.real(), val2.imag()); + } + operator __m256d() const { + return values; + } + template + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy + static_assert(mask > -1 && mask < 4, "Unexpected mask value"); + switch (mask) { + case 0: + return a; + case 1: + return _mm256_blend_pd(a.values, b.values, 0x03); + case 2: + return _mm256_blend_pd(a.values, b.values, 0x0c); + case 3: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm256_unpacklo_pd(mask.values, mask.values); + return _mm256_blendv_pd(a.values, b.values, mask_); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>(base, base + step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { + switch 
(count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + } + return b; + } + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm256_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(2 * size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(c10::complex)); + return _mm256_load_pd(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm256_storeu_pd(reinterpret_cast(ptr), values); + } else if (count > 0) { + double tmp_values[2 * size()]; + _mm256_storeu_pd(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); + } + } + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + __m256d abs_2_() const { + auto val_2 = _mm256_mul_pd(values, values); // a*a b*b + return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b + } + __m256d abs_() const { + auto real = _mm256_movedup_pd(values); // real real + // movehdup_pd does not exist... 
+ auto imag = _mm256_permute_pd(values, 0xf); // imag imag + return Sleef_hypotd4_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm256_and_pd(abs_(), real_mask); // abs 0 + } + __m256d angle_() const { + // angle = atan2(b/a) + auto b_a = _mm256_permute_pd(values, 0x05); // b a + return Sleef_atan2d4_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle + return _mm256_and_pd(angle, real_mask); // angle 0 + } + Vectorized> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); + auto div = _mm256_div_pd(values, abs); + return _mm256_blendv_pd(div, zero, mask); + } + __m256d real_() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm256_and_pd(values, real_mask); + } + Vectorized> real() const { + return real_(); + } + __m256d imag_() const { + const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF)); + return _mm256_and_pd(values, imag_mask); + } + Vectorized> imag() const { + return _mm256_permute_pd(imag_(), 0x05); // b a + } + __m256d conj_() const { + const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + return _mm256_xor_pd(values, sign_mask); // a -b + } + Vectorized> conj() const { + return conj_(); + } + Vectorized> log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + Vectorized> log2() const { + const __m256d log2_ = _mm256_set1_pd(std::log(2)); + return _mm256_div_pd(log(), log2_); + } + Vectorized> log10() const { + const __m256d log10_ = _mm256_set1_pd(std::log(10)); + return _mm256_div_pd(log(), log10_); + } + Vectorized> log1p() const { + return map(std::log1p); + } + Vectorized> asin() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. + // // asin(x) + // // = -i*ln(iz + sqrt(1 -z^2)) + // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + // const __m256d one = _mm256_set1_pd(1); + + // auto conj = conj_(); + // auto b_a = _mm256_permute_pd(conj, 0x05); //-b a + // auto ab = _mm256_mul_pd(conj, b_a); //-ab + // -ab auto im = _mm256_add_pd(ab, ab); //-2ab -2ab + + // auto val_2 = _mm256_mul_pd(values, values); // a*a + // b*b auto re = _mm256_hsub_pd(val_2, _mm256_permute_pd(val_2, 0x05)); // + // a*a-b*b b*b-a*a re = _mm256_sub_pd(one, re); + + // auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt(); //sqrt(re + + // i*im) auto ln = Vectorized(_mm256_add_pd(b_a, root)).log(); //ln(iz + + // sqrt()) return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj(); + // //-i*ln() + return map(std::asin); + } + Vectorized> acos() const { + // acos(x) = pi/2 - asin(x) + constexpr auto pi_2d = c10::pi / 2; + const __m256d pi_2 = _mm256_setr_pd(pi_2d, 0.0, pi_2d, 0.0); + return _mm256_sub_pd(pi_2, asin()); + } + Vectorized> atan() const; + Vectorized> atanh() const { + return map(std::atanh); + } + Vectorized> exp() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expd4_u10(values); //exp(a) exp(b) exp = + // _mm256_blend_pd(exp, _mm256_permute_pd(exp, 0x05), 0x0A); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosd4_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y, + // 0x05), + // sin_cos.x, 0x0A); //cos(b) sin(b) + // return _mm256_mul_pd(exp, cos_sin); + return map(std::exp); + } + Vectorized> exp2() const { + // Use identity 2**x = exp(log(2) * x) + const __m256d ln_2 = _mm256_set1_pd(c10::ln_2); + Vectorized> scaled_values = + _mm256_mul_pd(values, ln_2); + return scaled_values.exp(); + } + Vectorized> expm1() const { + return map(std::expm1); + } + Vectorized> sin() const { + return map(std::sin); + } + Vectorized> sinh() const { + return map(std::sinh); + } + Vectorized> cos() const { + return map(std::cos); + } + Vectorized> cosh() const { + return map(std::cosh); + } + Vectorized> ceil() const { + return _mm256_ceil_pd(values); + } + Vectorized> floor() const { + return _mm256_floor_pd(values); + } + Vectorized> neg() const { + auto zero = _mm256_setzero_pd(); + return _mm256_sub_pd(zero, values); + } + Vectorized> round() const { + return _mm256_round_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized> tan() const { + return map(std::tan); + } + Vectorized> tanh() const { + return map(std::tanh); + } + Vectorized> trunc() const { + return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized> sqrt() const { + return map(std::sqrt); + } + Vectorized> reciprocal() const; + Vectorized> rsqrt() const { + return sqrt().reciprocal(); + } + Vectorized> pow( + const Vectorized>& exp) const { + __at_align__ c10::complex x_tmp[size()]; + __at_align__ c10::complex y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (const auto i : c10::irange(size())) { + x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); + } + return 
loadu(x_tmp); + } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized> operator==( + const Vectorized>& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + Vectorized> operator!=( + const Vectorized>& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); + } + Vectorized> operator<( + const Vectorized>& /*unused*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& /*unused*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& /*unused*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& /*unused*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_add_pd(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_sub_pd(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm256_mul_pd(a, b); // ac bd + + auto d_c = _mm256_permute_pd(b, 0x05); // d c + d_c = _mm256_xor_pd(sign_mask, d_c); // d -c + auto ad_bc = _mm256_mul_pd(a, d_c); // ad -bc + + auto ret = _mm256_hsub_pd(ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+ // //re + im*i = (a + bi) / (c + di) + // auto mask = _mm256_set1_pd(-0.f); + // auto fabs_cd = _mm256_andnot_pd(mask, b); // |c| |d| + // auto fabs_dc = _mm256_permute_pd(fabs_cd, 0x05); // |d| |c| + // auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, + // fabs_dc)); // 1/sc 1/sc auto a2 = _mm256_mul_pd(a, scale); // + // a/sc b/sc auto b2 = _mm256_mul_pd(b, scale); // c/sc d/sc + // auto acbd2 = _mm256_mul_pd(a2, b2); + + // const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0); + // auto dc2 = _mm256_permute_pd(b2, 0x05); // d/sc c/sc + // dc2 = _mm256_xor_pd(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm256_mul_pd(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = _mm256_hadd_pd(acbd2, adbc2); //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm256_div_pd(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return _mm256_loadu_pd(reinterpret_cast(out)); +} + +// reciprocal. Implement this here so we can use multiplication. +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+ // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + // auto c_d = _mm256_xor_pd(sign_mask, values); //c -d + // return _mm256_div_pd(c_d, abs_2_()); + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = c10::complex(1) / tmp[i]; + } + return loadu(tmp); +} + +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m256d i = _mm256_setr_pd(0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm256_setr_pd(0.0, 0.5, 0.0, 0.5); + + // auto sum = Vectorized(_mm256_add_pd(i, values)); // a + // 1+b auto sub = Vectorized(_mm256_sub_pd(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() + return map(std::atan); +} + +template <> +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_LT_OQ); + auto max = _mm256_blendv_pd(a, b, mask); + // Exploit the fact that all-ones is a NaN. + auto isnan = _mm256_cmp_pd(abs_a, abs_b, _CMP_UNORD_Q); + return _mm256_or_pd(max, isnan); +} + +template <> +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_GT_OQ); + auto min = _mm256_blendv_pd(a, b, mask); + // Exploit the fact that all-ones is a NaN. 
+ auto isnan = _mm256_cmp_pd(abs_a, abs_b, _CMP_UNORD_Q); + return _mm256_or_pd(min, isnan); +} + +template <> +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_and_pd(a, b); +} + +template <> +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_or_pd(a, b); +} + +template <> +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_xor_pd(a, b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm256_set1_pd(1.0)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm256_set1_pd(1.0)); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h new file mode 100644 index 0000000000000000000000000000000000000000..96d0530f038d32d5eebfd82269c1df7cd5ae5daa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -0,0 +1,625 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m256 values; + + public: + using value_type = c10::complex; + using size_type = int; + static constexpr size_type size() { + return 4; + } + Vectorized() { + values = _mm256_setzero_ps(); + } + Vectorized(__m256 v) : values(v) {} + Vectorized(c10::complex val) { + float real_value = val.real(); + float imag_value = val.imag(); + values = _mm256_setr_ps( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4) { + values = _mm256_setr_ps( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag()); + } + operator __m256() const { + return values; + } + template + static Vectorized> blend( + const Vectorized>& a, + const 
Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy + static_assert(mask > -1 && mask < 16, "Unexpected mask range"); + switch (mask) { + case 0: + return a; + case 1: + return _mm256_blend_ps( + a.values, b.values, 0x03); // b0000 0001 = b0000 0011 + case 2: + return _mm256_blend_ps( + a.values, b.values, 0x0C); // b0000 0010 = b0000 1100 + case 3: + return _mm256_blend_ps( + a.values, b.values, 0x0F); // b0000 0011 = b0000 1111 + case 4: + return _mm256_blend_ps( + a.values, b.values, 0x30); // b0000 0100 = b0011 0000 + case 5: + return _mm256_blend_ps( + a.values, b.values, 0x33); // b0000 0101 = b0011 0011 + case 6: + return _mm256_blend_ps( + a.values, b.values, 0x3C); // b0000 0110 = b0011 1100 + case 7: + return _mm256_blend_ps( + a.values, b.values, 0x3F); // b0000 0111 = b0011 1111 + case 8: + return _mm256_blend_ps( + a.values, b.values, 0xC0); // b0000 1000 = b1100 0000 + case 9: + return _mm256_blend_ps( + a.values, b.values, 0xC3); // b0000 1001 = b1100 0011 + case 10: + return _mm256_blend_ps( + a.values, b.values, 0xCC); // b0000 1010 = b1100 1100 + case 11: + return _mm256_blend_ps( + a.values, b.values, 0xCF); // b0000 1011 = b1100 1111 + case 12: + return _mm256_blend_ps( + a.values, b.values, 0xF0); // b0000 1100 = b1111 0000 + case 13: + return _mm256_blend_ps( + a.values, b.values, 0xF3); // b0000 1101 = b1111 0011 + case 14: + return _mm256_blend_ps( + a.values, b.values, 0xFC); // b0000 1110 = b1111 1100 + default: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm256_unpacklo_ps(mask.values, mask.values); + return _mm256_blendv_ps(a.values, b.values, mask_); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + step, + base + c10::complex(2) * step, + base + 
c10::complex(3) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm256_loadu_ps(reinterpret_cast(ptr)); + + __at_align__ float tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(2 * size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(c10::complex)); + return _mm256_load_ps(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm256_storeu_ps(reinterpret_cast(ptr), values); + } else if (count > 0) { + float tmp_values[2 * size()]; + _mm256_storeu_ps(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); + } + } + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + __m256 abs_2_() const { + auto val_2 = _mm256_mul_ps(values, values); // a*a b*b + auto ret = _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b + return _mm256_permute_ps(ret, 0xD8); + } + __m256 abs_() const { + auto real = _mm256_moveldup_ps(values); // real real + auto imag = _mm256_movehdup_ps(values); // imag imag + return Sleef_hypotf8_u05(real, 
imag); // abs abs + } + Vectorized> abs() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm256_and_ps(abs_(), real_mask); // abs 0 + } + __m256 angle_() const { + // angle = atan2(b/a) + auto b_a = _mm256_permute_ps(values, 0xB1); // b a + return Sleef_atan2f8_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle + return _mm256_and_ps(angle, real_mask); // angle 0 + } + Vectorized> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); + auto div = _mm256_div_ps(values, abs); + return _mm256_blendv_ps(div, zero, mask); + } + __m256 real_() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm256_and_ps(values, real_mask); + } + Vectorized> real() const { + return real_(); + } + __m256 imag_() const { + const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF)); + return _mm256_and_ps(values, imag_mask); + } + Vectorized> imag() const { + return _mm256_permute_ps(imag_(), 0xB1); // b a + } + __m256 conj_() const { + const __m256 sign_mask = + _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + return _mm256_xor_ps(values, sign_mask); // a -b + } + Vectorized> conj() const { + return conj_(); + } + Vectorized> log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + Vectorized> log2() const { + const __m256 log2_ = _mm256_set1_ps(std::log(2)); + return _mm256_div_ps(log(), log2_); + } + Vectorized> log10() const { + const __m256 log10_ = _mm256_set1_ps(std::log(10)); + return _mm256_div_ps(log(), log10_); + } + Vectorized> log1p() const { + return map(std::log1p); + } + Vectorized> asin() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. + // // asin(x) + // // = -i*ln(iz + sqrt(1 -z^2)) + // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + // const __m256 one = _mm256_set1_ps(1); + + // auto conj = conj_(); + // auto b_a = _mm256_permute_ps(conj, 0xB1); //-b a + // auto ab = _mm256_mul_ps(conj, b_a); //-ab + // -ab auto im = _mm256_add_ps(ab, ab); //-2ab -2ab + + // auto val_2 = _mm256_mul_ps(values, values); // a*a + // b*b auto re = _mm256_hsub_ps(val_2, _mm256_permute_ps(val_2, 0xB1)); // + // a*a-b*b b*b-a*a re = _mm256_permute_ps(re, 0xD8); re = + // _mm256_sub_ps(one, re); + + // auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt(); //sqrt(re + + // i*im) auto ln = Vectorized(_mm256_add_ps(b_a, root)).log(); //ln(iz + + // sqrt()) return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj(); + // //-i*ln() + return map(std::asin); + } + Vectorized> acos() const { + return map(std::acos); + } + Vectorized> atan() const; + Vectorized> atanh() const { + return map(std::atanh); + } + Vectorized> exp() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expf8_u10(values); //exp(a) exp(b) exp = + // _mm256_blend_ps(exp, _mm256_permute_ps(exp, 0xB1), 0xAA); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosf8_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y, + // 0xB1), + // sin_cos.x, 0xAA); //cos(b) sin(b) + // return _mm256_mul_ps(exp, cos_sin); + return map(std::exp); + } + Vectorized> exp2() const { + // Use identity 2**x = exp(log(2) * x) + const __m256 ln_2 = _mm256_set1_ps(c10::ln_2); + Vectorized> scaled_values = _mm256_mul_ps(values, ln_2); + return scaled_values.exp(); + } + Vectorized> expm1() const { + return map(std::expm1); + } + Vectorized> sin() const { + return map(std::sin); + } + Vectorized> sinh() const { + return map(std::sinh); + } + Vectorized> cos() const { + return map(std::cos); + } + Vectorized> cosh() const { + return map(std::cosh); + } + Vectorized> ceil() const { + return _mm256_ceil_ps(values); + } + Vectorized> floor() const { + return _mm256_floor_ps(values); + } + Vectorized> neg() const { + auto zero = _mm256_setzero_ps(); + return _mm256_sub_ps(zero, values); + } + Vectorized> round() const { + return _mm256_round_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized> tan() const { + return map(std::tan); + } + Vectorized> tanh() const { + return map(std::tanh); + } + Vectorized> trunc() const { + return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized> sqrt() const { + return map(std::sqrt); + } + Vectorized> reciprocal() const; + Vectorized> rsqrt() const { + return sqrt().reciprocal(); + } + Vectorized> pow( + const Vectorized>& exp) const { + __at_align__ c10::complex x_tmp[size()]; + __at_align__ c10::complex y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (const auto i : c10::irange(size())) { + x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); + } + return 
loadu(x_tmp); + } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized> operator==( + const Vectorized>& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + Vectorized> operator!=( + const Vectorized>& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); + } + Vectorized> operator<( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_add_ps(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_sub_ps(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m256 sign_mask = + _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm256_mul_ps(a, b); // ac bd + + auto d_c = _mm256_permute_ps(b, 0xB1); // d c + d_c = _mm256_xor_ps(sign_mask, d_c); // d -c + auto ad_bc = _mm256_mul_ps(a, d_c); // ad -bc + + auto ret = _mm256_hsub_ps(ac_bd, ad_bc); // ac - bd ad + bc + ret = _mm256_permute_ps(ret, 0xD8); + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the 
case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // auto mask = _mm256_set1_ps(-0.f); + // auto fabs_cd = _mm256_andnot_ps(mask, b); // |c| |d| + // auto fabs_dc = _mm256_permute_ps(fabs_cd, 0xB1); // |d| |c| + // auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc + // auto a2 = _mm256_mul_ps(a, scale); // a/sc b/sc + // auto b2 = _mm256_mul_ps(b, scale); // c/sc d/sc + // auto acbd2 = _mm256_mul_ps(a2, b2); + + // const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); auto dc2 = _mm256_permute_ps(b2, 0xB1); // d/sc c/sc + // dc2 = _mm256_xor_ps(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm256_mul_ps(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = _mm256_hadd_ps(acbd2, adbc2); //(ac+bd)/sc^2 (bc-ad)/sc^2 + // res2 = _mm256_permute_ps(res2, 0xD8); + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm256_div_ps(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return _mm256_loadu_ps(reinterpret_cast(out)); +} + +// reciprocal. Implement this here so we can use multiplication. +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+ // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); auto c_d = _mm256_xor_ps(sign_mask, values); //c -d + // return _mm256_div_ps(c_d, abs_2_()); + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = c10::complex(1) / tmp[i]; + } + return loadu(tmp); +} + +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m256 i = _mm256_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm256_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm256_add_ps(i, values)); // a + // 1+b auto sub = Vectorized(_mm256_sub_ps(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() + return map(std::atan); +} + +template <> +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); + auto max = _mm256_blendv_ps(a, b, mask); + // Exploit the fact that all-ones is a NaN. + auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + return _mm256_or_ps(max, isnan); +} + +template <> +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); + auto min = _mm256_blendv_ps(a, b, mask); + // Exploit the fact that all-ones is a NaN. 
+ auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + return _mm256_or_ps(min, isnan); +} + +template <> +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_and_ps(a, b); +} + +template <> +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_or_ps(a, b); +} + +template <> +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_xor_ps(a, b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm256_set1_ps(1.0f)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm256_set1_ps(1.0f)); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_convert.h new file mode 100644 index 0000000000000000000000000000000000000000..4ea85701b7cbbef81f26709ea08be38cdea3e108 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_convert.h @@ -0,0 +1,370 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + __m256 value; + cvtbf16_fp32(_mm256_castsi256_si128(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + __m256 value; + cvtfp16_fp32(_mm256_castsi256_si128(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = _mm256_castsi128_si256(cvtfp32_bf16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = convert_float_bfloat16(src[0], src[1]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + std::tie(result[0], result[1]) = convert_bfloat16_float(src[0]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + 
VectorizedN result; + result[0] = _mm256_castsi128_si256(cvtfp32_fp16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + result[0] = convert_float_half(src[0], src[1]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + std::tie(result[0], result[1]) = convert_half_float(src[0]); + return result; + } +}; + +template <> +inline Vectorized convert_to_fp_of_same_size( + const Vectorized& src); + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low_double = at::vec::convert_to_fp_of_same_size(src[0]); + auto low = _mm256_cvtpd_ps(low_double); + auto high_double = at::vec::convert_to_fp_of_same_size(src[1]); + auto high = _mm256_cvtpd_ps(high_double); + return Vectorized( + _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + // Scalarization is the most reliable way of converting fp to int64 on AVX2. 
+ // Check: https://stackoverflow.com/questions/41144668 + float buffer[8]; + src.store(buffer); + at::vec::VectorizedN result; + result[0] = Vectorized( + static_cast(buffer[0]), + static_cast(buffer[1]), + static_cast(buffer[2]), + static_cast(buffer[3])); + result[1] = Vectorized( + static_cast(buffer[4]), + static_cast(buffer[5]), + static_cast(buffer[6]), + static_cast(buffer[7])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(2, 0, 2, 0)); + auto high = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(2, 0, 2, 0)); + auto low_perm = _mm256_permute4x64_epi64(low, _MM_SHUFFLE(3, 1, 2, 0)); + auto high_perm = _mm256_permute4x64_epi64(high, _MM_SHUFFLE(3, 1, 2, 0)); + return Vectorized(_mm256_blend_epi32(low_perm, high_perm, 0xF0)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src[0])); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src[0], 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepi8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepu8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + return Vectorized(_mm256_cvttps_epi32(src[0])); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + return Vectorized(_mm256_cvtepi32_ps(src[0])); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { 
+ auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepu8_epi16(src128)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + src_t, + 1, + typename std::enable_if_t< + (is_reduced_floating_point_v && is_8bit_integer_v) || + (is_reduced_floating_point_v && is_8bit_integer_v), + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN tmp_fp32 = VecConvert::apply(src); + return VecConvert::apply(tmp_fp32); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 2, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + at::vec::Vectorized vec1 = convert_float_to_int8(src[0]); + at::vec::Vectorized vec2 = convert_float_to_int8(src[1]); + __m128 lane2 = _mm256_castps256_ps128(_mm256_castsi256_ps(vec2)); + __m256 combined = _mm256_insertf128_ps(_mm256_castsi256_ps(vec1), lane2, 1); + // Shuffle [191:128] bit from combined in to [127:64] bit of result + __m256i result = + _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000); + return at::vec::Vectorized(result); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_float_to_int8(src[0]); + } +}; + +template +struct VecConvert< + float, + 2, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + // Shuffle [127:64] bit from src[0] in to [191:128] bit of shuffled + __m256i shuffled = _mm256_permute4x64_epi64(src[0], 0b11011000); + __m256i src2 = + _mm256_castsi128_si256(_mm_castps_si128(_mm256_extractf128_ps( + _mm256_castsi256_ps(shuffled), 1) // Extract the second 128-bit lane + )); + return VectorizedN( + convert_int8_to_float(src[0]), + convert_int8_to_float(src2)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + int64_t, + 2, + std::enable_if_t< + std::is_same_v || std::is_same_v>> { + static inline 
VectorizedN apply( + const VectorizedN& src) { + return VecConvert::apply( + VecConvert::apply(src)); + } +}; + +#endif /* defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) */ + +#if (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)) +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_to_float(src[0]); + } +}; +#endif + +#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + // Load 16-bit unsigned integers from src into an SVE vector + svuint16_t u16x4 = + svld1_u16(svptrue_b16(), reinterpret_cast(&src[0])); + // Zero-extend to 32-bit SVE does not have direct vmovl_u16 equivalent. + vls_uint32_t u32x4 = + svreinterpret_u32_u16(svzip1_u16(svdup_n_u16(0), u16x4)); + // Reinterpret as float32 + vls_float32_t f32x4 = svreinterpret_f32_u32(u32x4); + res[0] = Vectorized(f32x4); + return res; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + std::tie(res[0], res[1]) = convert_bfloat16_float(src[0]); + return res; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + res[0] = convert_float_bfloat16(src[0], src[1]); + return res; + } +}; + +#endif // defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + auto [res_vec1, res_vec2] = convert_to_float(src[0]); + return res_vec1; + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_from_float(src[0], src[0]); + } +}; + +} // 
namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h new file mode 100644 index 0000000000000000000000000000000000000000..34c34f62526d9cb2d5cd5ed9d8e396280ca608f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h @@ -0,0 +1,531 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + __m256d values; + + public: + using value_type = double; + using size_type = int; + static constexpr size_type size() { + return 4; + } + Vectorized() { + values = _mm256_setzero_pd(); + } + Vectorized(__m256d v) : values(v) {} + Vectorized(double val) { + values = _mm256_set1_pd(val); + } + Vectorized(double val1, double val2, double val3, double val4) { + values = _mm256_setr_pd(val1, val2, val3, val4); + } + operator __m256d() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm256_blend_pd(a.values, b.values, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return 
_mm256_blendv_pd(a.values, b.values, mask.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return _mm256_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(double)); + return _mm256_load_pd(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm256_storeu_pd(reinterpret_cast(ptr), values); + } else if (count > 0) { + double tmp_values[size()]; + _mm256_storeu_pd(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(double)); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ); + return _mm256_movemask_pd(cmp); + } + Vectorized isnan() const { + return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); + } + bool has_inf_nan() const { + __m256d self_sub = _mm256_sub_pd(values, 
values); + return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != + 0; + } + Vectorized map(double (*const f)(double)) const { + __at_align__ double tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = _mm256_set1_pd(-0.f); + return _mm256_andnot_pd(mask, values); + } + Vectorized angle() const { + const auto zero_vec = _mm256_set1_pd(0.f); + const auto nan_vec = _mm256_set1_pd(NAN); + const auto not_nan_mask = _mm256_cmp_pd(values, values, _CMP_EQ_OQ); + const auto nan_mask = _mm256_cmp_pd(not_nan_mask, zero_vec, _CMP_EQ_OQ); + const auto pi = _mm256_set1_pd(c10::pi); + + const auto neg_mask = _mm256_cmp_pd(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm256_blendv_pd(zero_vec, pi, neg_mask); + angle = _mm256_blendv_pd(angle, nan_vec, nan_mask); + return angle; + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm256_set1_pd(0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return Vectorized(Sleef_acosd4_u10(values)); + } + Vectorized acosh() const { + return Vectorized(Sleef_acoshd4_u10(values)); + } + Vectorized asin() const { + return Vectorized(Sleef_asind4_u10(values)); + } + Vectorized asinh() const { + return Vectorized(Sleef_asinhd4_u10(values)); + } + Vectorized atan() const { + return Vectorized(Sleef_atand4_u10(values)); + } + Vectorized atanh() const { + return Vectorized(Sleef_atanhd4_u10(values)); + } + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2d4_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { + return Vectorized(Sleef_copysignd4(values, sign)); + } + Vectorized erf() const { + return Vectorized(Sleef_erfd4_u10(values)); + } + Vectorized erfc() const { + return Vectorized(Sleef_erfcd4_u15(values)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() 
const { + return Vectorized(Sleef_expd4_u10(values)); + } + Vectorized exp2() const { + return Vectorized(Sleef_exp2d4_u10(values)); + } + Vectorized expm1() const { + return Vectorized(Sleef_expm1d4_u10(values)); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const { + return Vectorized(Sleef_fmodd4(values, q)); + } + Vectorized hypot(const Vectorized& b) const { + return Vectorized(Sleef_hypotd4_u05(values, b)); + } + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized log() const { + return Vectorized(Sleef_logd4_u10(values)); + } + Vectorized log2() const { + return Vectorized(Sleef_log2d4_u10(values)); + } + Vectorized log10() const { + return Vectorized(Sleef_log10d4_u10(values)); + } + Vectorized log1p() const { + return Vectorized(Sleef_log1pd4_u10(values)); + } + Vectorized sin() const { + return Vectorized(Sleef_sind4_u10(values)); + } + Vectorized sinh() const { + return Vectorized(Sleef_sinhd4_u10(values)); + } + Vectorized cos() const { + return Vectorized(Sleef_cosd4_u10(values)); + } + Vectorized cosh() const { + return Vectorized(Sleef_coshd4_u10(values)); + } + Vectorized ceil() const { + return _mm256_ceil_pd(values); + } + Vectorized floor() const { + return _mm256_floor_pd(values); + } + 
Vectorized frac() const; + Vectorized neg() const { + return _mm256_xor_pd(_mm256_set1_pd(-0.), values); + } + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterd4(values, b)); + } + Vectorized round() const { + return _mm256_round_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized tan() const { + return Vectorized(Sleef_tand4_u10(values)); + } + Vectorized tanh() const { + return Vectorized(Sleef_tanhd4_u10(values)); + } + Vectorized trunc() const { + return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized lgamma() const { + return Vectorized(Sleef_lgammad4_u10(values)); + } + Vectorized sqrt() const { + return _mm256_sqrt_pd(values); + } + Vectorized reciprocal() const { + return _mm256_div_pd(_mm256_set1_pd(1), values); + } + Vectorized rsqrt() const { + return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); + } + Vectorized pow(const Vectorized& b) const { + return Vectorized(Sleef_powd4_u10(values, b)); + } + // Comparison using the _CMP_**_OQ predicate. 
+ // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + + Vectorized operator!=(const Vectorized& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); + } + + Vectorized operator<(const Vectorized& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ); + } + + Vectorized operator<=(const Vectorized& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ); + } + + Vectorized operator>(const Vectorized& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ); + } + + Vectorized operator>=(const Vectorized& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_pd(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_pd(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm256_mul_pd(a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return _mm256_div_pd(a, b); +} + +// frac. Implement this here so we can use subtraction. +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + Vectorized max = _mm256_max_pd(a, b); + Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. + return _mm256_or_pd(max, isnan); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + Vectorized min = _mm256_min_pd(a, b); + Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. + return _mm256_or_pd(min, isnan); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return _mm256_min_pd(max, _mm256_max_pd(min, a)); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return _mm256_max_pd(min, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return _mm256_min_pd(max, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_pd(a, b); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_pd(a, b); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_pd(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) 
const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0); +} + +template <> +inline void convert(const double* src, double* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i)); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +#ifdef CPU_CAPABILITY_AVX2 +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fmadd_pd(a, b, c); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fnmadd_pd(a, b, c); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fmsub_pd(a, b, c); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fnmsub_pd(a, b, c); +} +#endif + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h new file mode 100644 index 0000000000000000000000000000000000000000..1a2cbb07006467f5eded6893f5aadf4d68e93053 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h @@ -0,0 +1,847 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + __m256 values; + + public: + using value_type = float; + using size_type = int; + static constexpr size_type size() { + return 8; + } + Vectorized() { + values = _mm256_setzero_ps(); + } + Vectorized(__m256 v) : values(v) {} + Vectorized(float val) { + values = _mm256_set1_ps(val); + } + Vectorized( + float val1, + float val2, + float val3, + float val4, + float val5, + float val6, + float val7, + float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } + Vectorized(const float (&arr)[8]) + : Vectorized( + arr[0], + arr[1], + arr[2], + arr[3], + arr[4], + arr[5], + arr[6], + arr[7]) {} + operator __m256() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm256_blend_ps(a.values, b.values, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + 
const Vectorized& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return _mm256_loadu_ps(reinterpret_cast(ptr)); + __at_align__ float tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, reinterpret_cast(ptr), count * sizeof(float)); + return _mm256_loadu_ps(tmp_values); + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + _mm256_storeu_ps(reinterpret_cast(ptr), values); + } else if (count > 0) { + float tmp_values[size()]; + _mm256_storeu_ps(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(float)); + } + } + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ); + return _mm256_movemask_ps(cmp); + } + Vectorized isnan() const { + return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); + } + + bool has_inf_nan() const { + __m256 self_sub = _mm256_sub_ps(values, values); + return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != + 0; + } + + Vectorized map(float (*const f)(float)) const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = _mm256_set1_ps(-0.f); + return _mm256_andnot_ps(mask, values); + } + Vectorized angle() const { + const auto zero_vec = _mm256_set1_ps(0.f); + const auto nan_vec = _mm256_set1_ps(NAN); + const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); + const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); + const auto pi = _mm256_set1_ps(c10::pi); + + const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); + angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); + return angle; + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { 
+ return _mm256_set1_ps(0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return Vectorized(Sleef_acosf8_u10(values)); + } + Vectorized acosh() const { + return Vectorized(Sleef_acoshf8_u10(values)); + } + Vectorized asin() const { + return Vectorized(Sleef_asinf8_u10(values)); + } + Vectorized asinh() const { + return Vectorized(Sleef_asinhf8_u10(values)); + } + Vectorized atan() const { + return Vectorized(Sleef_atanf8_u10(values)); + } + Vectorized atanh() const { + return Vectorized(Sleef_atanhf8_u10(values)); + } + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2f8_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { + return Vectorized(Sleef_copysignf8(values, sign)); + } + Vectorized erf() const { + // constants + const auto neg_zero_vec = _mm256_set1_ps(-0.f); + const auto one_vec = _mm256_set1_ps(1.0f); + const auto p = _mm256_set1_ps(0.3275911f); + const auto p1 = _mm256_set1_ps(0.254829592f); + const auto p2 = _mm256_set1_ps(-0.284496736f); + const auto p3 = _mm256_set1_ps(1.421413741f); + const auto p4 = _mm256_set1_ps(-1.453152027f); + const auto p5 = _mm256_set1_ps(1.061405429f); + // sign(x) + auto sign_mask = _mm256_and_ps(neg_zero_vec, values); + auto abs_vec = _mm256_xor_ps(sign_mask, values); + // t = 1 / (p * abs(x) + 1) + auto tmp0 = _mm256_fmadd_ps(p, abs_vec, one_vec); + auto t = _mm256_div_ps(one_vec, tmp0); + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = _mm256_fmadd_ps(p5, t, p4); + auto tmp2 = _mm256_fmadd_ps(tmp1, t, p3); + auto tmp3 = _mm256_fmadd_ps(tmp2, t, p2); + auto r = _mm256_fmadd_ps(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = _mm256_mul_ps(values, values); + auto neg_pow_2 = _mm256_xor_ps(neg_zero_vec, pow_2); + // auto tmp4 = exp(neg_pow_2); + auto tmp4 = Vectorized(Sleef_expf8_u10(neg_pow_2)); + auto tmp5 = _mm256_xor_ps(neg_zero_vec, tmp4); + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = 
_mm256_mul_ps(tmp5, t); + auto tmp7 = _mm256_fmadd_ps(tmp6, r, one_vec); + return _mm256_xor_ps(sign_mask, tmp7); + } + Vectorized erfc() const { + return Vectorized(Sleef_erfcf8_u15(values)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() const { + return Vectorized(Sleef_expf8_u10(values)); + } + Vectorized exp2() const { + return Vectorized(Sleef_exp2f8_u10(values)); + } + Vectorized expm1() const { + return Vectorized(Sleef_expm1f8_u10(values)); + } + Vectorized fexp_u20() const { + const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f); + const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f); + const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356); + const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236); + + const __m256 vec_exp_log2ef = + _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) + + const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2)); + const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f); + + const __m256 vec_ln_flt_min = + _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); + const __m256 vec_ln_flt_max = + _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); + const __m256 vec_inf = _mm256_set1_ps(INFINITY); + const __m256 zero = _mm256_setzero_ps(); + + // exp(x) = 2**(x * log2(e)) + // = 2**xi * 2**xf - TIPS we are using the EEEE floating point + // representation with identification to the exponent and the + // mentissa + // 2**xf will be approximated to a polynomial of degree 3 computed with + // Horner method + // compute the min/max for the mask + // Masks + __m256 mask_too_small = + _mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min + __m256 mask_too_large = + _mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max + + // transformation with log2(e) + auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef); + auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src)); + + // compute polynomial using Horner Scheme + auto 
vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2); + vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1); + vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0); + + vec_src = _mm256_sub_ps(vec_src, vec_res); + // // the tips is here, headache in perspective + auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b); + // headache bis + __m256i casted_integer = _mm256_cvttps_epi32(tmp); + // bitwise to float for the final transformation + auto result = _mm256_castsi256_ps(casted_integer); + // boundary condition + // Set to 0 where x < ln(FLT_MIN) + result = _mm256_blendv_ps(result, zero, mask_too_small); + // Set to +inf where x > ln(FLT_MAX) + result = _mm256_blendv_ps(result, vec_inf, mask_too_large); + // final interpretation to float + return result; + } + + Vectorized exp_u20() const { + // A faster version of exp with ULP=20 + const __m256 vec_factorial_1 = + _mm256_set1_ps(0.999999701f); // 1/factorial(1) + const __m256 vec_factorial_2 = + _mm256_set1_ps(0.499991506f); // 1/factorial(2) + const __m256 vec_factorial_3 = + _mm256_set1_ps(0.166676521f); // 1/factorial(3) + const __m256 vec_factorial_4 = + _mm256_set1_ps(0.0418978221f); // 1/factorial(4) + const __m256 vec_factorial_5 = + _mm256_set1_ps(0.00828929059f); // 1/factorial(5) + const __m256 vec_exp_log2ef = + _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) + const __m256 vec_half = _mm256_set1_ps(0.5f); + const __m256 vec_one = _mm256_set1_ps(1.f); + const __m256 vec_zero = _mm256_set1_ps(0.f); + const __m256 vec_two = _mm256_set1_ps(2.f); + const __m256 vec_ln2f = + _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) + const __m256 vec_ln_flt_min = + _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); + const __m256 vec_ln_flt_max = + _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); + const __m256i vec_127 = _mm256_set1_epi32(0x0000007f); + const int n_mantissa_bits = 23; + + // exp(x) = + // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem + 
// = 2^n * exp(r) // simplify the exp(n*ln(2)) expression + + auto less_ln_flt_min_mask = + _mm256_cmp_ps(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/); + auto vec_src = _mm256_min_ps(values, vec_ln_flt_max); + vec_src = _mm256_max_ps(vec_src, vec_ln_flt_min); + + // fx = floorf(x * log2ef + 0.5) + auto vec_fx = _mm256_fmadd_ps(vec_src, vec_exp_log2ef, vec_half); + vec_fx = _mm256_floor_ps(vec_fx); + + // x = x - fx * ln2 + auto vec_exp_poly = _mm256_fnmadd_ps(vec_fx, vec_ln2f, vec_src); + + // compute polynomial + auto vec_res = + _mm256_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4); + vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3); + vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2); + vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1); + vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_one); + + // compute 2^(n-1) + auto vec_exp_number = _mm256_sub_ps(vec_fx, vec_one); + auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); + auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); + vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); + auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); + vec_two_pow_n = + _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); + + // y = y * 2^n + vec_res = _mm256_mul_ps(vec_res, vec_two_pow_n); + vec_res = _mm256_mul_ps(vec_res, vec_two); + return vec_res; + } + Vectorized fmod(const Vectorized& q) const { + return Vectorized(Sleef_fmodf8(values, q)); + } + Vectorized log() const { + return Vectorized(Sleef_logf8_u10(values)); + } + Vectorized log2() const { + return Vectorized(Sleef_log2f8_u10(values)); + } + Vectorized log10() const { + return Vectorized(Sleef_log10f8_u10(values)); + } + Vectorized log1p() const { + return Vectorized(Sleef_log1pf8_u10(values)); + } + Vectorized frac() const; + Vectorized sin() const { + return Vectorized(Sleef_sinf8_u35(values)); + } + Vectorized sinh() const { + return 
Vectorized(Sleef_sinhf8_u10(values)); + } + Vectorized cos() const { + return Vectorized(Sleef_cosf8_u35(values)); + } + Vectorized cosh() const { + return Vectorized(Sleef_coshf8_u10(values)); + } + Vectorized ceil() const { + return _mm256_ceil_ps(values); + } + Vectorized floor() const { + return _mm256_floor_ps(values); + } + Vectorized hypot(const Vectorized& b) const { + return Vectorized(Sleef_hypotf8_u05(values, b)); + } + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized neg() const { + return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); + } + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterf8(values, b)); + } + Vectorized round() const { + return _mm256_round_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized tan() const { + return Vectorized(Sleef_tanf8_u10(values)); + } + Vectorized tanh() const { + return Vectorized(Sleef_tanhf8_u10(values)); + } + Vectorized trunc() const { + return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized lgamma() const { + return Vectorized(Sleef_lgammaf8_u10(values)); + } + Vectorized sqrt() const { + return _mm256_sqrt_ps(values); + } + Vectorized reciprocal() const { + return _mm256_div_ps(_mm256_set1_ps(1), values); + } + Vectorized rsqrt() const { + return 
_mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); + } + Vectorized pow(const Vectorized& b) const { + return Vectorized(Sleef_powf8_u10(values, b)); + } + float reduce_add() const { + auto v = values; + // 128-bit shuffle + auto v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = _mm256_add_ps(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = _mm256_add_ps(v, v1); + // 32-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = _mm256_add_ps(v, v1); + return _mm256_cvtss_f32(v); + } + float reduce_max() const { + auto v = values; + // 128-bit shuffle + auto v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = _mm256_max_ps(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = _mm256_max_ps(v, v1); + // 32-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = _mm256_max_ps(v, v1); + return _mm256_cvtss_f32(v); + } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + + Vectorized operator!=(const Vectorized& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); + } + + Vectorized operator<(const Vectorized& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ); + } + + Vectorized operator<=(const Vectorized& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ); + } + + Vectorized operator>(const Vectorized& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ); + } + + Vectorized operator>=(const Vectorized& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) 
const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_ps(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_ps(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm256_mul_ps(a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return _mm256_div_ps(a, b); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + Vectorized max = _mm256_max_ps(a, b); + Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. + return _mm256_or_ps(max, isnan); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + Vectorized min = _mm256_min_ps(a, b); + Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. 
+ return _mm256_or_ps(min, isnan); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return _mm256_min_ps(max, _mm256_max_ps(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return _mm256_min_ps(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return _mm256_max_ps(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_ps(a, b); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_ps(a, b); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_ps(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const float* src, float* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i)); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + 
const Vectorized& b, + const Vectorized& c) { + return _mm256_fmadd_ps(a, b, c); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fnmadd_ps(a, b, c); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fmsub_ps(a, b, c); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm256_fnmsub_ps(a, b, c); +} + +// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) +// Used by Inductor CPP codegen for micro gemm +inline void transpose_block(at::vec::VectorizedN& input) { + __m256 temp0[8]; + // unpacking and interleaving 32-bit elements + // a0 b0 a1 b1 a4 b4 a5 b5 + // a2 b2 a3 b3 a6 b6 a7 b7 + // c0 d0 c1 d1 ... + // c2 d2 c3 d3 ... + // e0 f0 e1 f1 ... + // e2 f2 e3 f3 ... + // g0 h0 g1 h1 ... + // g2 h2 g3 h3 ... + temp0[0] = _mm256_unpacklo_ps(input[0], input[1]); + temp0[1] = _mm256_unpackhi_ps(input[0], input[1]); + temp0[2] = _mm256_unpacklo_ps(input[2], input[3]); + temp0[3] = _mm256_unpackhi_ps(input[2], input[3]); + temp0[4] = _mm256_unpacklo_ps(input[4], input[5]); + temp0[5] = _mm256_unpackhi_ps(input[4], input[5]); + temp0[6] = _mm256_unpacklo_ps(input[6], input[7]); + temp0[7] = _mm256_unpackhi_ps(input[6], input[7]); + + __m256 temp1[8]; + // unpacking and interleaving 64-bit elements + // a0 b0 c0 d0 a4 b4 c4 d4 + // a1 b1 c1 d1 ... + // a2 b2 c2 d2 ... + // a3 b3 c3 d3 ... + // e0 f0 g0 h0 e4 f4 g4 h4 + // e1 f1 g1 h1 ... + // e2 f2 g2 h2 ... + // e3 f3 g3 h3 ... 
+ temp1[0] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); + temp1[1] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); + temp1[2] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3]))); + temp1[3] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3]))); + temp1[4] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6]))); + temp1[5] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6]))); + temp1[6] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); + temp1[7] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); + + // shuffle 128-bits (composed of 4 32-bit elements) + // a0 b0 c0 d0 e0 f0 g0 h0 + // a1 b1 c1 d1 ... + // a2 b2 c2 d2 ... + // a3 b3 c3 d3 ... + // a4 b4 c4 d4 ... + // a5 b5 c5 d5 ... + // a6 b6 c6 d6 ... + // a7 b7 c7 d7 ... 
+ input[0] = _mm256_permute2f128_ps(temp1[0], temp1[4], 0x20); + input[1] = _mm256_permute2f128_ps(temp1[1], temp1[5], 0x20); + input[2] = _mm256_permute2f128_ps(temp1[2], temp1[6], 0x20); + input[3] = _mm256_permute2f128_ps(temp1[3], temp1[7], 0x20); + input[4] = _mm256_permute2f128_ps(temp1[0], temp1[4], 0x31); + input[5] = _mm256_permute2f128_ps(temp1[1], temp1[5], 0x31); + input[6] = _mm256_permute2f128_ps(temp1[2], temp1[6], 0x31); + input[7] = _mm256_permute2f128_ps(temp1[3], temp1[7], 0x31); +} + +// Used by Inductor CPP codegen +template <> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst) { + // load from src to registers + at::vec::VectorizedN input; + // a: a0 a1 a2 a3 a4 a5 a6 a7 + // b: b0 b1 b2 b3 b4 b5 b6 b7 + // c: c0 c1 c2 c3 c4 c5 c6 c7 + // d: d0 d1 d2 d3 d4 d5 d6 d7 + // e: e0 e1 e2 e3 e4 e5 e6 e7 + // f: f0 f1 f2 f3 f4 f5 f6 f7 + // g: g0 g1 g2 g3 g4 g5 g6 g7 + // h: h0 h1 h2 h3 h4 h5 h6 h7 + int i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i < 8; i++) { + input[i] = _mm256_loadu_ps(&src[i * ld_src]); + } + + transpose_block(input); + + // store from registers to dst +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i < 8; i++) { + _mm256_storeu_ps(&dst[i * ld_dst], input[i]); + } +} + +template <> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst); + transpose_mxn(src + 8, ld_src, dst + 8 * ld_dst, ld_dst); + transpose_mxn(src + 8 * ld_src, ld_src, dst + 8, ld_dst); + transpose_mxn( + src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst); +} +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_half.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_half.h new file mode 100644 index 0000000000000000000000000000000000000000..e5d95b014801a22c7eec6b9295baa51a66f0fd2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_half.h @@ -0,0 +1,285 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#ifdef CPU_CAPABILITY_AVX2 + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: + using Vectorized16::Vectorized16; + + using value_type = Half; + + Vectorized frac() const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_mul_ps(x, y); + }); +} +Vectorized 
inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_si256(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_si256(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_si256(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + cvtfp16_fp32(__m256i(a), a_lo, a_hi); + cvtfp16_fp32(__m256i(b), b_lo, b_hi); + auto max_lo = _mm256_max_ps(a_lo, b_lo); + auto max_hi = _mm256_max_ps(a_hi, b_hi); + auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. 
+ auto o1 = _mm256_or_ps(max_lo, nan_lo); + auto o2 = _mm256_or_ps(max_hi, nan_hi); + return cvtfp32_fp16(o1, o2); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + cvtfp16_fp32(__m256i(a), a_lo, a_hi); + cvtfp16_fp32(__m256i(b), b_lo, b_hi); + auto min_lo = _mm256_min_ps(a_lo, b_lo); + auto min_hi = _mm256_min_ps(a_hi, b_hi); + auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q); + // Exploit the fact that all-ones is a NaN. + auto o1 = _mm256_or_ps(min_lo, nan_lo); + auto o2 = _mm256_or_ps(min_hi, nan_hi); + return cvtfp32_fp16(o1, o2); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + __m256 a_lo, a_hi; + __m256 min_lo, min_hi; + __m256 max_lo, max_hi; + cvtfp16_fp32(__m256i(a), a_lo, a_hi); + cvtfp16_fp32(__m256i(min), min_lo, min_hi); + cvtfp16_fp32(__m256i(max), max_lo, max_hi); + auto o1 = _mm256_min_ps(max_lo, _mm256_max_ps(min_lo, a_lo)); + auto o2 = _mm256_min_ps(max_hi, _mm256_max_ps(min_hi, a_hi)); + return cvtfp32_fp16(o1, o2); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + __m256 a_lo, a_hi; + __m256 max_lo, max_hi; + cvtfp16_fp32(__m256i(a), a_lo, a_hi); + cvtfp16_fp32(__m256i(max), max_lo, max_hi); + auto o1 = _mm256_min_ps(max_lo, a_lo); + auto o2 = _mm256_min_ps(max_hi, a_hi); + return cvtfp32_fp16(o1, o2); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + __m256 a_lo, a_hi; + __m256 min_lo, min_hi; + cvtfp16_fp32(__m256i(a), a_lo, a_hi); + cvtfp16_fp32(__m256i(min), min_lo, min_hi); + auto o1 = _mm256_max_ps(min_lo, a_lo); + auto o2 = _mm256_max_ps(min_hi, a_hi); + return cvtfp32_fp16(o1, o2); +} + +template <> +inline void 
convert(const Half* src, Half* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); + _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +inline void convert(const float* src, Half* dst, int64_t n) { + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m256 a = _mm256_loadu_ps(&src[i]); + __m256 b = _mm256_loadu_ps(&src[i + 8]); + + __m256i c = cvtfp32_fp16(a, b); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), c); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +inline void convert(const double* src, Half* dst, int64_t n) { + auto load_float = [](const double* src) -> __m256 { + // Load one float vector from an array of doubles + __m128 a = _mm256_cvtpd_ps(_mm256_loadu_pd(src)); + __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); + return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1); + }; + + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m256 a = load_float(&src[i]); + __m256 b = load_float(&src[i + 8]); + + __m256i c = cvtfp32_fp16(a, b); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), c); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + __m256 a_lo, a_hi; + __m256 b_lo, b_hi; + __m256 c_lo, c_hi; + cvtfp16_fp32(__m256i(a), a_lo, a_hi); + cvtfp16_fp32(__m256i(b), b_lo, b_hi); + cvtfp16_fp32(__m256i(c), c_lo, c_hi); + auto o1 = _mm256_fmadd_ps(a_lo, b_lo, c_lo); + auto o2 = _mm256_fmadd_ps(a_hi, b_hi, c_hi); + return cvtfp32_fp16(o1, o2); +} + +CONVERT_VECTORIZED_INIT(Half, half) 
+LOAD_FP32_VECTORIZED_INIT(Half, fp16) + +#else // defined(CPU_CAPABILITY_AVX2) + +#if !( \ + defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE256)) +CONVERT_NON_VECTORIZED_INIT(Half, half) +#endif + +LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) +#endif // defined(CPU_CAPABILITY_AVX2) +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h new file mode 100644 index 0000000000000000000000000000000000000000..bb2866dfc45192365a6d31495ccfdfe9fe5c1a98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h @@ -0,0 +1,2327 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#ifdef CPU_CAPABILITY_AVX2 + +struct Vectorizedi { + protected: + __m256i values; + + static inline __m256i invert(const __m256i& v) { + const auto ones = _mm256_set1_epi64x(-1); + return _mm256_xor_si256(ones, v); + } + + public: + Vectorizedi() { + values = _mm256_setzero_si256(); + } + Vectorizedi(__m256i v) : values(v) {} + operator __m256i() const { + return values; + } +}; + +#else + +struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined + +#endif // CPU_CAPABILITY_AVX2 + +#ifdef CPU_CAPABILITY_AVX2 + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: + using value_type = int64_t; + using size_type = int; + static constexpr size_type size() { + return 4; + } + using Vectorizedi::Vectorizedi; + Vectorized() { + values = _mm256_setzero_si256(); + } + Vectorized(int64_t v) { + values = _mm256_set1_epi64x(v); + } + Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { + values = _mm256_setr_epi64x(val1, val2, val3, val4); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + __at_align__ int64_t tmp_values[size()]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi64(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi64(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi64(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi64(b.values, 3); + return loadu(tmp_values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int64_t base = 0, + step_t step = static_cast(1)) { + return 
Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ int64_t tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to one using "={1}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 1; + } + std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. 
See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else if (count > 0) { + __at_align__ int64_t tmp_values[size()]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); + } + } + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + Vectorized abs() const { + auto zero = _mm256_set1_epi64x(0); + auto is_larger = _mm256_cmpgt_epi64(zero, values); + auto inverse = _mm256_xor_si256(values, is_larger); + return _mm256_sub_epi64(inverse, is_larger); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm256_set1_epi64x(0); + } + Vectorized conj() const { + return *this; + } + Vectorized neg() const; + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmpeq_epi64(values, other.values); + } + Vectorized operator!=(const Vectorized& other) const { + return invert(_mm256_cmpeq_epi64(values, other.values)); + } + Vectorized operator<(const Vectorized& other) const { + return _mm256_cmpgt_epi64(other.values, values); + } + Vectorized operator<=(const Vectorized& other) const { + return invert(_mm256_cmpgt_epi64(values, other.values)); + } + Vectorized operator>(const Vectorized& other) const { + return _mm256_cmpgt_epi64(values, other.values); + } + Vectorized operator>=(const Vectorized& other) const { + return invert(_mm256_cmpgt_epi64(other.values, values)); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const 
Vectorized& other) const; +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: + using value_type = int32_t; + static constexpr int size() { + return 8; + } + using Vectorizedi::Vectorizedi; + Vectorized() {} + Vectorized(int32_t v) { + values = _mm256_set1_epi32(v); + } + Vectorized( + int32_t val1, + int32_t val2, + int32_t val3, + int32_t val4, + int32_t val5, + int32_t val6, + int32_t val7, + int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm256_blend_epi32(a, b, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int32_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int32_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vectorized loadu(const void* ptr, int32_t count) { + __at_align__ int32_t tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. 
We do + // not initialize arrays to one using "={1}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 1; + } + std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else if (count > 0) { + __at_align__ int32_t tmp_values[size()]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); + } + } + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; + Vectorized abs() const { + return _mm256_abs_epi32(values); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm256_set1_epi32(0); + } + Vectorized conj() const { + return *this; + } + Vectorized neg() const; + int32_t reduce_add() const { + auto v = values; + // 128-bit shuffle + auto v1 = _mm256_permute2f128_si256(v, v, 0x1); + v = _mm256_add_epi32(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_epi32(v, 0x4E); + v = _mm256_add_epi32(v, v1); + // 32-bit shuffle + v1 = _mm256_shuffle_epi32(v, 0xB1); + v = _mm256_add_epi32(v, v1); + __m128i lo = _mm256_castsi256_si128(v); + return _mm_cvtsi128_si32(lo); + } + int32_t reduce_max() const { + auto v = values; + // 128-bit shuffle + auto v1 = _mm256_permute2f128_si256(v, v, 0x1); + v = _mm256_max_epi32(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_epi32(v, 0x4E); + v = _mm256_max_epi32(v, v1); + // 32-bit shuffle + v1 = 
_mm256_shuffle_epi32(v, 0xB1); + v = _mm256_max_epi32(v, v1); + __m128i lo = _mm256_castsi256_si128(v); + return _mm_cvtsi128_si32(lo); + } + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmpeq_epi32(values, other.values); + } + Vectorized operator!=(const Vectorized& other) const { + return invert(_mm256_cmpeq_epi32(values, other.values)); + } + Vectorized operator<(const Vectorized& other) const { + return _mm256_cmpgt_epi32(other.values, values); + } + Vectorized operator<=(const Vectorized& other) const { + return invert(_mm256_cmpgt_epi32(values, other.values)); + } + Vectorized operator>(const Vectorized& other) const { + return _mm256_cmpgt_epi32(values, other.values); + } + Vectorized operator>=(const Vectorized& other) const { + return invert(_mm256_cmpgt_epi32(other.values, values)); + } + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; + // int32_t and float have same size +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_vec = + _mm256_loadu_si256(reinterpret_cast(src + i)); + auto output_vec = _mm256_cvtepi32_ps(input_vec); + _mm256_storeu_ps(reinterpret_cast(dst + i), output_vec); + } +#ifndef _MSC_VER +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int32_t* src, double* dst, int64_t n) { + int64_t i; + // int32_t has half the size of double +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_128_vec = + _mm_loadu_si128(reinterpret_cast(src + i)); + auto output_vec 
= _mm256_cvtepi32_pd(input_128_vec); + _mm256_storeu_pd(reinterpret_cast(dst + i), output_vec); + } +#ifndef _MSC_VER +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: + using value_type = int16_t; + static constexpr int size() { + return 16; + } + using Vectorizedi::Vectorizedi; + Vectorized() {} + Vectorized(int16_t v) { + values = _mm256_set1_epi16(v); + } + Vectorized( + int16_t val1, + int16_t val2, + int16_t val3, + int16_t val4, + int16_t val5, + int16_t val6, + int16_t val7, + int16_t val8, + int16_t val9, + int16_t val10, + int16_t val11, + int16_t val12, + int16_t val13, + int16_t val14, + int16_t val15, + int16_t val16) { + values = _mm256_setr_epi16( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + __at_align__ int16_t tmp_values[size()]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi16(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi16(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi16(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi16(b.values, 3); + if (mask & 0x10) + tmp_values[4] = _mm256_extract_epi16(b.values, 4); + if (mask & 0x20) + tmp_values[5] = _mm256_extract_epi16(b.values, 5); + if (mask & 0x40) + tmp_values[6] = _mm256_extract_epi16(b.values, 6); + if (mask & 0x80) + tmp_values[7] = _mm256_extract_epi16(b.values, 7); + if (mask & 0x100) + tmp_values[8] = _mm256_extract_epi16(b.values, 8); + if (mask & 0x200) + tmp_values[9] = _mm256_extract_epi16(b.values, 9); + if (mask & 0x400) + tmp_values[10] = _mm256_extract_epi16(b.values, 10); + if (mask & 0x800) + 
tmp_values[11] = _mm256_extract_epi16(b.values, 11); + if (mask & 0x1000) + tmp_values[12] = _mm256_extract_epi16(b.values, 12); + if (mask & 0x2000) + tmp_values[13] = _mm256_extract_epi16(b.values, 13); + if (mask & 0x4000) + tmp_values[14] = _mm256_extract_epi16(b.values, 14); + if (mask & 0x8000) + tmp_values[15] = _mm256_extract_epi16(b.values, 15); + return loadu(tmp_values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int16_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vectorized loadu(const void* ptr, int16_t count) { + __at_align__ int16_t tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // 
https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to one using "={1}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 1; + } + std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else if (count > 0) { + __at_align__ int16_t tmp_values[size()]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); + } + } + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; + Vectorized abs() const { + return _mm256_abs_epi16(values); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm256_set1_epi16(0); + } + Vectorized conj() const { + return *this; + } + Vectorized neg() const; + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmpeq_epi16(values, other.values); + } + Vectorized operator!=(const Vectorized& other) const { + return invert(_mm256_cmpeq_epi16(values, other.values)); + } + Vectorized operator<(const Vectorized& other) const { + return _mm256_cmpgt_epi16(other.values, values); + } + Vectorized operator<=(const Vectorized& other) const { + return invert(_mm256_cmpgt_epi16(values, other.values)); + } + Vectorized operator>(const Vectorized& other) const { + return _mm256_cmpgt_epi16(values, other.values); + } + Vectorized operator>=(const Vectorized& 
other) const { + return invert(_mm256_cmpgt_epi16(other.values, values)); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template +class Vectorized8 : public Vectorizedi { + static_assert( + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + + protected: + static const Vectorized ones; + + public: + using value_type = T; + static constexpr int size() { + return 32; + } + using Vectorizedi::Vectorizedi; + Vectorized8() {} + Vectorized8(T v) { + values = _mm256_set1_epi8(v); + } + Vectorized8( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32) { + values = _mm256_setr_epi8( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16, + val17, + val18, + val19, + val20, + val21, + val22, + val23, + val24, + val25, + val26, + val27, + val28, + val29, + val30, + val31, + val32); + } + template + static Vectorized blend(Vectorized a, Vectorized b) { + __at_align__ T tmp_values[size()]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi8(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi8(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi8(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi8(b.values, 3); + if (mask & 0x10) + tmp_values[4] = _mm256_extract_epi8(b.values, 4); + if (mask & 0x20) + tmp_values[5] = _mm256_extract_epi8(b.values, 5); + if (mask & 
0x40) + tmp_values[6] = _mm256_extract_epi8(b.values, 6); + if (mask & 0x80) + tmp_values[7] = _mm256_extract_epi8(b.values, 7); + if (mask & 0x100) + tmp_values[8] = _mm256_extract_epi8(b.values, 8); + if (mask & 0x200) + tmp_values[9] = _mm256_extract_epi8(b.values, 9); + if (mask & 0x400) + tmp_values[10] = _mm256_extract_epi8(b.values, 10); + if (mask & 0x800) + tmp_values[11] = _mm256_extract_epi8(b.values, 11); + if (mask & 0x1000) + tmp_values[12] = _mm256_extract_epi8(b.values, 12); + if (mask & 0x2000) + tmp_values[13] = _mm256_extract_epi8(b.values, 13); + if (mask & 0x4000) + tmp_values[14] = _mm256_extract_epi8(b.values, 14); + if (mask & 0x8000) + tmp_values[15] = _mm256_extract_epi8(b.values, 15); + if (mask & 0x010000) + tmp_values[16] = _mm256_extract_epi8(b.values, 16); + if (mask & 0x020000) + tmp_values[17] = _mm256_extract_epi8(b.values, 17); + if (mask & 0x040000) + tmp_values[18] = _mm256_extract_epi8(b.values, 18); + if (mask & 0x080000) + tmp_values[19] = _mm256_extract_epi8(b.values, 19); + if (mask & 0x100000) + tmp_values[20] = _mm256_extract_epi8(b.values, 20); + if (mask & 0x200000) + tmp_values[21] = _mm256_extract_epi8(b.values, 21); + if (mask & 0x400000) + tmp_values[22] = _mm256_extract_epi8(b.values, 22); + if (mask & 0x800000) + tmp_values[23] = _mm256_extract_epi8(b.values, 23); + if (mask & 0x1000000) + tmp_values[24] = _mm256_extract_epi8(b.values, 24); + if (mask & 0x2000000) + tmp_values[25] = _mm256_extract_epi8(b.values, 25); + if (mask & 0x4000000) + tmp_values[26] = _mm256_extract_epi8(b.values, 26); + if (mask & 0x8000000) + tmp_values[27] = _mm256_extract_epi8(b.values, 27); + if (mask & 0x10000000) + tmp_values[28] = _mm256_extract_epi8(b.values, 28); + if (mask & 0x20000000) + tmp_values[29] = _mm256_extract_epi8(b.values, 29); + if (mask & 0x40000000) + tmp_values[30] = _mm256_extract_epi8(b.values, 30); + if (mask & 0x80000000) + tmp_values[31] = _mm256_extract_epi8(b.values, 31); + return loadu(tmp_values); + } + 
static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set(Vectorized a, Vectorized b, T count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<0x1>(a, b); + case 2: + return blend<0x3>(a, b); + case 3: + return blend<0x7>(a, b); + case 4: + return blend<0xF>(a, b); + case 5: + return blend<0x1F>(a, b); + case 6: + return blend<0x3F>(a, b); + case 7: + return blend<0x7F>(a, b); + case 8: + return blend<0xFF>(a, b); + case 9: + return blend<0x1FF>(a, b); + case 10: + return blend<0x3FF>(a, b); + case 11: + return blend<0x7FF>(a, b); + case 12: + return blend<0xFFF>(a, b); + case 13: + return blend<0x1FFF>(a, b); + case 14: + return blend<0x3FFF>(a, b); + case 15: + return blend<0x7FFF>(a, b); + case 16: + return blend<0xFFFF>(a, b); + case 17: + return blend<0x1FFFF>(a, b); + case 18: + return blend<0x3FFFF>(a, b); + case 19: + return blend<0x7FFFF>(a, b); + case 20: + return blend<0xFFFFF>(a, b); + case 21: + return blend<0x1FFFFF>(a, b); + case 22: + return blend<0x3FFFFF>(a, b); + case 23: + return blend<0x7FFFFF>(a, b); + case 24: + return blend<0xFFFFFF>(a, b); + case 25: + return 
blend<0x1FFFFFF>(a, b); + case 26: + return blend<0x3FFFFFF>(a, b); + case 27: + return blend<0x7FFFFFF>(a, b); + case 28: + return blend<0xFFFFFFF>(a, b); + case 29: + return blend<0x1FFFFFFF>(a, b); + case 30: + return blend<0x3FFFFFFF>(a, b); + case 31: + return blend<0x7FFFFFFF>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vectorized loadu_one_fourth(const void* ptr) { + // Fast path if only load element number of 8. + // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), + // Because loadu(const void* ptr, T count) requires zero initialization for + // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 + // bits of the result are undefined. + // TODO We can use _mm256_zextsi128_si256 in the future, + // since gcc 9.3 doesn't support it now. + __m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); + return _mm256_castsi128_si256(input_128); + } + static Vectorized loadu(const void* ptr, T count) { + __at_align__ T tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to one using "={1}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 1; + } + std::memcpy(tmp_values, ptr, count * sizeof(T)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. 
See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else if (count > 0) { + if (count == 8) { + // Fast path if only store element number of 8 + _mm_storel_epi64( + reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values)); + } else { + __at_align__ T tmp_values[size()]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(T)); + } + } + } + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm256_set1_epi8(0); + } + Vectorized conj() const { + return *this; + } +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + Vectorized neg() const; + + Vectorized abs() const { + return _mm256_abs_epi8(values); + } + + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmpeq_epi8(values, other.values); + } + Vectorized operator!=(const Vectorized& other) const { + return invert(_mm256_cmpeq_epi8(values, other.values)); + } + Vectorized operator<(const Vectorized& other) const { + return _mm256_cmpgt_epi8(other.values, values); + } + Vectorized operator<=(const Vectorized& other) const { + return invert(_mm256_cmpgt_epi8(values, other.values)); + } + Vectorized operator>(const Vectorized& other) const { + return other < *this; + } + Vectorized operator>=(const Vectorized& other) const { + return other <= *this; + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized 
ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + Vectorized neg() const; + + Vectorized abs() const { + return *this; + } + + Vectorized operator==(const Vectorized& other) const { + return _mm256_cmpeq_epi8(values, other.values); + } + Vectorized operator!=(const Vectorized& other) const { + return invert(_mm256_cmpeq_epi8(values, other.values)); + } + Vectorized operator<(const Vectorized& other) const { + __m256i max = _mm256_max_epu8(values, other.values); + return invert(_mm256_cmpeq_epi8(max, values)); + } + Vectorized operator<=(const Vectorized& other) const { + __m256i max = _mm256_max_epu8(values, other.values); + return _mm256_cmpeq_epi8(max, other.values); + } + Vectorized operator>(const Vectorized& other) const { + return other < *this; + } + Vectorized operator>=(const Vectorized& other) const { + return other <= *this; + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_epi64(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_epi32(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_epi16(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_epi8(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const 
Vectorized& b) { + return _mm256_add_epi8(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_epi64(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_epi32(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_epi16(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_epi8(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sub_epi8(a, b); +} + +// Negation. Defined here so we can utilize operator- +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +// Emulate operations with no native 64-bit support in avx, +// by extracting each element, performing the operation pointwise, +// then combining the results into a vector. 
+template +Vectorized inline emulate( + const Vectorized& a, + const Vectorized& b, + const op_t& op) { + int64_t a0 = _mm256_extract_epi64(a, 0); + int64_t a1 = _mm256_extract_epi64(a, 1); + int64_t a2 = _mm256_extract_epi64(a, 2); + int64_t a3 = _mm256_extract_epi64(a, 3); + + int64_t b0 = _mm256_extract_epi64(b, 0); + int64_t b1 = _mm256_extract_epi64(b, 1); + int64_t b2 = _mm256_extract_epi64(b, 2); + int64_t b3 = _mm256_extract_epi64(b, 3); + + int64_t c0 = op(a0, b0); + int64_t c1 = op(a1, b1); + int64_t c2 = op(a2, b2); + int64_t c3 = op(a3, b3); + + return _mm256_set_epi64x(c3, c2, c1, c0); +} + +template +Vectorized inline emulate( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c, + const op_t& op) { + int64_t a0 = _mm256_extract_epi64(a, 0); + int64_t a1 = _mm256_extract_epi64(a, 1); + int64_t a2 = _mm256_extract_epi64(a, 2); + int64_t a3 = _mm256_extract_epi64(a, 3); + + int64_t b0 = _mm256_extract_epi64(b, 0); + int64_t b1 = _mm256_extract_epi64(b, 1); + int64_t b2 = _mm256_extract_epi64(b, 2); + int64_t b3 = _mm256_extract_epi64(b, 3); + + int64_t c0 = _mm256_extract_epi64(c, 0); + int64_t c1 = _mm256_extract_epi64(c, 1); + int64_t c2 = _mm256_extract_epi64(c, 2); + int64_t c3 = _mm256_extract_epi64(c, 3); + + int64_t d0 = op(a0, b0, c0); + int64_t d1 = op(a1, b1, c1); + int64_t d2 = op(a2, b2, c2); + int64_t d3 = op(a3, b3, c3); + + return _mm256_set_epi64x(d3, d2, d1, d0); +} + +// AVX2 has no intrinsic for int64_t multiply so it needs to be emulated +// This could be implemented more efficiently using epi32 instructions +// This is also technically avx compatible, but then we'll need AVX +// code for add as well. +// Note: intentionally ignores undefined behavior like (-lowest * -1). 
+template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return emulate( + a, b, [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ { + return a_point * b_point; + }); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm256_mullo_epi32(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm256_mullo_epi16(a, b); +} + +template +Vectorized inline int_elementwise_binary_256( + const Vectorized& a, + const Vectorized& b, + Op op) { + T values_a[Vectorized::size()]; + T values_b[Vectorized::size()]; + a.store(values_a); + b.store(values_b); + for (int i = 0; i != Vectorized::size(); i++) { + values_a[i] = op(values_a[i], values_b[i]); + } + return Vectorized::loadu(values_a); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + // We don't have an instruction for multiplying int8_t +#ifndef CPU_CAPABILITY_AVX2 + return int_elementwise_binary_256(a, b, std::multiplies()); +#else + __m256i mask00FF = _mm256_set1_epi16(0x00FF); + __m256i a_lo = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8); + __m256i b_lo = _mm256_srai_epi16(_mm256_slli_epi16(b, 8), 8); + __m256i a_hi = _mm256_srai_epi16(a, 8); + __m256i b_hi = _mm256_srai_epi16(b, 8); + __m256i res_lo = _mm256_and_si256(_mm256_mullo_epi16(a_lo, b_lo), mask00FF); + __m256i res_hi = _mm256_slli_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8); + __m256i res = _mm256_or_si256(res_hi, res_lo); + return res; +#endif +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + // We don't have an instruction for multiplying uint8_t +#ifndef CPU_CAPABILITY_AVX2 + return int_elementwise_binary_256(a, b, std::multiplies()); +#else + __m256i mask00FF = _mm256_set1_epi16(0x00FF); + __m256i a_lo = _mm256_and_si256(a, mask00FF); + __m256i b_lo = _mm256_and_si256(b, mask00FF); + __m256i a_hi = 
_mm256_srli_epi16(a, 8); + __m256i b_hi = _mm256_srli_epi16(b, 8); + __m256i res_lo = _mm256_and_si256(_mm256_mullo_epi16(a_lo, b_lo), mask00FF); + __m256i res_hi = _mm256_slli_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8); + __m256i res = _mm256_or_si256(res_hi, res_lo); + return res; +#endif +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, b, [](int64_t a_point, int64_t b_point) { + return std::min(a_point, b_point); + }); +#else + __m256i cmp = _mm256_cmpgt_epi64(a, b); + return _mm256_blendv_epi8(a, b, cmp); +#endif +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_min_epi32(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_min_epi16(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_min_epi8(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_min_epu8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, b, [](int64_t a_point, int64_t b_point) { + return std::max(a_point, b_point); + }); +#else + __m256i cmp = _mm256_cmpgt_epi64(a, b); + return _mm256_blendv_epi8(b, a, cmp); +#endif +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_max_epi32(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_max_epi16(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_max_epi8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm256_max_epu8(a, b); +} + +template <> +Vectorized inline clamp( + 
const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate( + a, + min_val, + max_val, + [](int64_t a_point, int64_t min_point, int64_t max_point) { + return std::min(max_point, std::max(a_point, min_point)); + }); +#else + return minimum(maximum(a, min_val), max_val); +#endif +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm256_min_epi32(max_val, _mm256_max_epi32(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm256_min_epi16(max_val, _mm256_max_epi16(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm256_min_epi8(max_val, _mm256_max_epi8(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm256_min_epu8(max_val, _mm256_max_epu8(a, min_val)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, max_val, [](int64_t a_point, int64_t max_point) { + return std::min(max_point, a_point); + }); +#else + return minimum(max_val, a); +#endif +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm256_min_epi32(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm256_min_epi16(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm256_min_epi8(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm256_min_epu8(max_val, a); +} + +template <> +Vectorized 
inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, min_val, [](int64_t a_point, int64_t min_point) { + return std::max(min_point, a_point); + }); +#else + return maximum(min_val, a); +#endif +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm256_max_epi32(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm256_max_epi16(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm256_max_epi8(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm256_max_epu8(min_val, a); +} + +template +std::enable_if_t< + !(std::is_same_v || std::is_same_v), + Vectorized< + int32_t>> inline convert_to_int32(const T* ptr, int count = Vectorized::size()) { + return Vectorized::loadu(ptr, count); +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const int8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm256_cvtepi8_epi32( + _mm_loadl_epi64(reinterpret_cast(ptr))); + } else { + auto a = Vectorized::loadu(ptr, count); + return _mm256_cvtepi8_epi32(_mm256_castsi256_si128(a)); + } +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const uint8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast(ptr))); + } else { + auto a = Vectorized::loadu(ptr, count); + return _mm256_cvtepu8_epi32(_mm256_castsi256_si128(a)); + } +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + 
return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} + +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return _mm256_and_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return _mm256_or_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return _mm256_xor_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator~(const Vectorized& a) { + return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= 
other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < 
other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template +Vectorized inline shift_256_16( + const Vectorized& a, + const Vectorized& b) { + // No vector instruction for shifting int16_t, so emulating it instead. + + // Control masks for shuffle operation, treating 256 bits as an + // array of 16-bit elements, and considering pairs of neighboring + // elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and + // M!=N) is set so that shuffle will move element with index M from + // input pair into element with index N in output pair, and element + // with index M in output pair will be set to all 0s. 
+ __m256i ctl_0_1 = _mm256_set_epi8( + 29, + 28, + 0x80, + 0x80, + 25, + 24, + 0x80, + 0x80, + 21, + 20, + 0x80, + 0x80, + 17, + 16, + 0x80, + 0x80, + 13, + 12, + 0x80, + 0x80, + 9, + 8, + 0x80, + 0x80, + 5, + 4, + 0x80, + 0x80, + 1, + 0, + 0x80, + 0x80); + __m256i ctl_1_0 = _mm256_set_epi8( + 0x80, + 0x80, + 31, + 30, + 0x80, + 0x80, + 27, + 26, + 0x80, + 0x80, + 23, + 22, + 0x80, + 0x80, + 19, + 18, + 0x80, + 0x80, + 15, + 14, + 0x80, + 0x80, + 11, + 10, + 0x80, + 0x80, + 7, + 6, + 0x80, + 0x80, + 3, + 2); + + // Masks for bitwise and operation, treating 256 bits as an array of + // 16-bit elements, and considering them in pairs of neighboring + // elements. A mask named "keep_M" (M in [0,1]) is set so that + // bitwise and will copy element with index M from input pair into + // element with the same index in output pair, while the other + // element in output pair will be set to all 0s. + __m256i keep_0 = _mm256_set1_epi32(0xFFFF); + __m256i keep_1 = _mm256_set1_epi32(0xFFFF0000); + + // Take each 16-bit element with idx%2==0 from input array to be + // shifted and extend it to 32 bits so that 0s are added to the + // right. Then, perform shifting on this 32-bit number. Upper 16 + // bits will be proper result of shifting original 16-bit number, so + // write them to result array, into the same position from which + // corresponding input element is taken. Also, make sure that + // result array elements with idx%2!=0 are set to all 0s. + // + // Note that number of bits to shift for is extended to 32 bits by + // adding 0s to the left. That means this number is not properly + // sign-extended for negative values. However, number of bits to + // shift is treated as an unsigned integer by respective shift + // intrinsics anyway so if negative then either with or without + // proper sign extension, it will be interpreted as a number greater + // than 32, and the shifting result will be the same. 
+ __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_1); + __m256i b0 = _mm256_and_si256(b, keep_0); + __m256i c0; + if (left_shift) + c0 = _mm256_sllv_epi32(a0, b0); + else + c0 = _mm256_srav_epi32(a0, b0); + c0 = _mm256_shuffle_epi8(c0, ctl_1_0); + + // Perform shifting the same way for input array elements with + // idx%2==1. + __m256i a1 = _mm256_and_si256(a, keep_1); + __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); + __m256i c1; + if (left_shift) + c1 = _mm256_sllv_epi32(a1, b1); + else + c1 = _mm256_srav_epi32(a1, b1); + c1 = _mm256_and_si256(c1, keep_1); + + // Merge partial results into the final result. + __m256i c = _mm256_or_si256(c0, c1); + + return c; +} + +template < + bool left_shift, + typename T, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + int> = 0> +Vectorized inline shift_256_8( + const Vectorized& a, + const Vectorized& b) { + // No vector instruction for shifting int8_t/uint8_t, so emulating + // it instead. + + // Control masks for shuffle operation, treating 256 bits as an + // array of 8-bit elements, and considering quadruples of + // neighboring elements. Specifically, a mask named "ctl_M_N" (M,N + // in [0,1,2,3], and M!=N) is set so that shuffle will move element + // with index M from input quadruple into element with index N in + // output quadruple, and other elements in output quadruple will be + // set to all 0s. 
+ __m256i ctl_0_3 = _mm256_set_epi8( + 28, + 0x80, + 0x80, + 0x80, + 24, + 0x80, + 0x80, + 0x80, + 20, + 0x80, + 0x80, + 0x80, + 16, + 0x80, + 0x80, + 0x80, + 12, + 0x80, + 0x80, + 0x80, + 8, + 0x80, + 0x80, + 0x80, + 4, + 0x80, + 0x80, + 0x80, + 0, + 0x80, + 0x80, + 0x80); + __m256i ctl_1_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 29, + 0x80, + 0x80, + 0x80, + 25, + 0x80, + 0x80, + 0x80, + 21, + 0x80, + 0x80, + 0x80, + 17, + 0x80, + 0x80, + 0x80, + 13, + 0x80, + 0x80, + 0x80, + 9, + 0x80, + 0x80, + 0x80, + 5, + 0x80, + 0x80, + 0x80, + 1); + __m256i ctl_1_3 = _mm256_set_epi8( + 29, + 0x80, + 0x80, + 0x80, + 25, + 0x80, + 0x80, + 0x80, + 21, + 0x80, + 0x80, + 0x80, + 17, + 0x80, + 0x80, + 0x80, + 13, + 0x80, + 0x80, + 0x80, + 9, + 0x80, + 0x80, + 0x80, + 5, + 0x80, + 0x80, + 0x80, + 1, + 0x80, + 0x80, + 0x80); + __m256i ctl_2_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 30, + 0x80, + 0x80, + 0x80, + 26, + 0x80, + 0x80, + 0x80, + 22, + 0x80, + 0x80, + 0x80, + 18, + 0x80, + 0x80, + 0x80, + 14, + 0x80, + 0x80, + 0x80, + 10, + 0x80, + 0x80, + 0x80, + 6, + 0x80, + 0x80, + 0x80, + 2); + __m256i ctl_2_3 = _mm256_set_epi8( + 30, + 0x80, + 0x80, + 0x80, + 26, + 0x80, + 0x80, + 0x80, + 22, + 0x80, + 0x80, + 0x80, + 18, + 0x80, + 0x80, + 0x80, + 14, + 0x80, + 0x80, + 0x80, + 10, + 0x80, + 0x80, + 0x80, + 6, + 0x80, + 0x80, + 0x80, + 2, + 0x80, + 0x80, + 0x80); + __m256i ctl_3_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3); + __m256i ctl_3_1 = _mm256_set_epi8( + 0x80, + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3, + 0x80); + __m256i ctl_3_2 = _mm256_set_epi8( + 0x80, + 31, + 0x80, + 0x80, + 
0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3, + 0x80, + 0x80); + + // Masks for bitwise and operation, treating 256 bits as an array of + // 8-bit elements, and considering them in quadruples of neighboring + // elements. A mask named "keep_M" (M in [0,1,2,3]) is set so that + // bitwise and will copy element with index M from input quadruple + // into element with the same index in output quadruple, while the + // other elements in output quadruple will be set to all 0s. + __m256i keep_0 = _mm256_set1_epi32(0xFF); + __m256i keep_3 = _mm256_set1_epi32(0xFF000000); + + // Take each 8-bit element with idx%4==0 from input array to be + // shifted and extend it to 32 bits so that 0s are added to the + // right. Then, perform shifting on this 32-bit number. Upper 8 + // bits will be proper result of shifting original 8-bit number, so + // write them to result array, into the same position from which + // corresponding input element is taken. Also, make sure that + // result array elements with idx%4!=0 are set to all 0s. + // + // Note that number of bits to shift for is extended to 32 bits by + // adding 0s to the left. That means this number is not properly + // sign-extended for negative values. However, number of bits to + // shift is treated as an unsigned integer by respective shift + // intrinsics anyway so if negative then either with or without + // proper sign extension, it will be interpreted as a number greater + // than 32, and the shifting result will be the same. 
+ __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_3); + __m256i b0 = _mm256_and_si256(b, keep_0); + __m256i c0; + if (left_shift) + c0 = _mm256_sllv_epi32(a0, b0); + else if constexpr (std::is_same_v) + c0 = _mm256_srav_epi32(a0, b0); + else + c0 = _mm256_srlv_epi32(a0, b0); + c0 = _mm256_shuffle_epi8(c0, ctl_3_0); + + // Perform shifting the same way for input array elements with + // idx%4==1. + __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); + __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); + __m256i c1; + if (left_shift) + c1 = _mm256_sllv_epi32(a1, b1); + else if constexpr (std::is_same_v) + c1 = _mm256_srav_epi32(a1, b1); + else + c1 = _mm256_srlv_epi32(a1, b1); + c1 = _mm256_shuffle_epi8(c1, ctl_3_1); + + // Perform shifting the same way for input array elements with + // idx%4==2. + __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); + __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); + __m256i c2; + if (left_shift) + c2 = _mm256_sllv_epi32(a2, b2); + else if constexpr (std::is_same_v) + c2 = _mm256_srav_epi32(a2, b2); + else + c2 = _mm256_srlv_epi32(a2, b2); + c2 = _mm256_shuffle_epi8(c2, ctl_3_2); + + // Perform shifting the same way for input array elements with + // idx%4==3. + __m256i a3 = _mm256_and_si256(a, keep_3); + __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); + __m256i c3; + if (left_shift) + c3 = _mm256_sllv_epi32(a3, b3); + else if constexpr (std::is_same_v) + c3 = _mm256_srav_epi32(a3, b3); + else + c3 = _mm256_srlv_epi32(a3, b3); + c3 = _mm256_and_si256(c3, keep_3); + + // Merge partial results into the final result. 
+ __m256i c01 = _mm256_or_si256(c0, c1); + __m256i c23 = _mm256_or_si256(c2, c3); + __m256i c = _mm256_or_si256(c01, c23); + + return c; +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sllv_epi64(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return _mm256_sllv_epi32(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return shift_256_16(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return shift_256_8(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return shift_256_8(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + // No vector instruction for right arithmetic shifting int64_t, so emulating + // it instead. + + // Clamp the shift values such that shift values < 0 and > 64 are changed to + // 64 which results in -1 for negative input and 0 for non-negative input. + __m256i zero = _mm256_set1_epi64x(0); + __m256i max_shift = _mm256_set1_epi64x(64); + __m256i mask = _mm256_or_si256( + _mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift)); + __m256i shift = _mm256_blendv_epi8(b, max_shift, mask); + // Shift the number logically to the right, thus filling the most + // significant bits with 0s. Then, replace these bits with the sign + // bit. 
+ __m256i sign_bits = _mm256_cmpgt_epi64(zero, a); + __m256i sign_shift = _mm256_sub_epi64(max_shift, shift); + __m256i sign_ext = _mm256_sllv_epi64(sign_bits, sign_shift); + __m256i c = _mm256_srlv_epi64(a, shift); + c = _mm256_or_si256(c, sign_ext); + + return c; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return _mm256_srav_epi32(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return shift_256_16(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return shift_256_8(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return shift_256_8(a, b); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_mask.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_mask.h new file mode 100644 index 0000000000000000000000000000000000000000..595e0c4946a461bb6cc446d202f2156ef4bfbdc9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_mask.h @@ -0,0 +1,303 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template +struct VecMaskLoad< + T, + dst_n, + mask_t, + mask_n, + typename std::enable_if_t< + (mask_n == dst_n * 2 && dst_n >= 1) && + (std::is_same_v || std::is_same_v), + void>> { + static inline VectorizedN apply( + const T* ptr, + const 
VecMask& vec_mask) { + VectorizedN tmp_vec; + VectorizedN result; + for (int i = 0; i < dst_n; i++) { + tmp_vec[0] = vec_mask[2 * i]; + tmp_vec[1] = vec_mask[2 * i + 1]; + auto int64_mask = VecMask(tmp_vec).template cast(); + auto int_mask = int64_mask.template cast()[0]; + if constexpr (std::is_same_v) { + result[i] = Vectorized( + _mm256_maskload_ps(ptr + i * Vectorized::size(), int_mask)); + } else { + result[i] = Vectorized( + _mm256_maskload_epi32(ptr + i * Vectorized::size(), int_mask)); + } + } + return result; + } +}; + +template +struct VecMaskLoad< + T, + dst_n, + mask_t, + dst_n, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < dst_n; i++) { + auto tmp_mask = VecMask(vec_mask[i]); + auto int_mask = tmp_mask.template cast()[0]; + if constexpr (std::is_same_v) { + result[i] = Vectorized( + _mm256_maskload_ps(ptr + i * Vectorized::size(), int_mask)); + } else { + result[i] = Vectorized( + _mm256_maskload_epi32(ptr + i * Vectorized::size(), int_mask)); + } + } + return result; + } +}; + +template +struct VecMaskLoad< + T, + 2, + mask_t, + 1, + typename std::enable_if_t< + std::is_same_v || std::is_same_v>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + auto int64_mask = vec_mask.template cast(); + auto result = at::vec::VectorizedN(); + if constexpr (std::is_same_v) { + result[0] = _mm256_maskload_pd(ptr, int64_mask[0]); + result[1] = _mm256_maskload_pd( + ptr + at::vec::Vectorized::size(), int64_mask[1]); + } else { + result[0] = _mm256_maskload_epi64( + reinterpret_cast(ptr), int64_mask[0]); + result[1] = _mm256_maskload_epi64( + reinterpret_cast( + ptr + at::vec::Vectorized::size()), + int64_mask[1]); + } + return result; + } +}; + +// TODO: add specialization of VecMaskLoad for bfloat16/half and int8/uint8 + +template +struct 
VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm256_castsi256_ps(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm256_castps_si256(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm256_castpd_si256(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm256_castsi256_pd(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast< + int64_t, + dst_n, + mask_t, + mask_n, + typename std::enable_if_t< + (dst_n == 2 * mask_n) && + (std::is_same_v || std::is_same_v), + void>> { + static inline VecMask apply( + const VecMask& vec_mask) { + VectorizedN result; + auto int_mask = vec_mask.template cast(); +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < mask_n; ++i) { + auto int64_vec = + convert(VectorizedN(int_mask[i])); + result[2 * i] = int64_vec[0]; + result[2 * i + 1] = int64_vec[1]; + } + return VecMask(result); + } +}; + +template +struct VecMaskCast< + dst_t, + dst_n, + int64_t, + mask_n, + typename std::enable_if_t< + (mask_n == 2 * dst_n) && + (std::is_same_v || std::is_same_v), + void>> { + static inline VecMask apply( + const VecMask& vec_mask) { + VectorizedN result; + VectorizedN int64_vec; + for (int i = 0; i < dst_n; ++i) { + int64_vec[0] = vec_mask[2 * i]; + int64_vec[1] = vec_mask[2 * i + 1]; + result[i] = 
convert(int64_vec); + } + return VecMask(result).template cast(); + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int64_mask = VecMaskCast::apply(vec_mask); + return VecMaskCast::apply(int64_mask); + } +}; +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int64_mask = VecMaskCast::apply(vec_mask); + return VecMaskCast::apply(int64_mask); + } +}; + +template <> +inline bool VecMask::all_zero() const { + return _mm256_testz_si256(mask_[0], mask_[0]); +} + +template <> +inline bool VecMask::is_masked(int i) const { + return _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])) & (1 << i); +} + +template <> +inline bool VecMask::all_masked() const { + int mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])); + return mask == 0xff; +} + +template +struct VecMaskCheck { + static inline bool all_zero(const VectorizedN& vec_mask) { + bool all_zero = true; + for (int i = 0; i < N; ++i) { + all_zero = all_zero && (_mm256_testz_si256(vec_mask[i], vec_mask[i]) > 0); + if (!all_zero) { + return all_zero; + } + } + return all_zero; + } + + static inline bool is_masked(const VectorizedN& vec_mask, int i) { + for (int j = 0; j < N; ++j) { + if (i < (j + 1) * 4) { + return _mm256_movemask_pd(_mm256_castsi256_pd(vec_mask[j])) & + (1 << (i - j * 4)); + } + } + return false; + } + + static inline bool all_masked(const VectorizedN& vec_mask) { + bool all_masked = true; + for (int i = 0; i < N; ++i) { + all_masked = all_masked && + (_mm256_movemask_pd(_mm256_castsi256_pd(vec_mask[i])) == 0x0f); + if (!all_masked) { + return all_masked; + } + } + return all_masked; + } +}; + +#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \ + T, N, return_type, method, args_def, args) \ + template <> \ + inline return_type VecMask::method args_def const { \ + return cast().method args; \ + } + +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ()) 
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ()) + +#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h new file mode 100644 index 0000000000000000000000000000000000000000..7e77d78528b5d6a069347064e8dc21cbf6151682 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h @@ -0,0 +1,1429 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// This file defines Vectorized<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vectorized, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vectorized -> 4x Vectorized +// Vectorized -> 4x Vectorized +// Vectorized -> 1x Vectorized +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. 
The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) + +#ifdef _MSC_VER +__declspec(align(64)) struct Vectorizedqi { + protected: + __m256i vals; +#else +struct Vectorizedqi { + protected: + __m256i vals __attribute__((aligned(64))); +#endif + + public: + Vectorizedqi() { + vals = _mm256_setzero_si256(); + } + Vectorizedqi(__m256i v) : vals(v) {} + operator __m256i() const { + return vals; + } +}; + +template +__m256i pack_saturate_and_clamp( + __m256i first, + __m256i second, + T min_val, + T max_val); + +template <> +inline __m256i pack_saturate_and_clamp( + __m256i /*first*/, + __m256i /*second*/, + int32_t /*min_val*/, + int32_t /*max_val*/) { + // This function is for linkage only, will not be used + TORCH_CHECK(false, "pack_saturate_and_clamp is not supported"); +} + +template <> +inline __m256i pack_saturate_and_clamp( + __m256i first, + __m256i second, + int8_t min_val, + int8_t max_val) { + __m256i packed_and_sat = _mm256_packs_epi16(first, second); + return _mm256_max_epi8( + _mm256_set1_epi8(min_val), + _mm256_min_epi8(packed_and_sat, _mm256_set1_epi8(max_val))); +} + +template <> +inline __m256i pack_saturate_and_clamp( + __m256i first, + __m256i second, + uint8_t min_val, + uint8_t max_val) { + __m256i packed_and_sat = _mm256_packus_epi16(first, second); + return _mm256_max_epu8( + _mm256_set1_epi8(min_val), + _mm256_min_epu8(packed_and_sat, _mm256_set1_epi8(max_val))); +} + +template +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(at::vec::Vectorized src) { + // Note: this function only convert inputs number of elements equal 
to + // at::vec::Vectorized.size() Only handle first 8*8 bits + __m128i input_128 = _mm256_castsi256_si128(src); + // Convert from 8*uint8/int8 to 8*int32 + __m256i input_256_int32; + if constexpr (std::is_same_v) + input_256_int32 = _mm256_cvtepu8_epi32(input_128); + else + input_256_int32 = _mm256_cvtepi8_epi32(input_128); + // Convert from 8*int32 to 8*float + return _mm256_cvtepi32_ps(input_256_int32); +} + +template +at::vec::Vectorized inline convert_float_to_int8( + at::vec::Vectorized src); + +template <> +at::vec::Vectorized inline convert_float_to_int8( + at::vec::Vectorized src) { + // Convert from float32 to int32 with truncation + __m256i x_values_int32 = _mm256_cvttps_epi32(src); + + // Convert from int32 to int16 using signed saturation + __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32); + + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + // Convert from int16 to int8 using unsigned saturation + __m256i xyzw_clamped_v = pack_saturate_and_clamp( + xy_packed_v, xy_packed_v, min_val, max_val); + __m256i permute_mask_v = + _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); + return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); +} + +template <> +at::vec::Vectorized inline convert_float_to_int8( + at::vec::Vectorized src) { + // The type of *_val should be int32_t to ensure correct clamping behavior. 
+ constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + __m256 float32_min_val = _mm256_set1_ps(float(min_val)); + __m256 float32_max_val = _mm256_set1_ps(float(max_val)); + __m256 float32_src = _mm256_max_ps(src, float32_min_val); + float32_src = _mm256_min_ps(float32_src, float32_max_val); + __m256i truncated_src = _mm256_cvttps_epi32(float32_src); + + __m128i r1 = _mm256_castsi256_si128(truncated_src); + __m128i mask = _mm_setr_epi8( + 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i r1_shuffled = _mm_shuffle_epi8(r1, mask); + __m128i r2 = _mm256_extractf128_si256(truncated_src, 1); + __m128i r2_shuffled = _mm_shuffle_epi8(r2, mask); + __m128i result = _mm_unpacklo_epi32(r1_shuffled, r2_shuffled); + + return _mm256_castsi128_si256(result); +} + +template +__FORCE_INLINE void QuantizeAvx2( + const float* src, + T* dst, + int len, + float inverse_scale, + int64_t zero_point) { + constexpr int VLEN = 8; + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + const __m256i min_v = _mm256_set1_epi32(min_val); + const __m256i max_v = _mm256_set1_epi32(max_val); + // This is the largest int32 value < int32_max exactly representable in float + constexpr int32_t int32_float_max_val = + std::numeric_limits::max() - 127; + int i = 0; + __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); + // clang-format off + static const __m256i shuffle_mask_v = _mm256_set_epi8( + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x08, 0x04, 0x00, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x08, 0x04, 0x00); + // clang-format on + __m256i permute_mask_v = + _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); + __m256i permute_mask_l8_v = + _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); + int len_aligned = len / (VLEN * 4) * (VLEN * 4); + for 
(; i < len_aligned; i += 4 * VLEN) { + // x + __m256 x_vals = _mm256_load_ps(src + i); + __m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v); + // If the floating point value is greater than int32_max, + // _mm256_cvtps_epi32 converts them to -ve. Clip at int32_float_max_val to + // Clip at int32_float_max_val to avoid this. + x_transformed_v = + _mm256_min_ps(x_transformed_v, _mm256_set1_ps(int32_float_max_val)); + // y + __m256 y_vals = _mm256_load_ps(src + i + VLEN); + __m256 y_transformed_v = _mm256_mul_ps(y_vals, inverse_scale_v); + y_transformed_v = + _mm256_min_ps(y_transformed_v, _mm256_set1_ps(int32_float_max_val)); + // z + __m256 z_vals = _mm256_load_ps(src + i + 2 * VLEN); + __m256 z_transformed_v = _mm256_mul_ps(z_vals, inverse_scale_v); + z_transformed_v = + _mm256_min_ps(z_transformed_v, _mm256_set1_ps(int32_float_max_val)); + // w + __m256 w_vals = _mm256_load_ps(src + i + 3 * VLEN); + __m256 w_transformed_v = _mm256_mul_ps(w_vals, inverse_scale_v); + w_transformed_v = + _mm256_min_ps(w_transformed_v, _mm256_set1_ps(int32_float_max_val)); + + __m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v); + __m256i y_rounded_v = _mm256_cvtps_epi32(y_transformed_v); + __m256i z_rounded_v = _mm256_cvtps_epi32(z_transformed_v); + __m256i w_rounded_v = _mm256_cvtps_epi32(w_transformed_v); + + // add zero point + x_rounded_v = _mm256_add_epi32(x_rounded_v, _mm256_set1_epi32(zero_point)); + y_rounded_v = _mm256_add_epi32(y_rounded_v, _mm256_set1_epi32(zero_point)); + z_rounded_v = _mm256_add_epi32(z_rounded_v, _mm256_set1_epi32(zero_point)); + w_rounded_v = _mm256_add_epi32(w_rounded_v, _mm256_set1_epi32(zero_point)); + + __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); + __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); + __m256i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val); + + xyzw_clamped_v = + _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); + 
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), xyzw_clamped_v); + } + + // Additional 8-lane AVX2 version to take advantage when len is smaller + // based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM) + for (; i < len / VLEN * VLEN; i += VLEN) { + __m256 x_vals = _mm256_load_ps(src + i); + __m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v); + x_transformed_v = + _mm256_min_ps(x_transformed_v, _mm256_set1_ps(int32_float_max_val)); + __m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v); + x_rounded_v = _mm256_add_epi32(x_rounded_v, _mm256_set1_epi32(zero_point)); + __m256i x_clipped_v = + _mm256_max_epi32(min_v, _mm256_min_epi32(max_v, x_rounded_v)); + + x_clipped_v = _mm256_shuffle_epi8(x_clipped_v, shuffle_mask_v); + x_clipped_v = _mm256_permutevar8x32_epi32(x_clipped_v, permute_mask_l8_v); + _mm_storel_epi64( + reinterpret_cast<__m128i*>(dst + i), + _mm256_castsi256_si128(x_clipped_v)); + } + + for (; i < len; ++i) { + float transformed = src[i] * inverse_scale; + + // Not exactly the same behavior as the vectorized code. + // The vectorized code above always rounds to even in halfway cases + // (https://software.intel.com/en-us/node/523819), but std::nearbyint + // does the same only when the current rounding mode is FE_TONEAREST. + // However, in practice, this should not be a problem because most cases + // use the default rounding mode FE_TONEAREST. + // Note that we cannot implement the same behavior as the vectorized code + // using std::round because it does rounding away from zero in halfway + // cases. 
+ transformed = zero_point + std::nearbyint(transformed); + float clipped = + std::min(std::max(transformed, float(min_val)), float(max_val)); + dst[i] = clipped; + } +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + using size_type = int; + static constexpr size_type kSize = Vectorized::size(); + static constexpr size_type size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int int_num_vecs() { + return 1; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint32& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi32(uw); + } + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized /*zero_point*/, + Vectorized scale_zp_premul) const { + __m256 float_vals = _mm256_cvtepi32_ps(vals); + return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m256 float_vals = _mm256_cvtepi32_ps(vals); + return {(Vectorized(float_vals) - zero_point) * scale}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float /*inverse_scale*/) { + Vectorized retval; + auto rhs_data = (__m256)rhs[0]; + at::native::quantize_vec( + scale, + zero_point, + (float*)&rhs_data, + (c10::qint32*)&retval.vals, + size()); + return retval; + } + + Vectorized maximum(Vectorized b) const { + return _mm256_max_epi32(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epi32(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epi32( + _mm256_max_epi32(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {_mm256_sub_epi32(vals, b)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + + __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v); + __m256i rounded = _mm256_cvtps_epi32(scaled); + return _mm256_add_epi32(rounded, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) 
{ + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm256_mullo_epi32(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm256_add_epi32(a, b); +} + +/* + * Convert values from int32 back to int8/uint8 + */ +template +__m256i RequantizeAvx2( + const std::array, 4>& inp, + __m256 multiplier, + __m256i zp) { + static_assert( + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + __m256i permute_mask_v = + _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); + __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier); + __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[1]), multiplier); + __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[2]), multiplier); + __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[3]), multiplier); + + __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v); + __m256i y_rounded_v = _mm256_cvtps_epi32(y_scaled_v); + __m256i z_rounded_v = _mm256_cvtps_epi32(z_scaled_v); + __m256i w_rounded_v = _mm256_cvtps_epi32(w_scaled_v); + + /* Add zero point */ + __m256i x_v = _mm256_add_epi32(x_rounded_v, zp); + __m256i y_v = _mm256_add_epi32(y_rounded_v, zp); + __m256i z_v = _mm256_add_epi32(z_rounded_v, zp); + __m256i w_v = _mm256_add_epi32(w_rounded_v, zp); + + /* Pack to int16_t and saturate */ + __m256i xy_packed_v = _mm256_packs_epi32(x_v, y_v); + __m256i zw_packed_v = _mm256_packs_epi32(z_v, w_v); + + __m256i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val); + + /* + * xyzw_clamped_v has results in the following layout so we need to + * permute: x0-3 
y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 + */ + xyzw_clamped_v = _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); + return xyzw_clamped_v; +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int kSize = VECTOR_WIDTH; + static constexpr int size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int kIntNumVecs = kSize / Vectorized::size(); + static constexpr int int_num_vecs() { + return kIntNumVecs; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; + using value_type = c10::qint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + + Vectorized() {} + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint8& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi8(uw); + } + + // This is needed because the compiler emits awful code for the default + // constructor for moving the enum + // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy) + C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy") + C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy") +#endif + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + C10_CLANG_DIAGNOSTIC_POP() + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. 
We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + private: + __m256i cvtepi8_epi32(__m128i epi8_vals) const { + return _mm256_cvtepi8_epi32(epi8_vals); + } + + public: + float_vec_return_type dequantize( + Vectorized scale, + Vectorized /*zero_point*/, + Vectorized scale_neg_zp_premul) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256 float_val0 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m256 float_val1 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val1)); + __m256 float_val2 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val2)); + __m256 float_val3 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val3)); + + auto val0 = + vec::fmadd(scale, Vectorized(float_val0), scale_neg_zp_premul); + auto val1 = + vec::fmadd(scale, Vectorized(float_val1), scale_neg_zp_premul); + auto val2 = + vec::fmadd(scale, Vectorized(float_val2), scale_neg_zp_premul); + auto val3 = + vec::fmadd(scale, Vectorized(float_val3), scale_neg_zp_premul); + return {val0, val1, val2, val3}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256 float_val0 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m256 float_val1 = 
_mm256_cvtepi32_ps(cvtepi8_epi32(int_val1)); + __m256 float_val2 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val2)); + __m256 float_val3 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val3)); + + auto val0 = (Vectorized(float_val0) - zero_point) * scale; + auto val1 = (Vectorized(float_val1) - zero_point) * scale; + auto val2 = (Vectorized(float_val2) - zero_point) * scale; + auto val3 = (Vectorized(float_val3) - zero_point) * scale; + return {val0, val1, val2, val3}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float /*scale*/, + int32_t zero_point, + float inverse_scale) { + auto* rhs_data = (float*)rhs.data(); + int8_t quantized_values[32]; + QuantizeAvx2( + rhs_data, quantized_values, 32, inverse_scale, zero_point); + return Vectorized::loadu(quantized_values); + } + + Vectorized maximum(Vectorized b) const { + return _mm256_max_epi8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epi8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epi8(_mm256_max_epi8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256i int32_val0 = cvtepi8_epi32(int_val0); + __m256i int32_val1 = cvtepi8_epi32(int_val1); + __m256i int32_val2 = cvtepi8_epi32(int_val2); + __m256i int32_val3 = cvtepi8_epi32(int_val3); + + __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0)); + __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1)); + __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2)); + __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 
3)); + + __m256i int32_b0 = cvtepi8_epi32(int_b0); + __m256i int32_b1 = cvtepi8_epi32(int_b1); + __m256i int32_b2 = cvtepi8_epi32(int_b2); + __m256i int32_b3 = cvtepi8_epi32(int_b3); + + __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0); + __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1); + __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2); + __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3); + + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + return RequantizeAvx2(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int kSize = VECTOR_WIDTH; + static constexpr int size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int kIntNumVecs = kSize / Vectorized::size(); + static constexpr int int_num_vecs() { + return kIntNumVecs; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; + using value_type = c10::quint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::quint8& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi8(uw); + } + + // 
NOLINTNEXTLINE(clang-diagnostic-deprecated-copy) + C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy") + C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy") +#endif + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + C10_CLANG_DIAGNOSTIC_POP() + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + private: + __m256i cvtepu8_epi32(__m128i epu8_vals) const { + return _mm256_cvtepu8_epi32(epu8_vals); + } + + public: + float_vec_return_type dequantize( + Vectorized scale, + Vectorized /*zero_point*/, + Vectorized scale_zp_premul) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1)); + __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2)); + __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3)); + + auto val0 = + 
vec::fmadd(scale, Vectorized(float_val0), scale_zp_premul); + auto val1 = + vec::fmadd(scale, Vectorized(float_val1), scale_zp_premul); + auto val2 = + vec::fmadd(scale, Vectorized(float_val2), scale_zp_premul); + auto val3 = + vec::fmadd(scale, Vectorized(float_val3), scale_zp_premul); + return {val0, val1, val2, val3}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1)); + __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2)); + __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3)); + + auto val0 = (Vectorized(float_val0) - zero_point) * scale; + auto val1 = (Vectorized(float_val1) - zero_point) * scale; + auto val2 = (Vectorized(float_val2) - zero_point) * scale; + auto val3 = (Vectorized(float_val3) - zero_point) * scale; + return {val0, val1, val2, val3}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float /*scale*/, + int32_t zero_point, + float inverse_scale) { + auto* rhs_data = (float*)rhs.data(); + uint8_t quantized_values[32]; + QuantizeAvx2( + rhs_data, quantized_values, 32, inverse_scale, zero_point); + return Vectorized::loadu(quantized_values); + } + + Vectorized maximum(Vectorized b) const { + return _mm256_max_epu8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epu8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epu8(_mm256_max_epu8(vals, zero_point.vals), q_six.vals); + } + + 
int_vec_return_type widening_subtract(Vectorized b) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256i int32_val0 = cvtepu8_epi32(int_val0); + __m256i int32_val1 = cvtepu8_epi32(int_val1); + __m256i int32_val2 = cvtepu8_epi32(int_val2); + __m256i int32_val3 = cvtepu8_epi32(int_val3); + + __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0)); + __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1)); + __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2)); + __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3)); + + __m256i int32_b0 = cvtepu8_epi32(int_b0); + __m256i int32_b1 = cvtepu8_epi32(int_b1); + __m256i int32_b2 = cvtepu8_epi32(int_b2); + __m256i int32_b3 = cvtepu8_epi32(int_b3); + + __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0); + __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1); + __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2); + __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3); + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + return RequantizeAvx2(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +#elif !defined(CPU_CAPABILITY_SVE256) + +// NOTE: These are low-performance implementations that we fall back on +// if we are not building with AVX2. 
This may not be an issue, because +// currently for quantization we assume the user has at least AVX512 +// installed, so these can simply act as a reference implementation. +// +// If in the future we relax this requirement (AVX2+), we should probably +// revisit these implementations + +template < + typename T, + typename float_vec_return_type_, + typename int_vec_return_type_, + int size_> +struct VectorizedQuantizedConverter { + static constexpr int size() { + return size_; + } + + static constexpr int float_num_vecs() { + return size_ / Vectorized::size(); + } + + static constexpr int int_num_vecs() { + return size_ / Vectorized::size(); + } + + using float_vec_return_type = float_vec_return_type_; + using int_vec_return_type = int_vec_return_type_; + + using value_type = typename T::underlying; + std::array vals; + + VectorizedQuantizedConverter(T val) { + for (const auto i : c10::irange(size())) { + vals[i] = val.val_; + } + } + + VectorizedQuantizedConverter(const void* ptr) { + memcpy(vals.data(), ptr, sizeof(value_type) * size()); + } + + void store(void* ptr, int count = size()) const { + memcpy(ptr, vals.data(), count * sizeof(value_type)); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized /*scale_zp_premul*/) const { + float_vec_return_type rv; + for (const auto i : c10::irange(float_num_vecs())) { + float tmp_vals[Vectorized::size()]; + for (const auto j : c10::irange(Vectorized::size())) { + tmp_vals[j] = at::native::dequantize_val( + scale[j], + zero_point[j], + T(vals[Vectorized::size() * i + j])); + } + rv[i] = Vectorized(tmp_vals); + } + return rv; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + Vectorized scale_zp_premul; + return dequantize(scale, zero_point, scale_zp_premul); + } + + protected: + VectorizedQuantizedConverter() {} +}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + 
std::array, 1>, + Vectorized::size()> { + using VectorizedQuantizedConverter::VectorizedQuantizedConverter; + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return Vectorized(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float /*inverse_scale*/) { + std::array qvals; + std::array::size()> float_vals; + + for (const auto i : c10::irange(float_num_vecs())) { + rhs[i].store(&float_vals[i * Vectorized::size()]); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::qint32*)qvals.data(), + float_vals.size()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type 
widening_subtract(Vectorized b) const { + int_vec_return_type retval; + for (const auto i : c10::irange(size())) { + retval[0].vals[i] = vals[i] - b.vals[i]; + } + return retval; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = + std::nearbyint(static_cast(inp[0].vals[i]) * multiplier) + + zero_point; + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + Vectorized retval; + for (const auto i : c10::irange(std::decay_t::size())) { + retval.vals[i] = a.vals[i] * b.vals[i]; + } + return retval; +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + Vectorized retval; + for (const auto i : c10::irange(std::decay_t::size())) { + retval.vals[i] = a.vals[i] + b.vals[i]; + } + return retval; +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 4 * Vectorized::size()> { + using VectorizedQuantizedConverter::VectorizedQuantizedConverter; + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return Vectorized(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float /*inverse_scale*/) { + std::array qvals; + std::array::size()> float_vals; + + for (const auto i : c10::irange(float_num_vecs())) { + rhs[i].store(&float_vals[i * Vectorized::size()]); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::qint8*)qvals.data(), + float_vals.size()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + constexpr int elem_per_int_vec = size() / int_num_vecs(); + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + retval[i].vals[j] = + static_cast(vals[i * elem_per_int_vec + j]) - + static_cast(b.vals[i * elem_per_int_vec + j]); + } + } + return retval; + } + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + constexpr int elem_per_int_vec = size() / int_num_vecs(); + constexpr auto min_val = 
std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + Vectorized retval; + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + int32_t rounded = + std::nearbyint(static_cast(inp[i].vals[j]) * multiplier) + + zero_point; + retval.vals[i * elem_per_int_vec + j] = + std::min(std::max(rounded, min_val), max_val); + } + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 4 * Vectorized::size()> { + using VectorizedQuantizedConverter::VectorizedQuantizedConverter; + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return Vectorized(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float /*inverse_scale*/) { + std::array qvals; + std::array::size()> float_vals; + + for (const auto i : c10::irange(float_num_vecs())) { + rhs[i].store(&float_vals[i * Vectorized::size()]); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::quint8*)qvals.data(), + float_vals.size()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + constexpr int elem_per_int_vec = size() / int_num_vecs(); + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + retval[i].vals[j] = + static_cast(vals[i * elem_per_int_vec + j]) - + static_cast(b.vals[i * elem_per_int_vec + j]); + } + } + return retval; + } + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + constexpr int elem_per_int_vec = size() / int_num_vecs(); + constexpr auto min_val = 
std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + Vectorized retval; + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + int32_t rounded = + std::nearbyint(static_cast(inp[i].vals[j]) * multiplier) + + zero_point; + retval.vals[i * elem_per_int_vec + j] = + std::min(std::max(rounded, min_val), max_val); + } + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +#endif // if defined(CPU_CAPABILITY_AVX2) + +#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +std::pair, Vectorized> inline convert_int8_to_float( + at::vec::Vectorized src) { + auto s8x8 = vget_low_s8(src); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return std::make_pair( + Vectorized(vcvtq_f32_s32(s32x4_lo)), + Vectorized(vcvtq_f32_s32(s32x4_hi))); +} + +std::pair, Vectorized> inline convert_int8_to_float( + at::vec::Vectorized src) { + auto u8x8 = vget_low_u8(src); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return std::make_pair( + Vectorized(vcvtq_f32_u32(u32x4_lo)), + Vectorized(vcvtq_f32_u32(u32x4_hi))); +} + +Vectorized inline convert_int8_half_register_to_float( + at::vec::Vectorized src) { + auto s8x8 = vget_low_s8(src); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return Vectorized(vcvtq_f32_s32(s32x4_lo)); +} + +Vectorized inline convert_int8_half_register_to_float( + at::vec::Vectorized src) { + auto u8x8 = vget_low_u8(src); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return Vectorized(vcvtq_f32_u32(u32x4_lo)); +} + +#endif +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when 
either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..a2cba8d412f2b1f8c5ba60d77d9a42c1ed0639b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { + constexpr int64_t K = Vectorized::size(); + __at_align__ float arr[K]; + __at_align__ BFloat16 arr2[K]; + a.store(arr2); + convert(arr2, arr, K); + return std::make_tuple( + Vectorized::loadu(arr), + Vectorized::loadu(arr + Vectorized::size())); +} + +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + constexpr int64_t K = Vectorized::size(); + __at_align__ float arr[K]; + __at_align__ BFloat16 arr2[K]; + a.store(arr); + b.store(arr + Vectorized::size()); + convert(arr, arr2, K); + return Vectorized::loadu(arr2); +} + +inline void load_fp32_from_bf16( + const c10::BFloat16* data, + Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_bf16( + const c10::BFloat16* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_bf16(data, out1); + data += Vectorized::size(); + load_fp32_from_bf16(data, out2); 
+} + +inline void load_fp32_from_fp16(const c10::Half* data, Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_fp16( + const c10::Half* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_fp16(data, out1); + data += Vectorized::size(); + load_fp32_from_fp16(data, out2); +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..849f75c2854a361c936288792495f3b6ae0af801 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h @@ -0,0 +1,255 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +// Note: header order is important here +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace at { +namespace vec { + +inline namespace CPU_CAPABILITY { + +DEFINE_CLAMP_FUNCS(c10::quint8) +DEFINE_CLAMP_FUNCS(c10::qint8) +DEFINE_CLAMP_FUNCS(c10::qint32) +DEFINE_CLAMP_FUNCS(int16_t) +DEFINE_CLAMP_FUNCS(int32_t) +DEFINE_CLAMP_FUNCS(int64_t) +DEFINE_CLAMP_FUNCS(float) +DEFINE_CLAMP_FUNCS(double) + +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + vec_madd(a.vec0(), 
b.vec0(), c.vec0()), + vec_madd(a.vec1(), b.vec1(), c.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} + +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t) + +template <> +Vectorized C10_ALWAYS_INLINE +convert_to_int_of_same_size(const Vectorized& src) { + return Vectorized{vec_signed(src.vec0()), vec_signed(src.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +convert_to_int_of_same_size(const Vectorized& src) { + return Vectorized{vec_signed(src.vec0()), vec_signed(src.vec1())}; +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + // int32_t and float have same size + int64_t i; + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + const int32_t* src_a = src + i; + float* dst_a = dst + i; + vint32 input_vec0 = + vec_vsx_ld(offset0, reinterpret_cast(src_a)); + vint32 input_vec1 = + vec_vsx_ld(offset16, reinterpret_cast(src_a)); + vfloat32 c0 = vec_float(input_vec0); + vfloat32 c1 = vec_float(input_vec1); + vec_vsx_st(c0, offset0, dst_a); + vec_vsx_st(c1, offset16, dst_a); + } + + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int64_t* src, double* dst, int64_t n) { + int64_t i; + 
for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + const int64_t* src_a = src + i; + double* dst_a = dst + i; + vint64 input_vec0 = + vec_vsx_ld(offset0, reinterpret_cast(src_a)); + vint64 input_vec1 = + vec_vsx_ld(offset16, reinterpret_cast(src_a)); + vfloat64 c0 = vec_double(input_vec0); + vfloat64 c1 = vec_double(input_vec1); + vec_vsx_st(c0, offset0, reinterpret_cast(dst_a)); + vec_vsx_st(c1, offset16, reinterpret_cast(dst_a)); + } + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} +// Generic implementation to fix compiler error +// TO-DO : Add optimized version for ppc64 +inline std::tuple, Vectorized> convert_half_float( + const Vectorized& a) { + constexpr int64_t K = Vectorized::size(); + __at_align__ float arr[K]; + __at_align__ Half arr2[K]; + a.store(arr2); + convert(arr2, arr, K); + return std::make_tuple( + Vectorized::loadu(arr), + Vectorized::loadu(arr + Vectorized::size())); +} + +inline Vectorized convert_float_half( + const Vectorized& a, + const Vectorized& b) { + constexpr int64_t K = Vectorized::size(); + __at_align__ float arr[K]; + __at_align__ Half arr2[K]; + a.store(arr); + b.store(arr + Vectorized::size()); + convert(arr, arr2, K); + return Vectorized::loadu(arr2); +}; + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3} + // b = {b0, b1, b2, b3} + + vfloat64 ab00 = vec_xxpermdi(a.vec0(), b.vec0(), 0); + vfloat64 ab11 = vec_xxpermdi(a.vec0(), b.vec0(), 3); + vfloat64 ab2_00 = vec_xxpermdi(a.vec1(), b.vec1(), 0); + vfloat64 ab2_11 = vec_xxpermdi(a.vec1(), b.vec1(), 3); + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + return std::make_pair( + Vectorized{ab00, ab11}, Vectorized{ab2_00, ab2_11}); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + vfloat64 aa01 = vec_xxpermdi(a.vec0(), 
a.vec1(), 0); + vfloat64 aa23 = vec_xxpermdi(b.vec0(), b.vec1(), 0); + + vfloat64 bb_01 = vec_xxpermdi(a.vec0(), a.vec1(), 3); + vfloat64 bb_23 = vec_xxpermdi(b.vec0(), b.vec1(), 3); + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + return std::make_pair( + Vectorized{aa01, aa23}, Vectorized{bb_01, bb_23}); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3,, a4, a5, a6, a7} + // b = {b0, b1, b2, b3,, b4, b5, b6, b7} + + vfloat32 ab0011 = vec_mergeh(a.vec0(), b.vec0()); + vfloat32 ab2233 = vec_mergel(a.vec0(), b.vec0()); + + vfloat32 ab2_0011 = vec_mergeh(a.vec1(), b.vec1()); + vfloat32 ab2_2233 = vec_mergel(a.vec1(), b.vec1()); + // group cols crossing lanes: + // return {a0, b0, a1, b1,, a2, b2, a3, b3} + // {a4, b4, a5, b5,, a6, b6, a7, b7} + + return std::make_pair( + Vectorized{ab0011, ab2233}, Vectorized{ab2_0011, ab2_2233}); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1,, a2, b2, a3, b3} + // b = {a4, b4, a5, b5,, a6, b6, a7, b7} + + // {a0,a2,b0,b2} {a1,a3,b1,b3} + vfloat32 a0a2b0b2 = vec_mergeh(a.vec0(), a.vec1()); + vfloat32 a1a3b1b3 = vec_mergel(a.vec0(), a.vec1()); + + vfloat32 aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3); + vfloat32 bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3); + + vfloat32 a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1()); + vfloat32 a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1()); + + vfloat32 aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2); + vfloat32 bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2); + + // it could be done with vec_perm ,too + // swap lanes: + // return {a0, a1, a2, a3,, a4, a5, a6, a7} + // {b0, b1, b2, b3,, b4, b5, b6, b7} + + return std::make_pair( + Vectorized{aa0123, aa0123_2}, Vectorized{bb0123, bb0123_2}); +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should 
not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..6cc03ca753ae4817b50565c03c732ba3b763a973 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -0,0 +1,684 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { +using ComplexDbl = c10::complex; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + union { + struct { + vfloat64 _vec0; + vfloat64 _vec1; + }; + struct { + vbool64 _vecb0; + vbool64 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = ComplexDbl; + using vec_internal_type = vfloat64; + using vec_internal_mask_type = vbool64; + using size_type = int; + static constexpr size_type size() { + return 2; + } + Vectorized() {} + C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} + + Vectorized(ComplexDbl val) { + double real_value = val.real(); + double imag_value = val.imag(); + _vec0 = vfloat64{real_value, imag_value}; + _vec1 = vfloat64{real_value, imag_value}; + } + Vectorized(ComplexDbl val1, ComplexDbl 
val2) { + _vec0 = vfloat64{val1.real(), val1.imag()}; + _vec1 = vfloat64{val2.real(), val2.imag()}; + } + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { + return a; + } + + template + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { + return b; + } + + template + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static Vectorized C10_ALWAYS_INLINE + el_blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + const vbool64 mask_2nd = VsxDblMask2(mask); + return { + (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // convert std::complex index mask to V index mask: xy -> xxyy + auto mask_complex = Vectorized( + vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0)); + return { + vec_sel(a._vec0, b._vec0, mask_complex._vecb0), + vec_sel(a._vec1, b._vec1, mask_complex._vecb1)}; + } + + static Vectorized C10_ALWAYS_INLINE elwise_blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + template + static Vectorized arange( + ComplexDbl base = 0., + step_t step = static_cast(1)) { + return Vectorized(base, base + step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch 
(count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + } + return b; + } + + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return { + vec_vsx_ld(offset0, reinterpret_cast(tmp_values)), + vec_vsx_ld(offset16, reinterpret_cast(tmp_values))}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, reinterpret_cast(tmp_values)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(tmp_values)); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + const ComplexDbl& operator[](int idx) const = delete; + ComplexDbl& operator[](int idx) = delete; + + Vectorized map(ComplexDbl (*const f)(ComplexDbl)) const { + __at_align__ ComplexDbl tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + Vectorized map(ComplexDbl (*const f)(const ComplexDbl&)) const { + __at_align__ ComplexDbl tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + Vectorized el_swapped() const { + vfloat64 v0 = vec_xxpermdi(_vec0, _vec0, 2); + vfloat64 v1 = vec_xxpermdi(_vec1, _vec1, 2); + return {v0, v1}; + } + + Vectorized el_madd( + const Vectorized& multiplier, + const Vectorized& val) const { + return { + vec_madd(_vec0, multiplier._vec0, val._vec0), + vec_madd(_vec1, multiplier._vec1, val._vec1)}; + } + + Vectorized el_mergeo() const { + vfloat64 v0 = 
vec_splat(_vec0, 1); + vfloat64 v1 = vec_splat(_vec1, 1); + return {v0, v1}; + } + + Vectorized el_mergee() const { + vfloat64 v0 = vec_splat(_vec0, 0); + vfloat64 v1 = vec_splat(_vec1, 0); + return {v0, v1}; + } + + static Vectorized el_mergee( + const Vectorized& first, + const Vectorized& second) { + return { + vec_mergeh(first._vec0, second._vec0), + vec_mergeh(first._vec1, second._vec1)}; + } + + static Vectorized el_mergeo( + const Vectorized& first, + const Vectorized& second) { + return { + vec_mergel(first._vec0, second._vec0), + vec_mergel(first._vec1, second._vec1)}; + } + + Vectorized abs_2_() const { + auto a = (*this).elwise_mult(*this); + auto permuted = a.el_swapped(); + a = a + permuted; + return a; + } + + Vectorized abs_() const { + auto vi = el_mergeo(); + auto vr = el_mergee(); + return { + Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0), + Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)}; + } + + Vectorized abs() const { + return abs_() & vd_real_mask; + } + + Vectorized angle_() const { + // angle = atan2(b/a) + // auto b_a = _mm256_permute_pd(values, 0x05); // b a + // return Sleef_atan2d4_u10(values, b_a); // 90-angle angle + Vectorized ret; + ret._vec0[0] = std::atan2(_vec0[1], _vec0[0]); + ret._vec1[0] = std::atan2(_vec1[1], _vec1[0]); + return ret; + } + + Vectorized angle() const { + return angle_() & vd_real_mask; + } + + Vectorized real_() const { + return *this & vd_real_mask; + } + Vectorized real() const { + return *this & vd_real_mask; + } + Vectorized imag_() const { + return *this & vd_imag_mask; + } + Vectorized imag() const { + return imag_().el_swapped(); + } + + Vectorized conj_() const { + return *this ^ vd_isign_mask; + } + Vectorized conj() const { + return *this ^ vd_isign_mask; + } + + Vectorized log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + + Vectorized log2() const { + // log2eB_inv + auto ret = log(); + return ret.elwise_mult(vd_log2e_inv); + } + Vectorized log10() const { + auto ret = log(); + return ret.elwise_mult(vd_log10e_inv); + } + + Vectorized log1p() const { + return map(std::log1p); + } + + Vectorized asin() const { + // asin(x) + // = -i*ln(iz + sqrt(1 -z^2)) + // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + auto conj = conj_(); + auto b_a = conj.el_swapped(); + auto ab = conj.elwise_mult(b_a); + auto im = ab + ab; + auto val_2 = (*this).elwise_mult(*this); + auto val_2_swapped = val_2.el_swapped(); + auto re = horizontal_sub(val_2, val_2_swapped); + re = Vectorized(vd_one) - re; + auto root = el_blend<0x0A>(re, im).sqrt(); + auto ln = (b_a + root).log(); + return ln.el_swapped().conj(); + } + + Vectorized acos() const { + // acos(x) = pi/2 - asin(x) + return Vectorized(vd_pi_2) - asin(); + } + + Vectorized atan() const { + // atan(x) = i/2 * ln((i + z)/(i - z)) + auto ione = Vectorized(vd_imag_one); + auto sum = ione + *this; + auto sub = ione - *this; + auto ln = (sum / sub).log(); // ln((i + z)/(i - z)) + return ln * vd_imag_half; // i/2*ln() + } + Vectorized atanh() const { + return map(std::atanh); + } + + Vectorized sin() const { + return map(std::sin); + } + Vectorized sinh() const { + return map(std::sinh); + } + Vectorized cos() const { + return map(std::cos); + } + Vectorized cosh() const { + return map(std::cosh); + } + + Vectorized tan() const { + return map(std::tan); + } + Vectorized tanh() const { + return map(std::tanh); + } + Vectorized ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vectorized floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vectorized neg() const { + auto z = Vectorized(vd_zero); + return z - *this; + } + Vectorized round() const { + return {vec_rint(_vec0), vec_rint(_vec1)}; + } + + Vectorized trunc() const { + return 
{vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vectorized elwise_sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + + Vectorized sqrt() const { + return map(std::sqrt); + } + + Vectorized reciprocal() const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() = c/abs_2() + // im = (bc - ad)/abs_2() = d/abs_2() + auto c_d = *this ^ vd_isign_mask; // c -d + auto abs = abs_2_(); + return c_d.elwise_div(abs); + } + + Vectorized rsqrt() const { + return sqrt().reciprocal(); + } + + static Vectorized horizontal_add( + Vectorized& first, + Vectorized& second) { + // Operates on individual floats, see _mm_hadd_ps + // {f0+f1, s0+s1, f2+f3, s2+s3, ...} + // i.e. it sums the re and im of each value and interleaves first and + // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} + return el_mergee(first, second) + el_mergeo(first, second); + } + + static Vectorized horizontal_sub( + Vectorized& first, + Vectorized& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.el_swapped(); // 2perm + auto second_perm = second.el_swapped(); // 2perm + // summ + auto first_ret = first - first_perm; // 2sub + auto second_ret = second - second_perm; // 2 sub + // now lets choose evens + return el_mergee(first_ret, second_ret); // 2 mergee's + } + + Vectorized inline operator*( + const Vectorized& b) const { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i +#if 1 + // this is more vsx friendly than simulating horizontal from x86 + auto vi = b.el_mergeo(); + auto vr = b.el_mergee(); + vi = vi ^ vd_rsign_mask; + auto ret = elwise_mult(vr); + auto vx_swapped = el_swapped(); + ret = vx_swapped.elwise_mult(vi) + ret; +#else + auto ac_bd = elwise_mult(b); + auto d_c = b.el_swapped(); + d_c = d_c ^ vd_isign_mask; + auto ad_bc = elwise_mult(d_c); + auto ret = horizontal_sub(ac_bd, ad_bc); +#endif + return ret; + } + + Vectorized 
inline operator/( + const Vectorized& b) const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() + // auto fabs_cd = Vectorized{ + // vec_andc(b._vec0, vd_sign_mask), + // vec_andc(b._vec1, vd_sign_mask)}; // |c| |d| + // auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + // auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + // auto a2 = elwise_div(scale); // a/sc b/sc + // auto b2 = b.elwise_div(scale); // c/sc d/sc + // auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + // auto dc2 = b2.el_swapped(); // d/sc c/sc + // dc2 = dc2 ^ vd_rsign_mask; // -d/sc c/sc + // auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + // auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + // auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 + // (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; + + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + this->store(tmp1); + b.store(tmp2); + + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return loadu(out); + } + + Vectorized exp() const { + return map(std::exp); + } + Vectorized exp2() const { + return map(exp2_impl); + } + Vectorized expm1() const { + return map(std::expm1); + } + + Vectorized pow(const Vectorized& exp) const { + __at_align__ ComplexDbl x_tmp[size()]; + __at_align__ ComplexDbl y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (const auto i : c10::irange(size())) { + x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); + } + return loadu(x_tmp); + } + + Vectorized sgn() const { + return map(at::native::sgn_impl); + } + + Vectorized operator<(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized operator<=(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized 
operator>(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized operator>=(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized eq(const Vectorized& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & vd_one; + } + Vectorized ne(const Vectorized& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & vd_one; + } + + DEFINE_MEMBER_OP(operator==, ComplexDbl, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, ComplexDbl, vec_cmpne) + + DEFINE_MEMBER_OP(operator+, ComplexDbl, vec_add) + DEFINE_MEMBER_OP(operator-, ComplexDbl, vec_sub) + DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and) + DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or) + DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor) + // elementwise helpers + DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul) + DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div) + DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt) + DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge) + DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt) + DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple) + DEFINE_MEMBER_OP(elwise_max, ComplexDbl, vec_max) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); + // auto max = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_lt(abs_b); + auto max = Vectorized::elwise_blendv(a, b, mask); + + return max; + // Exploit the fact that all-ones is a NaN. 
+ // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(max, isnan); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); + // auto min = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_gt(abs_b); + auto min = Vectorized::elwise_blendv(a, b, mask); + return min; + // Exploit the fact that all-ones is a NaN. + // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(min, isnan); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + // (a + ib) * (c + id) = (ac - bd) + i(ad + bc) + // Split into real and imaginary parts + auto a_real = a.el_mergee(); // real part of a + auto a_imag = a.el_mergeo(); // imag part of a + auto b_real = b.el_mergee(); // real part of b + auto b_imag = b.el_mergeo(); // imag part of b + + // Compute components + auto ac = a_real.elwise_mult(b_real); // real*real + auto 
bd = a_imag.elwise_mult(b_imag); // imag*imag + + // Real part: ac - bd + auto real = ac - bd; + + auto ad = a_real.elwise_mult(b_imag); // real*imag + auto bc = a_imag.elwise_mult(b_real); // imag*real + + // Imag = ad + bc + auto imag = ad + bc; + + // Merge real and imaginary parts into vectors + __vector double v0 = vec_mergeh(real.vec0(), imag.vec0()); // [r0, i0] + __vector double v1 = vec_mergeh(real.vec1(), imag.vec1()); // [r1, i1] + + // Create the final result + auto result = Vectorized{v0, v1}; + return result; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() + // Take absolute values of real and imaginary parts of b + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return Vectorized::loadu(out); +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..ebeab3693c288277f434948d6e9a805e5b188cf0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -0,0 +1,776 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#pragma once +#include +#include +#include +#include +#include + +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { +using ComplexFlt = c10::complex; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + union { + struct { + vfloat32 _vec0; + vfloat32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = ComplexFlt; + using vec_internal_type = vfloat32; + using vec_internal_mask_type = vbool32; + using size_type = int; + + static constexpr size_type size() { + return 4; + } + Vectorized() {} + + C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} + + Vectorized(ComplexFlt val) { + float real_value = val.real(); + float imag_value = val.imag(); + _vec0 = vfloat32{real_value, imag_value, real_value, imag_value}; + _vec1 = vfloat32{real_value, imag_value, real_value, imag_value}; + } + + Vectorized( + ComplexFlt val1, + ComplexFlt val2, + 
ComplexFlt val3, + ComplexFlt val4) { + _vec0 = vfloat32{val1.real(), val1.imag(), val2.real(), val2.imag()}; + _vec1 = vfloat32{val3.real(), val3.imag(), val4.real(), val4.imag()}; + } + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_1st = VsxComplexMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_1st = VsxComplexMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_2nd = VsxComplexMask2(mask); + // generated masks + return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_2nd = VsxComplexMask2(mask); + // generated masks + return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 
mask_1st = VsxComplexMask1(mask); + const vbool32 mask_2nd = VsxComplexMask2(mask); + return { + (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static Vectorized C10_ALWAYS_INLINE + el_blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_1st = VsxMask1(mask); + const vbool32 mask_2nd = VsxMask2(mask); + return { + (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // convert std::complex index mask to V index mask: xy -> xxyy + auto mask_complex = Vectorized( + vec_mergeh(mask._vec0, mask._vec0), vec_mergeh(mask._vec1, mask._vec1)); + return { + vec_sel( + a._vec0, b._vec0, reinterpret_cast(mask_complex._vec0)), + vec_sel( + a._vec1, b._vec1, reinterpret_cast(mask_complex._vec1)), + }; + } + + static Vectorized elwise_blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return { + vec_sel(a._vec0, b._vec0, reinterpret_cast(mask._vec0)), + vec_sel(a._vec1, b._vec1, reinterpret_cast(mask._vec1)), + }; + } + + template + static Vectorized arange( + ComplexFlt base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + ComplexFlt(2) * step, + base + ComplexFlt(3) * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * 
sizeof(value_type)); + + return { + vec_vsx_ld(offset0, reinterpret_cast(tmp_values)), + vec_vsx_ld(offset16, reinterpret_cast(tmp_values))}; + } + + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, reinterpret_cast(tmp_values)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(tmp_values)); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + const ComplexFlt& operator[](int idx) const = delete; + ComplexFlt& operator[](int idx) = delete; + + Vectorized map(ComplexFlt (*const f)(ComplexFlt)) const { + __at_align__ ComplexFlt tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + Vectorized map(ComplexFlt (*const f)(const ComplexFlt&)) const { + __at_align__ ComplexFlt tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + static Vectorized horizontal_add( + Vectorized& first, + Vectorized& second) { + // Operates on individual floats, see _mm_hadd_ps + // {f0+f1, s0+s1, f2+f3, s2+s3, ...} + // i.e. 
it sums the re and im of each value and interleaves first and + // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} + return el_mergee(first, second) + el_mergeo(first, second); + } + + static Vectorized horizontal_sub_permD8( + Vectorized& first, + Vectorized& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.el_swapped(); // 2perm + auto second_perm = second.el_swapped(); // 2perm + // sum + auto first_ret = first - first_perm; // 2sub + auto second_ret = second - second_perm; // 2 sub + // now lets choose evens + return el_mergee(first_ret, second_ret); // 2 mergee's + } + + Vectorized abs_2_() const { + auto a = (*this).elwise_mult(*this); + auto permuted = a.el_swapped(); + a = a + permuted; + return a.el_mergee(); + } + + Vectorized abs_() const { + auto vi = el_mergeo(); + auto vr = el_mergee(); + return { + Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0), + Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)}; + } + + Vectorized abs() const { + return abs_() & real_mask; + } + + Vectorized real_() const { + return *this & real_mask; + } + Vectorized real() const { + return *this & real_mask; + } + Vectorized imag_() const { + return *this & imag_mask; + } + Vectorized imag() const { + // we can use swap_mask or sldwi + auto ret = imag_(); + return { + vec_sldw(ret._vec0, ret._vec0, 3), vec_sldw(ret._vec1, ret._vec1, 3)}; + } + + Vectorized conj_() const { + return *this ^ isign_mask; + } + Vectorized conj() const { + return *this ^ isign_mask; + } + + Vectorized log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + + Vectorized log2() const { + // log2eB_inv + auto ret = log(); + return ret.elwise_mult(log2e_inv); + } + Vectorized log10() const { + auto ret = log(); + return ret.elwise_mult(log10e_inv); + } + + Vectorized log1p() const { + return map(std::log1p); + } + + Vectorized el_swapped() const { + vfloat32 v0 = vec_perm(_vec0, _vec0, swap_mask); + vfloat32 v1 = vec_perm(_vec1, _vec1, swap_mask); + return {v0, v1}; + } + + Vectorized el_mergee() const { + // as mergee phased in , we can use vec_perm with mask + return {vec_mergee(_vecb0, _vecb0), vec_mergee(_vecb1, _vecb1)}; + } + + Vectorized el_mergeo() const { + // as mergeo phased in , we can use vec_perm with mask + return {vec_mergeo(_vecb0, _vecb0), vec_mergeo(_vecb1, _vecb1)}; + } + + Vectorized el_madd( + const Vectorized& multiplier, + const Vectorized& val) const { + return { + vec_madd(_vec0, multiplier._vec0, val._vec0), + vec_madd(_vec1, multiplier._vec1, val._vec1)}; + } + + static Vectorized el_mergee( + const Vectorized& first, + const Vectorized& second) { + return { + vec_mergee(first._vecb0, second._vecb0), + vec_mergee(first._vecb1, second._vecb1)}; + } + + static Vectorized el_mergeo( + const Vectorized& first, + const Vectorized& second) { + return { + vec_mergeo(first._vecb0, second._vecb0), + vec_mergeo(first._vecb1, second._vecb1)}; + } + + Vectorized angle_() const { + // angle = atan2(b/a) + // auto b_a = _mm256_permute_ps(values, 0xB1); // b a + // return Sleef_atan2f8_u10(values, b_a); // 90-angle angle + Vectorized ret; + for (int i = 0; i < 4; i += 2) { + ret._vec0[i] = std::atan2(_vec0[i + 1], _vec0[i]); + ret._vec1[i] = std::atan2(_vec1[i + 1], _vec1[i]); + } + return ret; + } + + Vectorized angle() const { + return angle_() & real_mask; + } + + Vectorized sin() const { + return map(std::sin); + } + Vectorized sinh() const { + return map(std::sinh); + } + Vectorized cos() const { + return map(std::cos); + } + Vectorized cosh() const { + return 
map(std::cosh); + } + Vectorized ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vectorized floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vectorized neg() const { + auto z = Vectorized(zero); + return z - *this; + } + Vectorized round() const { + return {vec_round(_vec0), vec_round(_vec1)}; + } + Vectorized tan() const { + return map(std::tan); + } + Vectorized tanh() const { + return map(std::tanh); + } + Vectorized trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vectorized elwise_sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + + Vectorized sqrt() const { + return map(std::sqrt); + } + + Vectorized reciprocal() const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() = c/abs_2() + // im = (bc - ad)/abs_2() = d/abs_2() + auto c_d = *this ^ isign_mask; // c -d + auto abs = abs_2_(); + return c_d.elwise_div(abs); + } + + Vectorized rsqrt() const { + return sqrt().reciprocal(); + } + + Vectorized pow(const Vectorized& exp) const { + __at_align__ ComplexFlt x_tmp[size()]; + __at_align__ ComplexFlt y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (const auto i : c10::irange(size())) { + x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); + } + return loadu(x_tmp); + } + + Vectorized atan() const { + // atan(x) = i/2 * ln((i + z)/(i - z)) + auto ione = Vectorized(imag_one); + auto sum = ione + *this; + auto sub = ione - *this; + auto ln = (sum / sub).log(); // ln((i + z)/(i - z)) + return ln * imag_half; // i/2*ln() + } + Vectorized atanh() const { + return map(std::atanh); + } + + Vectorized acos() const { + // acos(x) = pi/2 - asin(x) + return Vectorized(pi_2) - asin(); + } + + Vectorized inline operator*( + const Vectorized& b) const { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + +#if 1 + // this is more vsx friendly than simulating horizontal from x86 + + auto vi = b.el_mergeo(); + auto vr = b.el_mergee(); + vi = vi ^ rsign_mask; + auto ret = elwise_mult(vr); + auto 
vx_swapped = el_swapped(); + ret = vx_swapped.elwise_mult(vi) + ret; + return ret; + +#else + + auto ac_bd = elwise_mult(b); + auto d_c = b.el_swapped(); + d_c = d_c ^ isign_mask; + auto ad_bc = elwise_mult(d_c); + auto ret = horizontal_sub_permD8(ac_bd, ad_bc); + return ret; +#endif + } + + Vectorized inline operator/( + const Vectorized& b) const { +#if 1 + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + this->store(tmp1); + b.store(tmp2); + + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return loadu(out); +#else + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, sign_mask), vec_andc(b._vec1, sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/s + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ rsign_mask; // -d/sc c/sc + auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); + return ret; +#endif + } + + Vectorized asin() const { + // asin(x) + // = -i*ln(iz + sqrt(1 -z^2)) + // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + +#if 1 + auto conj = conj_(); + auto b_a = conj.el_swapped(); + auto ab = conj.elwise_mult(b_a); + auto im = ab + ab; + auto val_2 = (*this).elwise_mult(*this); + auto val_2_swapped = val_2.el_swapped(); + auto re = horizontal_sub_permD8(val_2, val_2_swapped); + re = Vectorized(one) - re; + auto root = el_blend<0xAA>(re, im).sqrt(); + auto ln = (b_a + root).log(); + return ln.el_swapped().conj(); +#else + return 
map(std::asin); +#endif + } + + Vectorized exp() const { + return map(std::exp); + } + Vectorized exp2() const { + return map(exp2_impl); + } + Vectorized expm1() const { + return map(std::expm1); + } + + Vectorized eq(const Vectorized& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & one; + } + Vectorized ne(const Vectorized& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & one; + } + + Vectorized sgn() const { + return map(at::native::sgn_impl); + } + + Vectorized operator<(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized operator<=(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized operator>(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized operator>=(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + DEFINE_MEMBER_OP(operator==, ComplexFlt, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, ComplexFlt, vec_cmpne) + + DEFINE_MEMBER_OP(operator+, ComplexFlt, vec_add) + DEFINE_MEMBER_OP(operator-, ComplexFlt, vec_sub) + DEFINE_MEMBER_OP(operator&, ComplexFlt, vec_and) + DEFINE_MEMBER_OP(operator|, ComplexFlt, vec_or) + DEFINE_MEMBER_OP(operator^, ComplexFlt, vec_xor) + // elementwise helpers + DEFINE_MEMBER_OP(elwise_mult, ComplexFlt, vec_mul) + DEFINE_MEMBER_OP(elwise_div, ComplexFlt, vec_div) + DEFINE_MEMBER_OP(elwise_gt, ComplexFlt, vec_cmpgt) + DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge) + DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt) + DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple) + 
DEFINE_MEMBER_OP(elwise_max, ComplexFlt, vec_max) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); + // auto max = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_lt(abs_b); + auto max = Vectorized::elwise_blendv(a, b, mask); + + return max; + // Exploit the fact that all-ones is a NaN. + // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(max, isnan); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); + // auto min = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_gt(abs_b); + auto min = Vectorized::elwise_blendv(a, b, mask); + return min; + // Exploit the fact that all-ones is a NaN. + // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(min, isnan); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +template <> 
+Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + // (a + ib) * (c + id) = (ac - bd) + i(ad + bc) + // Split into real and imaginary parts + auto a_real = a.el_mergee(); // real part of a + auto a_imag = a.el_mergeo(); // imag part of a + auto b_real = b.el_mergee(); // real part of b + auto b_imag = b.el_mergeo(); // imag part of b + + auto b_imag_neg = b_imag ^ rsign_mask; + // Compute components + auto ac = a_real.elwise_mult(b_real); // real * real + auto bd = a_imag.elwise_mult(b_imag_neg); // imag * imag + auto ad = a_real.elwise_mult(b_imag); // real * imag + auto bc = a_imag.elwise_mult(b_real); // imag * real + + // Real = ac - bd (fix the negative bd part) + auto real = ac + bd; // Real part calculation + auto imag = ad + bc; // Imaginary part calculation + + // Step 1: Extract from real and imag + __vector float r0 = real.vec0(); // {r0, r1, r2, r3} + __vector float i0 = imag.vec0(); // {i0, i1, i2, i3} + + __vector float r1 = real.vec1(); // imag[0..3] + __vector float i1 = imag.vec1(); // imag[4..7] + + __vector unsigned char perm_lo = { + 0, + 1, + 2, + 3, // r0 + 16, + 17, + 18, + 19, // + 8, + 9, + 10, + 11, // r1 + 24, + 25, + 26, + 27}; + __vector float v0 = + vec_perm(r0, i0, perm_lo); // Interleave r0 and i0, r1 and i1 + __vector float v1 = vec_perm(r1, i1, perm_lo); + Vectorized result(v0, v1); + return result; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + // Take absolute values of real and imaginary parts of b + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : + c10::irange(Vectorized>:: + size())) { //{Vectorized>::size())) + //{ + out[i] = tmp1[i] / tmp2[i]; + } + return Vectorized::loadu(out); +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else 
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..63a9e5e2f1ad1328a85db5e0228b81dfd41ab215 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -0,0 +1,520 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include + +namespace at { +namespace vec { + +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + union { + struct { + vfloat64 _vec0; + vfloat64 _vec1; + }; + struct { + vbool64 _vecb0; + vbool64 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = double; + using vec_internal_type = vfloat64; + using vec_internal_mask_type = vbool64; + using size_type = int; + static constexpr size_type size() { + return 4; + } + Vectorized() {} + C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vectorized(double scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vectorized( + double scalar1, + double scalar2, + double scalar3, + double scalar4) + : _vec0{vfloat64{scalar1, scalar2}}, _vec1{vfloat64{scalar3, scalar4}} {} + 
C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + int zero_mask() const { + auto cmp = (*this == vd_zero); + return (cmp._vecb0[0] & 1) | (cmp._vecb0[1] & 2) | (cmp._vecb1[0] & 4) | + (cmp._vecb1[1] & 8); + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return {a._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return {b._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + const vbool64 
mask_2nd = VsxDblMask2(mask); + return { + (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vectorized C10_ALWAYS_INLINE blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // the mask used here returned by comparison of vec256 + + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + + static Vectorized C10_ALWAYS_INLINE + set(const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + + return b; + } + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + Vectorized map(double (*const f)(double)) const { + Vectorized ret; + for (const auto i : c10::irange(size() / 2)) { + ret._vec0[i] = 
f(_vec0[i]); + } + for (const auto i : c10::irange(size() / 2)) { + ret._vec1[i] = f(_vec1[i]); + } + return ret; + } + + Vectorized mapbi( + double (*const f)(double, double), + const Vectorized& other) const { + Vectorized ret; + for (const auto i : c10::irange(size() / 2)) { + ret._vec0[i] = f(_vec0[i], other._vec0[i]); + } + for (const auto i : c10::irange(size() / 2)) { + ret._vec1[i] = f(_vec1[i], other._vec1[i]); + } + return ret; + } + Vectorized C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE acos() const { + return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE acosh() const { + return {Sleef_acoshd2_u10(_vec0), Sleef_acoshd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asin() const { + return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asinh() const { + return {Sleef_asinhd2_u10(_vec0), Sleef_asinhd2_u10(_vec1)}; + } + Vectorized atan() const { + return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)}; + } + Vectorized atanh() const { + return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)}; + } + Vectorized atan2(const Vectorized& b) const { + return { + Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)}; + } + Vectorized copysign(const Vectorized& sign) const { + return { + Sleef_copysignd2(_vec0, sign._vec0), + Sleef_copysignd2(_vec1, sign._vec1)}; + } + Vectorized erf() const { + return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)}; + } + Vectorized erfc() const { + return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp() const { + return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp2() const { + return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)}; + } + Vectorized expm1() const { + return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE 
exp_u20() const { + return exp(); + } + Vectorized C10_ALWAYS_INLINE fexp_u20() const { + return exp(); + } + + Vectorized lgamma() const __ubsan_ignore_undefined__ { + return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; + } + + Vectorized erfinv() const { + return map(calc_erfinv); + } + + Vectorized angle() const { + auto tmp = blendv( + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized{0}; + } + Vectorized conj() const { + return *this; + } + + Vectorized C10_ALWAYS_INLINE log() const { + return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log10() const { + return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log1p() const { + return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log2() const { + return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE cos() const { + return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE cosh() const { + return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE round() const { + return {vec_rint(_vec0), vec_rint(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE sin() const { + return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE sinh() const { + return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tan() const { + return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)}; + } + Vectorized 
C10_ALWAYS_INLINE tanh() const { + return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE frac() const { + return *this - trunc(); + } + + Vectorized C10_ALWAYS_INLINE sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE reciprocal() const { + return { + vec_div(vd_one, _vec0), // vec_re(_vec0) is estimated one. + vec_div(vd_one, _vec1)}; + } + Vectorized C10_ALWAYS_INLINE rsqrt() const { + return sqrt().reciprocal(); + } + + Vectorized C10_ALWAYS_INLINE pow(const Vectorized& b) const { + return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)}; + } + Vectorized C10_ALWAYS_INLINE fmod(const Vectorized& b) const { + return {Sleef_fmodd2(_vec0, b._vec0), Sleef_fmodd2(_vec1, b._vec1)}; + } + + Vectorized hypot(const Vectorized& b) const { + return { + Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)}; + } + + Vectorized nextafter(const Vectorized& b) const { + return { + Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)}; + } + + Vectorized igamma(const Vectorized& x) const { + return mapbi(calc_igamma, x); + } + + Vectorized igammac(const Vectorized& x) const { + return mapbi(calc_igammac, x); + } + + Vectorized i0() const { + return map(calc_i0); + } + + Vectorized i0e() const { + return map(calc_i0e); + } + + Vectorized digamma() const { + return map(calc_digamma); + } + + Vectorized _nor() const { + return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)}; + } + + Vectorized isnan() const { + auto x = *this; + auto ret = (x == x); + return ret._nor(); + } + bool has_inf_nan() const { + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } 
+ + DEFINE_MEMBER_OP(operator==, double, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, double, vec_cmpne) + DEFINE_MEMBER_OP(operator<, double, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, double, vec_cmple) + DEFINE_MEMBER_OP(operator>, double, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, double, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, double, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, double, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, double, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, double, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, double, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, double, vec_cmpge) + DEFINE_MEMBER_OP(operator+, double, vec_add) + DEFINE_MEMBER_OP(operator-, double, vec_sub) + DEFINE_MEMBER_OP(operator*, double, vec_mul) + DEFINE_MEMBER_OP(operator/, double, vec_div) + DEFINE_MEMBER_OP(maximum, double, vec_max_nan2) + DEFINE_MEMBER_OP(minimum, double, vec_min_nan2) + DEFINE_MEMBER_OP(operator&, double, vec_and) + DEFINE_MEMBER_OP(operator|, double, vec_or) + DEFINE_MEMBER_OP(operator^, double, vec_xor) + DEFINE_MEMBER_TERNARY_OP(madd, double, vec_madd) +}; +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + 
vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..f26ea32fe0b1e8d2ab91149b28b002ceadfa1f3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -0,0 +1,553 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] + +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + union { + struct { + vfloat32 _vec0; + vfloat32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = float; + using vec_internal_type = vfloat32; + using vec_internal_mask_type 
= vbool32; + using size_type = int; + + static constexpr size_type size() { + return 8; + } + Vectorized() {} + + C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vectorized(float scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vectorized( + float scalar1, + float scalar2, + float scalar3, + float scalar4, + float scalar5, + float scalar6, + float scalar7, + float scalar8) + : _vec0{vfloat32{scalar1, scalar2, scalar3, scalar4}}, + _vec1{vfloat32{scalar5, scalar6, scalar7, scalar8}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_1st = VsxMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_1st = VsxMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static 
std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_2nd = VsxMask2(mask); + // generated masks + return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_2nd = VsxMask2(mask); + // generated masks + return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool32 mask_1st = VsxMask1(mask); + const vbool32 mask_2nd = VsxMask2(mask); + return { + (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vectorized C10_ALWAYS_INLINE blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // the mask used here returned by comparison of vec256 + // assuming this we can use the same mask directly with vec_sel + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + + return b; + } + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, 
reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + + Vectorized map(float (*const f)(float)) const { + Vectorized ret; + for (int i = 0; i < size() / 2; i++) { + ret._vec0[i] = f(_vec0[i]); + } + for (int i = 0; i < size() / 2; i++) { + ret._vec1[i] = f(_vec1[i]); + } + return ret; + } + + Vectorized mapbi( + float (*const f)(float, float), + const Vectorized& other) const { + Vectorized ret; + for (int i = 0; i < size() / 2; i++) { + ret._vec0[i] = f(_vec0[i], other._vec0[i]); + } + for (int i = 0; i < size() / 2; i++) { + ret._vec1[i] = f(_vec1[i], other._vec1[i]); + } + return ret; + } + + Vectorized _nor() const { + return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)}; + } + + Vectorized isnan() const { + auto x = *this; + auto ret = (x == x); + return ret._nor(); + } + + bool has_inf_nan() const { + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } + + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + //__m256 cmp = _mm256_cmp_ps(values, 
_mm256_set1_ps(0.0f), _CMP_EQ_OQ); + auto cmp = (*this == zero); + // return _mm256_movemask_ps(cmp); + // possible simulation //mask= lvsl ( 0 ) vbpermq( vec, mask <<5) + vuint64 result0 = vec_vbpermq((vuint8)cmp._vecb0, mask_zero_bits); + vuint64 result1 = vec_vbpermq((vuint8)cmp._vecb1, mask_zero_bits); + return (result0[1] >> 12 | (result1[1] >> 8)); + } + + Vectorized C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE acos() const { + return {Sleef_acosf4_u10(_vec0), Sleef_acosf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE acosh() const { + return {Sleef_acoshf4_u10(_vec0), Sleef_acoshf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asin() const { + return {Sleef_asinf4_u10(_vec0), Sleef_asinf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asinh() const { + return {Sleef_asinhf4_u10(_vec0), Sleef_asinhf4_u10(_vec1)}; + } + Vectorized atan() const { + return {Sleef_atanf4_u10(_vec0), Sleef_atanf4_u10(_vec1)}; + } + Vectorized atanh() const { + return {Sleef_atanhf4_u10(_vec0), Sleef_atanhf4_u10(_vec1)}; + } + Vectorized atan2(const Vectorized& b) const { + return { + Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)}; + } + Vectorized copysign(const Vectorized& sign) const { + return { + Sleef_copysignf4(_vec0, sign._vec0), + Sleef_copysignf4(_vec1, sign._vec1)}; + } + Vectorized lgamma() const { + return {Sleef_lgammaf4_u10(_vec0), Sleef_lgammaf4_u10(_vec1)}; + } + Vectorized erf() const { + return {Sleef_erff4_u10(_vec0), Sleef_erff4_u10(_vec1)}; + } + + Vectorized erfc() const { + return {Sleef_erfcf4_u15(_vec0), Sleef_erfcf4_u15(_vec1)}; + } + + Vectorized erfinv() const { + return map(calc_erfinv); + } + + Vectorized angle() const { + auto tmp = blendv( + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized{0}; + } + 
Vectorized conj() const { + return *this; + } + + Vectorized C10_ALWAYS_INLINE exp() const { + return {Sleef_expf4_u10(_vec0), Sleef_expf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp2() const { + return {Sleef_exp2f4_u10(_vec0), Sleef_exp2f4_u10(_vec1)}; + } + Vectorized expm1() const { + return {Sleef_expm1f4_u10(_vec0), Sleef_expm1f4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp_u20() const { + return exp(); + } + Vectorized C10_ALWAYS_INLINE fexp_u20() const { + return exp(); + } + + Vectorized C10_ALWAYS_INLINE log() const { + return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log10() const { + return {Sleef_log10f4_u10(_vec0), Sleef_log10f4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log1p() const { + return {Sleef_log1pf4_u10(_vec0), Sleef_log1pf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log2() const { + return {Sleef_log2f4_u10(_vec0), Sleef_log2f4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE cos() const { + return {Sleef_cosf4_u10(_vec0), Sleef_cosf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE cosh() const { + return {Sleef_coshf4_u10(_vec0), Sleef_coshf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE round() const { + return {vec_round(_vec0), vec_round(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE sin() const { + return {Sleef_sinf4_u10(_vec0), Sleef_sinf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE sinh() const { + return {Sleef_sinhf4_u10(_vec0), Sleef_sinhf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tan() const { + return {Sleef_tanf4_u10(_vec0), Sleef_tanf4_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tanh() const { + return {Sleef_tanhf4_u10(_vec0), Sleef_tanhf4_u10(_vec1)}; + } + 
Vectorized C10_ALWAYS_INLINE trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE frac() const { + return *this - trunc(); + } + + Vectorized C10_ALWAYS_INLINE sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE reciprocal() const { + return Vectorized(one) / (*this); + } + Vectorized C10_ALWAYS_INLINE rsqrt() const { + return sqrt().reciprocal(); + } + + Vectorized C10_ALWAYS_INLINE pow(const Vectorized& exp) const { + return { + Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)}; + } + + Vectorized fmod(const Vectorized& b) const { + return {Sleef_fmodf4(_vec0, b._vec0), Sleef_fmodf4(_vec1, b._vec1)}; + } + + Vectorized hypot(const Vectorized& b) const { + return { + Sleef_hypotf4_u05(_vec0, b._vec0), Sleef_hypotf4_u05(_vec1, b._vec1)}; + } + + Vectorized nextafter(const Vectorized& b) const { + return { + Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)}; + } + + Vectorized igamma(const Vectorized& x) const { + return mapbi(calc_igamma, x); + } + + Vectorized igammac(const Vectorized& x) const { + return mapbi(calc_igammac, x); + } + + Vectorized i0() const { + return map(calc_i0); + } + + Vectorized i0e() const { + return map(calc_i0e); + } + + Vectorized digamma() const { + return map(calc_digamma); + } + + DEFINE_MEMBER_OP(operator==, float, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, float, vec_cmpne) + DEFINE_MEMBER_OP(operator<, float, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, float, vec_cmple) + DEFINE_MEMBER_OP(operator>, float, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, float, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, float, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, float, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, float, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, float, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, float, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, float, vec_cmpge) + DEFINE_MEMBER_OP(operator+, float, vec_add) + 
DEFINE_MEMBER_OP(operator-, float, vec_sub) + DEFINE_MEMBER_OP(operator*, float, vec_mul) + DEFINE_MEMBER_OP(operator/, float, vec_div) + DEFINE_MEMBER_OP(maximum, float, vec_max_nan2) + DEFINE_MEMBER_OP(minimum, float, vec_min_nan2) + DEFINE_MEMBER_OP(operator&, float, vec_and) + DEFINE_MEMBER_OP(operator|, float, vec_or) + DEFINE_MEMBER_OP(operator^, float, vec_xor) + DEFINE_MEMBER_TERNARY_OP(madd, float, vec_madd) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // 
namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..5150ccf3a2cd6df9c05e1f2b1184912ebd9ad7fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h @@ -0,0 +1,422 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + union { + struct { + vint16 _vec0; + vint16 _vec1; + }; + struct { + vbool16 _vecb0; + vbool16 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = int16_t; + using vec_internal_type = vint16; + using vec_internal_mask_type = vbool16; + using size_type = int; + static constexpr size_type size() { + return 16; + } + Vectorized() {} + C10_ALWAYS_INLINE Vectorized(vint16 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) + : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vectorized(int16_t scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + + C10_ALWAYS_INLINE Vectorized( + int16_t scalar1, + int16_t scalar2, + int16_t scalar3, + int16_t scalar4, + int16_t scalar5, + int16_t 
scalar6, + int16_t scalar7, + int16_t scalar8, + int16_t scalar9, + int16_t scalar10, + int16_t scalar11, + int16_t scalar12, + int16_t scalar13, + int16_t scalar14, + int16_t scalar15, + int16_t scalar16) + : _vec0{vint16{ + scalar1, + scalar2, + scalar3, + scalar4, + scalar5, + scalar6, + scalar7, + scalar8}}, + _vec1{vint16{ + scalar9, + scalar10, + scalar11, + scalar12, + scalar13, + scalar14, + scalar15, + scalar16}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t<(mask & 65535) == 65535, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t<(mask > 0 && mask < 255), Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr int16_t g0 = (mask & 1) * 0xffff; + constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff; + const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7}; + + return {(vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), a._vec1}; + } + + template + static std::enable_if_t< + (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)), + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr int16_t g0_2 = (mask & 1) * 0xffff; + constexpr 
int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; + + const vint16 mask_2nd = + vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; + // generated masks + return {b._vec0, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)), + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr int16_t mask2 = (mask & 65535) >> 16; + constexpr int16_t g0_2 = (mask & 1) * 0xffff; + constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; + + const vint16 mask_2nd = + vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; + // generated masks + return {a, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) && + ((mask & 255) != 255)), + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr int16_t g0 = (mask & 1) * 0xffff; + constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6 = ((mask & 64) >> 
6) * 0xffff; + constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff; + constexpr int16_t mask2 = (mask & 65535) >> 16; + constexpr int16_t g0_2 = (mask & 1) * 0xffff; + constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; + + const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7}; + const vint16 mask_2nd = + vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; + // generated masks + return { + (vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), + (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; + } + + static Vectorized C10_ALWAYS_INLINE blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // the mask used here returned by comparison of vec256 + // assuming this we can use the same mask directly with vec_sel + // warning intel style mask will not work properly + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + + template + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + 
return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; + + Vectorized angle() const { + return blendv( + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized{0}; + } + Vectorized conj() const { + return *this; + } + + Vectorized C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + DEFINE_MEMBER_UNARY_OP(operator~, int16_t, vec_not) + DEFINE_MEMBER_OP(operator==, int16_t, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, int16_t, vec_cmpne) + 
DEFINE_MEMBER_OP(operator<, int16_t, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, int16_t, vec_cmple) + DEFINE_MEMBER_OP(operator>, int16_t, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, int16_t, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, int16_t, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, int16_t, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, int16_t, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, int16_t, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, int16_t, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, int16_t, vec_cmpge) + DEFINE_MEMBER_OP(operator+, int16_t, vec_add) + DEFINE_MEMBER_OP(operator-, int16_t, vec_sub) + DEFINE_MEMBER_OP(operator*, int16_t, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int16_t, /) + DEFINE_MEMBER_OP(maximum, int16_t, vec_max) + DEFINE_MEMBER_OP(minimum, int16_t, vec_min) + DEFINE_MEMBER_OP(operator&, int16_t, vec_and) + DEFINE_MEMBER_OP(operator|, int16_t, vec_or) + DEFINE_MEMBER_OP(operator^, int16_t, vec_xor) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +DEFINE_SHIFT_FUNCS(int16_t) + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const 
Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..baa0a95a9bd194a8a4f7cc3a1518a77d12bd8e58 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h @@ -0,0 +1,352 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + union { + struct { + vint32 _vec0; + vint32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = int32_t; + using vec_internal_type = vint32; + using vec_internal_mask_type = vbool32; + using size_type = int; + static constexpr size_type size() { + return 8; + } + Vectorized() {} + C10_ALWAYS_INLINE 
Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vectorized(int32_t scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vectorized( + int32_t scalar1, + int32_t scalar2, + int32_t scalar3, + int32_t scalar4, + int32_t scalar5, + int32_t scalar6, + int32_t scalar7, + int32_t scalar8) + : _vec0{vint32{scalar1, scalar2, scalar3, scalar4}}, + _vec1{vint32{scalar5, scalar6, scalar7, scalar8}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t<(mask & 255) == 255, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t<(mask > 0 && mask < 15), Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint32_t g0 = (mask & 1) * 0xffffffff; + constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + const vbool32 mask_1st = (vbool32){g0, g1, g2, g3}; + + return {(vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), a._vec1}; + } + + template + static std::enable_if_t< + (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)), + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint32_t mask2 
= (mask & 255) >> 4; + constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; + constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; + + const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; + // generated masks + return {b._vec0, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)), + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint32_t mask2 = (mask & 255) >> 4; + constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; + constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; + + const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; + // generated masks + return {a, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) && + ((mask & 15) != 15)), + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint32_t g0 = (mask & 1) * 0xffffffff; + constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + constexpr uint32_t mask2 = (mask & 255) >> 4; + constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; + constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; + + const vbool32 mask_1st = (vbool32){g0, g1, g2, g3}; + const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; + // generated masks + return { + (vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), + 
(vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; + } + + static Vectorized C10_ALWAYS_INLINE blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // the mask used here returned by comparison of vec256 + // assuming this we can use the same mask directly with vec_sel + // warning intel style mask will not work properly + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + + template + static Vectorized arange( + int32_t base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + + return b; + } + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * 
sizeof(value_type)); + } + } + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; + + Vectorized angle() const { + return blendv( + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized{0}; + } + Vectorized conj() const { + return *this; + } + + Vectorized C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + DEFINE_MEMBER_UNARY_OP(operator~, int32_t, vec_not) + DEFINE_MEMBER_OP(operator==, int32_t, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, int32_t, vec_cmpne) + DEFINE_MEMBER_OP(operator<, int32_t, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, int32_t, vec_cmple) + DEFINE_MEMBER_OP(operator>, int32_t, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, int32_t, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, int32_t, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, int32_t, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, int32_t, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, int32_t, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, int32_t, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, int32_t, vec_cmpge) + DEFINE_MEMBER_OP(operator+, int32_t, vec_add) + DEFINE_MEMBER_OP(operator-, int32_t, vec_sub) + DEFINE_MEMBER_OP(operator*, int32_t, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int32_t, /) + DEFINE_MEMBER_OP(maximum, int32_t, vec_max) + DEFINE_MEMBER_OP(minimum, int32_t, vec_min) + DEFINE_MEMBER_OP(operator&, int32_t, vec_and) + DEFINE_MEMBER_OP(operator|, int32_t, vec_or) + DEFINE_MEMBER_OP(operator^, int32_t, vec_xor) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +DEFINE_SHIFT_FUNCS(int32_t) + +template <> +Vectorized C10_ALWAYS_INLINE 
+operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..c3012293b3c7b0c10855f86f6c747b50e4ee1a17 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h @@ -0,0 +1,306 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + union { + struct { + vint64 _vec0; + vint64 _vec1; + }; + struct { + vbool64 _vecb0; + vbool64 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = int64_t; + using vec_internal_type = vint64; + using vec_internal_mask_type = vbool64; + using size_type = int; + using ElementType = signed long long; + static constexpr size_type size() { + return 4; + } + Vectorized() {} + C10_ALWAYS_INLINE Vectorized(vint64 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vint64 v1, vint64 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vectorized(int64_t scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vectorized( + int64_t scalar1, + int64_t scalar2, + int64_t scalar3, + int64_t scalar4) + : _vec0{vint64{scalar1, scalar2}}, _vec1{vint64{scalar3, scalar4}} {} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return 
_vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t<(mask & 15) == 15, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t<(mask > 0 && mask < 3), Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; + const vbool64 mask_1st = (vbool64){g0, g1}; + return {(vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st), a._vec1}; + } + + template + static std::enable_if_t<(mask > 3) && (mask & 3) == 0, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; + constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff; + + const vbool64 mask_2nd = (vbool64){g0_2, g1_2}; + return {a._vec0, (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 3) && (mask & 3) != 0 && (mask & 15) != 15, + Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; + constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; + constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff; + + const vbool64 mask_1st = (vbool64){g0, g1}; + const vbool64 mask_2nd = (vbool64){g0_2, g1_2}; + return { + (vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st), + (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)}; + 
} + + static Vectorized C10_ALWAYS_INLINE blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // the mask used here returned by comparison of vec256 + + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + template + static Vectorized arange( + int64_t base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + + static Vectorized C10_ALWAYS_INLINE + set(const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + + return b; + } + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + static_assert(sizeof(double) == sizeof(value_type)); + const double* dptr = reinterpret_cast(ptr); + return {// treat it as double load + (vint64)vec_vsx_ld(offset0, dptr), + (vint64)vec_vsx_ld(offset16, dptr)}; + } + + __at_align__ double tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return { + (vint64)vec_vsx_ld(offset0, tmp_values), + (vint64)vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + double* dptr = reinterpret_cast(ptr); + vec_vsx_st((vfloat64)_vec0, offset0, dptr); + vec_vsx_st((vfloat64)_vec1, offset16, dptr); + } else if (count > 0) { + __at_align__ double tmp_values[size()]; + vec_vsx_st((vfloat64)_vec0, offset0, tmp_values); + vec_vsx_st((vfloat64)_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + + Vectorized angle() const { + return blendv( + Vectorized(0), + Vectorized(c10::pi), + *this < 
Vectorized(0)); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized{0}; + } + Vectorized conj() const { + return *this; + } + + Vectorized C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + DEFINE_MEMBER_UNARY_OP(operator~, int64_t, vec_not) + DEFINE_MEMBER_OP(operator==, int64_t, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, int64_t, vec_cmpne) + DEFINE_MEMBER_OP(operator<, int64_t, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, int64_t, vec_cmple) + DEFINE_MEMBER_OP(operator>, int64_t, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, int64_t, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, int64_t, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, int64_t, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, int64_t, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, int64_t, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, int64_t, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, int64_t, vec_cmpge) + DEFINE_MEMBER_OP(operator+, int64_t, vec_add) + DEFINE_MEMBER_OP(operator-, int64_t, vec_sub) + DEFINE_MEMBER_OP(operator*, int64_t, vec_mul) + DEFINE_MEMBER_OP(operator/, int64_t, vec_div) + DEFINE_MEMBER_OP(maximum, int64_t, vec_max) + DEFINE_MEMBER_OP(minimum, int64_t, vec_min) + DEFINE_MEMBER_OP(operator&, int64_t, vec_and) + DEFINE_MEMBER_OP(operator|, int64_t, vec_or) + DEFINE_MEMBER_OP(operator^, int64_t, vec_xor) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +DEFINE_SHIFT_FUNCS(int64_t) + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const 
Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..f02be95efa692b75a8ba7349492d58177b66a978 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h @@ -0,0 +1,74 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_VSX) + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; + + for (int i = 0; i < N; ++i) { + auto tmp = vec_mask[i]; + result[i] = reinterpret_cast&>(tmp); + } + return VecMask(result); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; + + for (int i = 0; i < N; ++i) { + auto tmp = vec_mask[i]; + result[i] = reinterpret_cast&>(tmp); + } + return VecMask(result); + } +}; + +template +struct VecMaskCast< + int64_t, + dst_n, + mask_t, + mask_n, + typename std::enable_if_t< + (dst_n == 2 * mask_n) && + (std::is_same_v || std::is_same_v)>> { + static inline VecMask apply( + const VecMask& vec_mask) { + VectorizedN result; + + auto int_mask = vec_mask.template cast(); + + for (int i = 0; i < mask_n; ++i) { + VectorizedN in_int_n; + in_int_n[0] = int_mask[i]; + + auto int64_vecs = convert(in_int_n); + + result[2 * i] = int64_vecs[0]; + result[2 * i + 1] = int64_vecs[1]; + } + return VecMask(result); + } +}; + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION 
is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..692607d4d5254353f74d43ce88404cb96d9d770b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h @@ -0,0 +1,306 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +// This file defines Vectorized<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vectorized, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vectorized -> 1x Vectorized +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
+ +namespace at { +namespace vec { +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; +template <> +struct Vectorized { + private: + union { + struct { + vint32 _vec0; + vint32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + Vectorized() {} + + using size_type = int; + static constexpr size_type size() { + return 8; + } + + static constexpr size_t float_num_vecs() { + return 1; + } + static constexpr int int_num_vecs() { + return 1; + } + using float_vec_return_type = std::array, 1>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + using vec_internal_type = vint32; + using vec_internal_mask_type = vbool32; + C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} + + Vectorized(const c10::qint32& val) + : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {} + + static Vectorized C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + 
std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + vfloat32 float_vals0 = vec_float(_vec0); + vfloat32 float_vals1 = vec_float(_vec1); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_sub_zero_point_0 = vec_sub(float_vals0, zero_point_vec0); + vfloat32 vec_sub_zero_point_1 = vec_sub(float_vals1, zero_point_vec1); + Vectorized vf0 = { + vec_mul(scale_vec0, vec_sub_zero_point_0), + vec_mul(scale_vec1, vec_sub_zero_point_1)}; + return {vf0}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + vfloat32 float_vals0 = vec_float(_vec0); + vfloat32 float_vals1 = vec_float(_vec1); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + vfloat32 zero_point0 = zero_point.vec0(); + vfloat32 zero_point1 = zero_point.vec1(); + return {Vectorized{ + (float_vals0 - zero_point0) * scale_vec0, + (float_vals1 - zero_point1) * scale_vec1}}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + Vectorized retval; + + const vint32 vmin = vec_splats(std::numeric_limits::min()); + const vint32 vmax = vec_splats(std::numeric_limits::max()); + vfloat32 inverse_scale_v = vec_splats(inverse_scale); + vfloat32 vec_zero_point = vec_splats((float)(zero_point)); + Vectorized vf0 = rhs[0]; + + vfloat32 vecf0 = vf0.vec0(); + vfloat32 vecf1 = vf0.vec1(); + vecf0 = vec_mul(vecf0, inverse_scale_v); + vecf1 = vec_mul(vecf1, inverse_scale_v); + vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); + vecf1 = 
vec_add(vec_rint(vecf1), vec_zero_point); + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + + veci0 = vec_max(veci0, vmin); + veci1 = vec_max(veci1, vmin); + veci0 = vec_min(veci0, vmax); + veci1 = vec_min(veci1, vmax); + + return {veci0, veci1}; + } + + Vectorized relu(Vectorized zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) const { + vint32 max0 = vec_max(_vec0, zero_point._vec0); + vint32 max1 = vec_max(_vec1, zero_point._vec1); + return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {*this - b}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + const vint32 vmin = vec_splats(std::numeric_limits::min()); + const vint32 vmax = vec_splats(std::numeric_limits::max()); + vfloat32 vec_mult = vec_splats(multiplier); + vint32 vec_zero_point = vec_splats(zero_point); + Vectorized vi = inp[0]; + vfloat32 vecf0 = vec_float(vi.vec0()); + vfloat32 vecf1 = vec_float(vi.vec1()); + + vecf0 = vec_mul(vecf0, vec_mult); + vecf1 = vec_mul(vecf1, vec_mult); + + vecf0 = vec_rint(vecf0); + vecf1 = vec_rint(vecf1); + + vint32 veci0 = vec_add(vec_signed(vecf0), vec_zero_point); + vint32 veci1 = vec_add(vec_signed(vecf1), vec_zero_point); + + veci0 = vec_max(veci0, vmin); + veci1 = vec_max(veci1, vmin); + veci0 = vec_min(veci0, vmax); + veci1 = vec_min(veci1, vmax); + + return {veci0, veci1}; + } + + DEFINE_MEMBER_OP(operator==, c10::qint32, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, c10::qint32, vec_cmpne) + DEFINE_MEMBER_OP(operator<, c10::qint32, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, c10::qint32, vec_cmple) + DEFINE_MEMBER_OP(operator>, c10::qint32, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, c10::qint32, vec_cmpge) + DEFINE_MEMBER_OP(operator+, c10::qint32, vec_add) + 
DEFINE_MEMBER_OP(operator-, c10::qint32, vec_sub) + DEFINE_MEMBER_OP(operator*, c10::qint32, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint32, /) + DEFINE_MEMBER_OP(maximum, c10::qint32, vec_max) + DEFINE_MEMBER_OP(minimum, c10::qint32, vec_min) + DEFINE_MEMBER_OP(operator&, c10::qint32, vec_and) + DEFINE_MEMBER_OP(operator|, c10::qint32, vec_or) + DEFINE_MEMBER_OP(operator^, c10::qint32, vec_xor) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace 
at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..3fb5b62c5c0d898bd0fba05898123b7fa53bed5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h @@ -0,0 +1,517 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +// This file defines Vectorized<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vectorized, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vectorized -> 4x Vectorized +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
+ +namespace at { +namespace vec { +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; +template <> +struct Vectorized { + private: + union { + struct { + vint8 _vec0; + vint8 _vec1; + }; + struct { + vbool8 _vecb0; + vbool8 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + Vectorized() {} + using size_type = int; + static constexpr size_type size() { + return 32; + } + + static constexpr size_t float_num_vecs() { + return 4; + } + static constexpr int int_num_vecs() { + return 4; + } + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::qint8::underlying; + using vec_internal_type = vint8; + using vec_internal_mask_type = vbool8; + // Broadcast constructor + C10_ALWAYS_INLINE Vectorized(const c10::qint8& val) + : _vec0{vec_splats(val.val_)}, _vec1{vec_splats(val.val_)} {} + + C10_ALWAYS_INLINE Vectorized(const Vectorized& other) + : _vec0{other._vec0}, _vec1(other._vec1) {} + + C10_ALWAYS_INLINE Vectorized(vint8 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vectorized(vint8 v1, vint8 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + static C10_ALWAYS_INLINE Vectorized loadu( + const void* ptr, + int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const 
{ + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + public: + float_vec_return_type C10_ALWAYS_INLINE dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + vint16 vecshi0 = vec_unpackh(_vec0); + vint16 vecshi1 = vec_unpackl(_vec0); + + vint16 vecshi2 = vec_unpackh(_vec1); + vint16 vecshi3 = vec_unpackl(_vec1); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 veci1 = vec_unpackl(vecshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 veci3 = vec_unpackl(vecshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 veci5 = vec_unpackl(vecshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 veci7 = vec_unpackl(vecshi3); + + vfloat32 vecf0_0 = vec_float(veci0); + vfloat32 vecf1_0 = vec_float(veci1); + + vfloat32 vecf0_1 = vec_float(veci2); + vfloat32 vecf1_1 = vec_float(veci3); + + vfloat32 vecf0_2 = vec_float(veci4); + vfloat32 vecf1_2 = vec_float(veci5); + + vfloat32 vecf0_3 = vec_float(veci6); + vfloat32 vecf1_3 = vec_float(veci7); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0); + vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1); + Vectorized vf0_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_0), + vec_mul(scale_vec1, vec_substract_src_zp1_0)}; + + vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0); + vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1); + Vectorized vf1_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_1), + 
vec_mul(scale_vec1, vec_substract_src_zp1_1)}; + + vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0); + vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1); + Vectorized vf2_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_2), + vec_mul(scale_vec1, vec_substract_src_zp1_2)}; + + vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0); + vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1); + Vectorized vf3_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_3), + vec_mul(scale_vec1, vec_substract_src_zp1_3)}; + + return {vf0_zp, vf1_zp, vf2_zp, vf3_zp}; + } + + float_vec_return_type C10_ALWAYS_INLINE + dequantize(Vectorized scale, Vectorized zero_point) const { + vint16 vecshi0 = vec_unpackh(_vec0); + vint16 vecshi1 = vec_unpackl(_vec0); + + vint16 vecshi2 = vec_unpackh(_vec1); + vint16 vecshi3 = vec_unpackl(_vec1); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 veci1 = vec_unpackl(vecshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 veci3 = vec_unpackl(vecshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 veci5 = vec_unpackl(vecshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 veci7 = vec_unpackl(vecshi3); + + vfloat32 vecf0_0 = vec_float(veci0); + vfloat32 vecf1_0 = vec_float(veci1); + + vfloat32 vecf0_1 = vec_float(veci2); + vfloat32 vecf1_1 = vec_float(veci3); + + vfloat32 vecf0_2 = vec_float(veci4); + vfloat32 vecf1_2 = vec_float(veci5); + + vfloat32 vecf0_3 = vec_float(veci6); + vfloat32 vecf1_3 = vec_float(veci7); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + vfloat32 zero_point0 = zero_point.vec0(); + vfloat32 zero_point1 = zero_point.vec1(); + return { + Vectorized{ + (vecf0_0 - zero_point0) * scale_vec0, + (vecf1_0 - zero_point1) * scale_vec1}, + Vectorized{ + (vecf0_1 - zero_point0) * scale_vec0, + (vecf1_1 - zero_point1) * scale_vec1}, + Vectorized{ + (vecf0_2 - zero_point0) * scale_vec0, + (vecf1_2 - zero_point1) * scale_vec1}, + 
Vectorized{ + (vecf0_3 - zero_point0) * scale_vec0, + (vecf1_3 - zero_point1) * scale_vec1}}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + // constexpr int32_t min_val = std::numeric_limits::min(); + // constexpr int32_t max_val = std::numeric_limits::max(); + + vfloat32 inverse_scale_v = vec_splats(inverse_scale); + vfloat32 vec_zero_point = vec_splats((float)zero_point); + // vint32 vmin = vec_splats(min_val); + // vint32 vmax = vec_splats(max_val); + + Vectorized vf0 = rhs[0]; + Vectorized vf1 = rhs[1]; + Vectorized vf2 = rhs[2]; + Vectorized vf3 = rhs[3]; + vfloat32 vecf0 = vf0.vec0(); + vfloat32 vecf1 = vf0.vec1(); + vfloat32 vecf2 = vf1.vec0(); + vfloat32 vecf3 = vf1.vec1(); + + vfloat32 vecf4 = vf2.vec0(); + vfloat32 vecf5 = vf2.vec1(); + vfloat32 vecf6 = vf3.vec0(); + vfloat32 vecf7 = vf3.vec1(); + + vecf0 = vec_mul(vecf0, inverse_scale_v); + vecf1 = vec_mul(vecf1, inverse_scale_v); + vecf2 = vec_mul(vecf2, inverse_scale_v); + vecf3 = vec_mul(vecf3, inverse_scale_v); + + vecf4 = vec_mul(vecf4, inverse_scale_v); + vecf5 = vec_mul(vecf5, inverse_scale_v); + vecf6 = vec_mul(vecf6, inverse_scale_v); + vecf7 = vec_mul(vecf7, inverse_scale_v); + + vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); + vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); + vecf2 = vec_add(vec_rint(vecf2), vec_zero_point); + vecf3 = vec_add(vec_rint(vecf3), vec_zero_point); + + vecf4 = vec_add(vec_rint(vecf4), vec_zero_point); + vecf5 = vec_add(vec_rint(vecf5), vec_zero_point); + vecf6 = vec_add(vec_rint(vecf6), vec_zero_point); + vecf7 = vec_add(vec_rint(vecf7), vec_zero_point); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + // veci0 = vec_min(vmax, 
vec_max( vmin, vecf0)) ; + // veci1 = vec_min(vmax, vec_max( vmin, vecf1)) ; + // veci2 = vec_min(vmax, vec_max( vmin, vecf2)) ; + // veci3 = vec_min(vmax, vec_max( vmin, vecf3)) ; + + // veci4 = vec_min(vmax, vec_max( vmin, vecf4)) ; + // veci5 = vec_min(vmax, vec_max( vmin, vecf5)) ; + // veci6 = vec_min(vmax, vec_max( vmin, vecf6)) ; + // veci7 = vec_min(vmax, vec_max( vmin, vecf7)) ; + // vec_packs CLAMP already + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vint8 vec0 = vec_packs(vecshi0, vecshi1); + vint8 vec1 = vec_packs(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + Vectorized C10_ALWAYS_INLINE + relu(Vectorized zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vectorized C10_ALWAYS_INLINE + relu6(Vectorized zero_point, Vectorized q_six) const { + vint8 max0 = vec_max(_vec0, zero_point._vec0); + vint8 max1 = vec_max(_vec1, zero_point._vec1); + return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + vint16 vecshi0 = vec_unpackh(_vec0); + vint16 vecBshi0 = vec_unpackh(b._vec0); + vint16 vecshi1 = vec_unpackl(_vec0); + vint16 vecBshi1 = vec_unpackl(b._vec0); + + vint16 vecshi2 = vec_unpackh(_vec1); + vint16 vecBshi2 = vec_unpackh(b._vec1); + vint16 vecshi3 = vec_unpackl(_vec1); + vint16 vecBshi3 = vec_unpackl(b._vec1); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 vecBi0 = vec_unpackh(vecBshi0); + vint32 veci1 = vec_unpackl(vecshi0); + vint32 vecBi1 = vec_unpackl(vecBshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 vecBi2 = vec_unpackh(vecBshi1); + vint32 veci3 = vec_unpackl(vecshi1); + vint32 vecBi3 = vec_unpackl(vecBshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 vecBi4 = vec_unpackh(vecBshi2); + vint32 veci5 = vec_unpackl(vecshi2); + vint32 vecBi5 = 
vec_unpackl(vecBshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 vecBi6 = vec_unpackh(vecBshi3); + vint32 veci7 = vec_unpackl(vecshi3); + vint32 vecBi7 = vec_unpackl(vecBshi3); + + return { + Vectorized(veci0 - vecBi0, veci1 - vecBi1), + Vectorized(veci2 - vecBi2, veci3 - vecBi3), + Vectorized(veci4 - vecBi4, veci5 - vecBi5), + Vectorized(veci6 - vecBi6, veci7 - vecBi7)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + vfloat32 vec_multiplier = vec_splats(multiplier); + vint32 vec_zero_point = vec_splats(zero_point); + + Vectorized vi0 = inp[0]; + Vectorized vi1 = inp[1]; + Vectorized vi2 = inp[2]; + Vectorized vi3 = inp[3]; + + vfloat32 vecf0 = vec_float(vi0.vec0()); + vfloat32 vecf1 = vec_float(vi0.vec1()); + vfloat32 vecf2 = vec_float(vi1.vec0()); + vfloat32 vecf3 = vec_float(vi1.vec1()); + + vfloat32 vecf4 = vec_float(vi2.vec0()); + vfloat32 vecf5 = vec_float(vi2.vec1()); + vfloat32 vecf6 = vec_float(vi3.vec0()); + vfloat32 vecf7 = vec_float(vi3.vec1()); + + vecf0 = vec_mul(vecf0, vec_multiplier); + vecf1 = vec_mul(vecf1, vec_multiplier); + vecf2 = vec_mul(vecf2, vec_multiplier); + vecf3 = vec_mul(vecf3, vec_multiplier); + + vecf4 = vec_mul(vecf4, vec_multiplier); + vecf5 = vec_mul(vecf5, vec_multiplier); + vecf6 = vec_mul(vecf6, vec_multiplier); + vecf7 = vec_mul(vecf7, vec_multiplier); + + vecf0 = vec_rint(vecf0); + vecf1 = vec_rint(vecf1); + vecf2 = vec_rint(vecf2); + vecf3 = vec_rint(vecf3); + + vecf4 = vec_rint(vecf4); + vecf5 = vec_rint(vecf5); + vecf6 = vec_rint(vecf6); + vecf7 = vec_rint(vecf7); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + veci0 = vec_add(veci0, vec_zero_point); + veci1 = vec_add(veci1, vec_zero_point); + 
veci2 = vec_add(veci2, vec_zero_point); + veci3 = vec_add(veci3, vec_zero_point); + + veci4 = vec_add(veci4, vec_zero_point); + veci5 = vec_add(veci5, vec_zero_point); + veci6 = vec_add(veci6, vec_zero_point); + veci7 = vec_add(veci7, vec_zero_point); + + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vint8 vec0 = vec_packs(vecshi0, vecshi1); + vint8 vec1 = vec_packs(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + DEFINE_MEMBER_OP(operator==, c10::qint8, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, c10::qint8, vec_cmpne) + DEFINE_MEMBER_OP(operator<, c10::qint8, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, c10::qint8, vec_cmple) + DEFINE_MEMBER_OP(operator>, c10::qint8, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, c10::qint8, vec_cmpge) + DEFINE_MEMBER_OP(operator+, c10::qint8, vec_add) + DEFINE_MEMBER_OP(operator-, c10::qint8, vec_sub) + DEFINE_MEMBER_OP(operator*, c10::qint8, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint8, /) + DEFINE_MEMBER_OP(maximum, c10::qint8, vec_max) + DEFINE_MEMBER_OP(minimum, c10::qint8, vec_min) + DEFINE_MEMBER_OP(operator&, c10::qint8, vec_and) + DEFINE_MEMBER_OP(operator|, c10::qint8, vec_or) + DEFINE_MEMBER_OP(operator^, c10::qint8, vec_xor) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE 
+operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h new file mode 100644 index 0000000000000000000000000000000000000000..9da6dec9db5e0314d3b70f8b4f0e5d919f02490d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -0,0 +1,538 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include +#include + +// This file defines Vectorized<> for the quantized types. 
+// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vectorized, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vectorized -> 4x Vectorized +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. + +namespace at { +namespace vec { +inline namespace CPU_CAPABILITY { + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +const vint16 mask_unsigned = vec_splats((short int)0xFF); +template <> +struct Vectorized { + private: + union { + struct { + vuint8 _vec0; + vuint8 _vec1; + }; + struct { + vbool8 _vecb0; + vbool8 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + Vectorized() {} + using size_type = int; + static constexpr size_type size() { + return 32; + } + + static constexpr size_t float_num_vecs() { + return 4; + } + static constexpr int int_num_vecs() { + return 4; + } + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::quint8::underlying; + using vec_internal_type = vuint8; + using vec_internal_mask_type = vbool8; + // Broadcast constructor + C10_ALWAYS_INLINE Vectorized(const c10::quint8& val) + : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {} + + C10_ALWAYS_INLINE Vectorized(const Vectorized& other) + : _vec0{other._vec0}, _vec1(other._vec1) {} + + C10_ALWAYS_INLINE Vectorized(vuint8 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vectorized(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + 
C10_ALWAYS_INLINE Vectorized(vuint8 v1, vuint8 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + static C10_ALWAYS_INLINE Vectorized loadu( + const void* ptr, + int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + __at_align__ value_type tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + public: + float_vec_return_type C10_ALWAYS_INLINE dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + // unpacking unsigned as signed + vint16 vecshi0 = vec_unpackh((vint8)_vec0); + vint16 vecshi1 = vec_unpackl((vint8)_vec0); + + vint16 vecshi2 = vec_unpackh((vint8)_vec1); + vint16 vecshi3 = vec_unpackl((vint8)_vec1); + + // signed -> unsigned + vecshi0 = vec_and(vecshi0, mask_unsigned); + vecshi1 = vec_and(vecshi1, mask_unsigned); + + vecshi2 = vec_and(vecshi2, mask_unsigned); + vecshi3 = vec_and(vecshi3, mask_unsigned); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 veci1 = vec_unpackl(vecshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 veci3 = vec_unpackl(vecshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 veci5 = 
vec_unpackl(vecshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 veci7 = vec_unpackl(vecshi3); + + vfloat32 vecf0_0 = vec_float(veci0); + vfloat32 vecf1_0 = vec_float(veci1); + + vfloat32 vecf0_1 = vec_float(veci2); + vfloat32 vecf1_1 = vec_float(veci3); + + vfloat32 vecf0_2 = vec_float(veci4); + vfloat32 vecf1_2 = vec_float(veci5); + + vfloat32 vecf0_3 = vec_float(veci6); + vfloat32 vecf1_3 = vec_float(veci7); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0); + vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1); + Vectorized vf0_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_0), + vec_mul(scale_vec1, vec_substract_src_zp1_0)}; + + vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0); + vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1); + Vectorized vf1_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_1), + vec_mul(scale_vec1, vec_substract_src_zp1_1)}; + + vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0); + vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1); + Vectorized vf2_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_2), + vec_mul(scale_vec1, vec_substract_src_zp1_2)}; + + vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0); + vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1); + Vectorized vf3_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_3), + vec_mul(scale_vec1, vec_substract_src_zp1_3)}; + + return {vf0_zp, vf1_zp, vf2_zp, vf3_zp}; + } + + float_vec_return_type C10_ALWAYS_INLINE + dequantize(Vectorized scale, Vectorized zero_point) const { + // unpacking unsigned as signed + vint16 vecshi0 = vec_unpackh((vint8)_vec0); + vint16 vecshi1 = vec_unpackl((vint8)_vec0); + + vint16 vecshi2 = vec_unpackh((vint8)_vec1); + 
vint16 vecshi3 = vec_unpackl((vint8)_vec1); + + // signed -> unsigned + vecshi0 = vec_and(vecshi0, mask_unsigned); + vecshi1 = vec_and(vecshi1, mask_unsigned); + + vecshi2 = vec_and(vecshi2, mask_unsigned); + vecshi3 = vec_and(vecshi3, mask_unsigned); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 veci1 = vec_unpackl(vecshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 veci3 = vec_unpackl(vecshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 veci5 = vec_unpackl(vecshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 veci7 = vec_unpackl(vecshi3); + + vfloat32 vecf0_0 = vec_float(veci0); + vfloat32 vecf1_0 = vec_float(veci1); + + vfloat32 vecf0_1 = vec_float(veci2); + vfloat32 vecf1_1 = vec_float(veci3); + + vfloat32 vecf0_2 = vec_float(veci4); + vfloat32 vecf1_2 = vec_float(veci5); + + vfloat32 vecf0_3 = vec_float(veci6); + vfloat32 vecf1_3 = vec_float(veci7); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + + vfloat32 zero_point0 = zero_point.vec0(); + vfloat32 zero_point1 = zero_point.vec1(); + return { + Vectorized{ + (vecf0_0 - zero_point0) * scale_vec0, + (vecf1_0 - zero_point1) * scale_vec1}, + Vectorized{ + (vecf0_1 - zero_point0) * scale_vec0, + (vecf1_1 - zero_point1) * scale_vec1}, + Vectorized{ + (vecf0_2 - zero_point0) * scale_vec0, + (vecf1_2 - zero_point1) * scale_vec1}, + Vectorized{ + (vecf0_3 - zero_point0) * scale_vec0, + (vecf1_3 - zero_point1) * scale_vec1}}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + // constexpr int32_t min_val = std::numeric_limits::min(); + // constexpr int32_t max_val = std::numeric_limits::max(); + + vfloat32 vec_inverse = vec_splats(inverse_scale); + vfloat32 vec_zero_point = vec_splats((float)zero_point); + // vuint32 vmin = vec_splats(min_val); + // vuint32 vmax = vec_splats(max_val); + Vectorized vf0 = rhs[0]; + Vectorized vf1 = rhs[1]; + Vectorized vf2 = rhs[2]; + Vectorized vf3 
= rhs[3]; + vfloat32 vecf0 = vf0.vec0(); + vfloat32 vecf1 = vf0.vec1(); + vfloat32 vecf2 = vf1.vec0(); + vfloat32 vecf3 = vf1.vec1(); + + vfloat32 vecf4 = vf2.vec0(); + vfloat32 vecf5 = vf2.vec1(); + vfloat32 vecf6 = vf3.vec0(); + vfloat32 vecf7 = vf3.vec1(); + + vecf0 = vec_mul(vecf0, vec_inverse); + vecf1 = vec_mul(vecf1, vec_inverse); + vecf2 = vec_mul(vecf2, vec_inverse); + vecf3 = vec_mul(vecf3, vec_inverse); + + vecf4 = vec_mul(vecf4, vec_inverse); + vecf5 = vec_mul(vecf5, vec_inverse); + vecf6 = vec_mul(vecf6, vec_inverse); + vecf7 = vec_mul(vecf7, vec_inverse); + + vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); + vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); + vecf2 = vec_add(vec_rint(vecf2), vec_zero_point); + vecf3 = vec_add(vec_rint(vecf3), vec_zero_point); + + vecf4 = vec_add(vec_rint(vecf4), vec_zero_point); + vecf5 = vec_add(vec_rint(vecf5), vec_zero_point); + vecf6 = vec_add(vec_rint(vecf6), vec_zero_point); + vecf7 = vec_add(vec_rint(vecf7), vec_zero_point); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vuint8 vec0 = vec_packsu(vecshi0, vecshi1); + vuint8 vec1 = vec_packsu(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + Vectorized C10_ALWAYS_INLINE + relu(Vectorized zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vectorized C10_ALWAYS_INLINE relu6( + Vectorized zero_point, + Vectorized q_six) const { + vuint8 max0 = vec_max(_vec0, zero_point._vec0); + vuint8 max1 = vec_max(_vec1, zero_point._vec1); + return {vec_min(max0, q_six._vec0), vec_min(max1, 
q_six._vec1)}; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + vint16 vecshi0 = vec_unpackh((vint8)_vec0); + vint16 vecBshi0 = vec_unpackh((vint8)b._vec0); + vint16 vecshi1 = vec_unpackl((vint8)_vec0); + vint16 vecBshi1 = vec_unpackl((vint8)b._vec0); + + vint16 vecshi2 = vec_unpackh((vint8)_vec1); + vint16 vecBshi2 = vec_unpackh((vint8)b._vec1); + vint16 vecshi3 = vec_unpackl((vint8)_vec1); + vint16 vecBshi3 = vec_unpackl((vint8)b._vec1); + + vecshi0 = vec_and(vecshi0, mask_unsigned); + vecBshi0 = vec_and(vecBshi0, mask_unsigned); + vecshi1 = vec_and(vecshi1, mask_unsigned); + vecBshi1 = vec_and(vecBshi1, mask_unsigned); + + vecshi2 = vec_and(vecshi2, mask_unsigned); + vecBshi2 = vec_and(vecBshi2, mask_unsigned); + vecshi3 = vec_and(vecshi3, mask_unsigned); + vecBshi3 = vec_and(vecBshi3, mask_unsigned); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 vecBi0 = vec_unpackh(vecBshi0); + vint32 veci1 = vec_unpackl(vecshi0); + vint32 vecBi1 = vec_unpackl(vecBshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 vecBi2 = vec_unpackh(vecBshi1); + vint32 veci3 = vec_unpackl(vecshi1); + vint32 vecBi3 = vec_unpackl(vecBshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 vecBi4 = vec_unpackh(vecBshi2); + vint32 veci5 = vec_unpackl(vecshi2); + vint32 vecBi5 = vec_unpackl(vecBshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 vecBi6 = vec_unpackh(vecBshi3); + vint32 veci7 = vec_unpackl(vecshi3); + vint32 vecBi7 = vec_unpackl(vecBshi3); + + return { + Vectorized(veci0 - vecBi0, veci1 - vecBi1), + Vectorized(veci2 - vecBi2, veci3 - vecBi3), + Vectorized(veci4 - vecBi4, veci5 - vecBi5), + Vectorized(veci6 - vecBi6, veci7 - vecBi7)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + vfloat32 vec_multiplier = vec_splats(multiplier); + vint32 vec_zero_point = vec_splats(zero_point); + + Vectorized vi0 = inp[0]; + Vectorized vi1 = inp[1]; + Vectorized vi2 = inp[2]; + 
Vectorized vi3 = inp[3]; + + vfloat32 vecf0 = vec_float(vi0.vec0()); + vfloat32 vecf1 = vec_float(vi0.vec1()); + vfloat32 vecf2 = vec_float(vi1.vec0()); + vfloat32 vecf3 = vec_float(vi1.vec1()); + + vfloat32 vecf4 = vec_float(vi2.vec0()); + vfloat32 vecf5 = vec_float(vi2.vec1()); + vfloat32 vecf6 = vec_float(vi3.vec0()); + vfloat32 vecf7 = vec_float(vi3.vec1()); + + vecf0 = vec_mul(vecf0, vec_multiplier); + vecf1 = vec_mul(vecf1, vec_multiplier); + vecf2 = vec_mul(vecf2, vec_multiplier); + vecf3 = vec_mul(vecf3, vec_multiplier); + + vecf4 = vec_mul(vecf4, vec_multiplier); + vecf5 = vec_mul(vecf5, vec_multiplier); + vecf6 = vec_mul(vecf6, vec_multiplier); + vecf7 = vec_mul(vecf7, vec_multiplier); + + vecf0 = vec_rint(vecf0); + vecf1 = vec_rint(vecf1); + vecf2 = vec_rint(vecf2); + vecf3 = vec_rint(vecf3); + + vecf4 = vec_rint(vecf4); + vecf5 = vec_rint(vecf5); + vecf6 = vec_rint(vecf6); + vecf7 = vec_rint(vecf7); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + veci0 = vec_add(veci0, vec_zero_point); + veci1 = vec_add(veci1, vec_zero_point); + veci2 = vec_add(veci2, vec_zero_point); + veci3 = vec_add(veci3, vec_zero_point); + + veci4 = vec_add(veci4, vec_zero_point); + veci5 = vec_add(veci5, vec_zero_point); + veci6 = vec_add(veci6, vec_zero_point); + veci7 = vec_add(veci7, vec_zero_point); + + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vuint8 vec0 = vec_packsu(vecshi0, vecshi1); + vuint8 vec1 = vec_packsu(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + DEFINE_MEMBER_OP(operator==, c10::quint8, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, c10::quint8, vec_cmpne) + DEFINE_MEMBER_OP(operator<, 
c10::quint8, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, c10::quint8, vec_cmple) + DEFINE_MEMBER_OP(operator>, c10::quint8, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, c10::quint8, vec_cmpge) + DEFINE_MEMBER_OP(operator+, c10::quint8, vec_add) + DEFINE_MEMBER_OP(operator-, c10::quint8, vec_sub) + DEFINE_MEMBER_OP(operator*, c10::quint8, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::quint8, /) + DEFINE_MEMBER_OP(maximum, c10::quint8, vec_max) + DEFINE_MEMBER_OP(minimum, c10::quint8, vec_min) + DEFINE_MEMBER_OP(operator&, c10::quint8, vec_and) + DEFINE_MEMBER_OP(operator|, c10::quint8, vec_or) + DEFINE_MEMBER_OP(operator^, c10::quint8, vec_xor) +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return a.minimum(b); +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template 
<> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..a25216bd5db17b5a732f7bdb3ebd4047eef1e24f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -0,0 +1,581 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +#if defined(__clang__) +typedef __vector __bool char vbool8; +typedef __vector __bool short vbool16; +typedef __vector __bool int vbool32; +typedef __vector __bool long long vbool64; +using vint8 = __attribute__((vector_size(16))) signed char; +using vint16 = __attribute__((vector_size(16))) signed short; +using vint32 = __attribute__((vector_size(16))) signed int; +using vint64 = __attribute__((vector_size(16))) signed long long; +using vuint8 = __attribute__((vector_size(16))) unsigned char; +using vuint16 = __attribute__((vector_size(16))) unsigned short; +using vuint32 = __attribute__((vector_size(16))) unsigned int; +using vuint64 = __attribute__((vector_size(16))) unsigned long long; +using vfloat32 = __attribute__((vector_size(16))) float; +using vfloat64 = __attribute__((vector_size(16))) double; +#else +using vbool8 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char; +using vbool16 = + 
__attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short; +using vbool32 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int; +using vbool64 = __attribute__((altivec(vector__))) +__attribute__((altivec(bool__))) long long; +using vint8 = __attribute__((altivec(vector__))) signed char; +using vint16 = __attribute__((altivec(vector__))) signed short; +using vint32 = __attribute__((altivec(vector__))) signed int; +using vint64 = __attribute__((altivec(vector__))) signed long long; +using vuint8 = __attribute__((altivec(vector__))) unsigned char; +using vuint16 = __attribute__((altivec(vector__))) unsigned short; +using vuint32 = __attribute__((altivec(vector__))) unsigned int; +using vuint64 = __attribute__((altivec(vector__))) unsigned long long; +using vfloat32 = __attribute__((altivec(vector__))) float; +using vfloat64 = __attribute__((altivec(vector__))) double; +#endif + +inline auto make_vuint(vint8 v) { + return reinterpret_cast(v); +} +inline auto make_vuint(vint16 v) { + return reinterpret_cast(v); +} +inline auto make_vuint(vint32 v) { + return reinterpret_cast(v); +} +inline auto make_vuint(vint64 v) { + return reinterpret_cast(v); +} + +#if !defined(vec_float) +C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) { + vfloat32 vec_out; + __asm__("xvcvsxwsp %x0,%x1" : "=wf"(vec_out) : "wa"(vec_in)); + return vec_out; +} +#endif + +#if !defined(vec_signed) +C10_ALWAYS_INLINE vint32 vec_signed(const vfloat32& vec_in) { + vint32 vec_out; + __asm__("xvcvspsxws %x0,%x1" : "=wa"(vec_out) : "wf"(vec_in)); + return vec_out; +} + +C10_ALWAYS_INLINE vint64 vec_signed(const vfloat64& vec_in) { + vint64 vec_out; + __asm__("xvcvdpsxds %x0,%x1" : "=wa"(vec_out) : "wd"(vec_in)); + return vec_out; +} +#endif + +#if !defined(vec_neg) +C10_ALWAYS_INLINE vfloat32 vec_neg(const vfloat32& vec_in) { + vfloat32 vec_out; + __asm__("xvnegsp %x0,%x1" : "=wf"(vec_out) : "wf"(vec_in)); + return vec_out; +} + +C10_ALWAYS_INLINE vfloat64 
vec_neg(const vfloat64& vec_in) { + vfloat64 vec_out; + __asm__("xvnegdp %x0,%x1" : "=wd"(vec_out) : "wd"(vec_in)); + return vec_out; +} + +C10_ALWAYS_INLINE vint16 vec_neg(const vint16& vec_in) { + vint16 vint0 = {0, 0, 0, 0, 0, 0, 0, 0}; + return vec_vsubuhm(vint0, vec_in); +} + +C10_ALWAYS_INLINE vint32 vec_neg(const vint32& vec_in) { + vint32 vint0 = {0, 0, 0, 0}; + return vec_vsubuwm(vint0, vec_in); +} + +C10_ALWAYS_INLINE vint64 vec_neg(const vint64& vec_in) { + return -vec_in; +} +#endif + +#if !defined(vec_sldw) +template +C10_ALWAYS_INLINE vfloat32 +vec_sldw_aux(const vfloat32& vec_in0, const vfloat32& vec_in1) { + vfloat32 vec_out; + __asm("xxsldwi %x0, %x1, %x2, %3 " + : "=wa"(vec_out) + : "wa"(vec_in0), "wa"(vec_in1), "I"(C)); + return vec_out; +} + +#define vec_sldw(a, b, c) vec_sldw_aux(a, b) +#endif + +#define vec_not(a) vec_nor(a, a) +#if defined(__clang__) && !defined(vec_splats) +C10_ALWAYS_INLINE vint64 vec_splats(const int64_t& a) { + return vec_splats(a); +} +#endif +// Vectorized min/max which return a if any operand is nan +template +C10_ALWAYS_INLINE T vec_min_nan(const T& a, const T& b) { + return vec_min(a, b); +} +template +C10_ALWAYS_INLINE T vec_max_nan(const T& a, const T& b) { + return vec_max(a, b); +} + +// Specializations for float/double taken from Eigen +template <> +C10_ALWAYS_INLINE vfloat32 +vec_min_nan(const vfloat32& a, const vfloat32& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE + // regarding NaN + vfloat32 ret; + __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} +// Specializations for float/double taken from Eigen +template <> +C10_ALWAYS_INLINE vfloat32 +vec_max_nan(const vfloat32& a, const vfloat32& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::min and SSE + // regarding NaN + vfloat32 ret; + __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + 
return ret; +} + +template <> +C10_ALWAYS_INLINE vfloat64 +vec_min_nan(const vfloat64& a, const vfloat64& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE + // regarding NaN + vfloat64 ret; + __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} +template <> +C10_ALWAYS_INLINE vfloat64 +vec_max_nan(const vfloat64& a, const vfloat64& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE + // regarding NaN + vfloat64 ret; + __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} + +// Vectorizes min/max function which returns nan if any side is nan +#define C10_VSX_VEC_NAN_PROPAG(name, type, btype, func) \ + C10_ALWAYS_INLINE type name(const type& a, const type& b) { \ + type tmp = func(a, b); \ + btype nan_a = vec_cmpne(a, a); \ + btype nan_b = vec_cmpne(b, b); \ + tmp = vec_sel(tmp, a, nan_a); \ + return vec_sel(tmp, b, nan_b); \ + } + +C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat32, vbool32, vec_min) +C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat32, vbool32, vec_max) +C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat64, vbool64, vec_min) +C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max) + +#undef C10_VSX_VEC_NAN_PROPAG + +#define DEFINE_MEMBER_UNARY_OP(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op() const { \ + return Vectorized{func(_vec0), func(_vec1)}; \ + } + +#define DEFINE_MEMBER_OP(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + return Vectorized{ \ + func(_vec0, other._vec0), func(_vec1, other._vec1)}; \ + } + +#define DEFINE_MEMBER_BITWISE_OP(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + return Vectorized{ \ + func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)}; \ + } + +#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op( \ + 
const Vectorized& b, const Vectorized& c) const { \ + return Vectorized{ \ + func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)}; \ + } + +#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& b) \ + const { \ + Vectorized::vec_internal_type ret_0; \ + Vectorized::vec_internal_type ret_1; \ + for (int i = 0; i < Vectorized::size() / 2; i++) { \ + ret_0[i] = _vec0[i] binary_op b._vec0[i]; \ + ret_1[i] = _vec1[i] binary_op b._vec1[i]; \ + } \ + return Vectorized{ret_0, ret_1}; \ + } + +#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + using vvtype = Vectorized::vec_internal_type; \ + const vvtype v_one = vec_splats(static_cast(1.0)); \ + vvtype ret0 = (vvtype)func(_vec0, other._vec0); \ + vvtype ret1 = (vvtype)func(_vec1, other._vec1); \ + return Vectorized{vec_and(ret0, v_one), vec_and(ret1, v_one)}; \ + } + +#define DEFINE_CLAMP_FUNCS(operand_type) \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + return Vectorized{ \ + vec_min_nan(vec_max_nan(a.vec0(), min.vec0()), max.vec0()), \ + vec_min_nan(vec_max_nan(a.vec1(), min.vec1()), max.vec1())}; \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_min( \ + const Vectorized& a, \ + const Vectorized& min) { \ + return Vectorized{ \ + vec_max_nan(a.vec0(), min.vec0()), vec_max_nan(a.vec1(), min.vec1())}; \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_max( \ + const Vectorized& a, \ + const Vectorized& max) { \ + return Vectorized{ \ + vec_min_nan(a.vec0(), max.vec0()), vec_min_nan(a.vec1(), max.vec1())}; \ + } + +#define DEFINE_REINTERPRET_CAST_FUNCS( \ + first_type, cast_type, cast_inner_vector_type) \ + template <> \ + C10_ALWAYS_INLINE Vectorized cast( \ + const Vectorized& src) { \ + return Vectorized{ \ + (cast_inner_vector_type)src.vec0(), \ + 
(cast_inner_vector_type)src.vec1()}; \ + } + +#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16) + +// it can be used to emulate blend faster +constexpr int blendChoice( + uint32_t mask, + uint32_t half1 = 0xF, + uint32_t half2 = 0xF0) { + uint32_t none = 0; + uint32_t both = half1 | half2; + // clamp it between 0 and both + mask = mask & both; + // return (a._vec0, a._vec1) + if (mask == none) + return 0; + // return (b._vec0,b._vec1) + else if (mask == both) + return 1; + // return (b._vec0,a._vec1) + else if (mask == half1) + return 2; + // return (a._vec0,b._vec1) + else if (mask == half2) + return 3; + // return (*_vec0,a._vec1) + else if (mask > 0 && mask < half1) + return 4; + // return (*_vec0,b._vec1) + else if ((mask & half2) == half2) + return 5; + // return (a._vec0,*_vec1) + else if ((mask & half1) == 0 && mask > half1) + return 6; + // return (b._vec0,*_vec1) + else if ((mask & half1) == half1 && mask > half1) + return 7; + // return (*_vec0,*_vec1) + return 8; +} + +// it can be used to emulate blend faster +constexpr int blendChoiceDbl(uint32_t mask) { + // clamp it 0 and 0xF + return blendChoice(mask, 0x3, 0xC); +} + +constexpr vbool32 VsxMask1(uint32_t mask) { + uint32_t g0 = (mask & 1) * 0xffffffff; + uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + return (vbool32){g0, g1, g2, g3}; +} + +constexpr vbool32 VsxMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xFF) >> 4; + return VsxMask1(mask2); +} + +constexpr vbool64 VsxDblMask1(uint32_t mask) { + uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + uint64_t g1 = ((mask & 2) >> 1) * 
0xffffffffffffffff; + return (vbool64){g0, g1}; +} + +constexpr vbool64 VsxDblMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xF) >> 2; + return VsxDblMask1(mask2); +} + +constexpr int maskForComplex(uint32_t mask) { + mask = mask & 0xF; + int complex_mask = 0; + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); + if (mask & 4) + complex_mask |= (3 << 4); + if (mask & 8) + complex_mask |= (3 << 6); + return complex_mask; +} + +constexpr int maskForComplexDbl(uint32_t mask) { + mask = mask & 0x3; + int complex_mask = 0; + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); + return complex_mask; +} + +constexpr int blendChoiceComplex(uint32_t mask) { + return blendChoice(maskForComplex(mask)); +} + +constexpr int blendChoiceComplexDbl(uint32_t mask) { + return blendChoiceDbl(maskForComplexDbl(mask)); +} + +constexpr vbool32 VsxComplexMask1(uint32_t mask) { + return VsxMask1(maskForComplex(mask)); +} + +constexpr vbool32 VsxComplexMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xF) >> 2; + return VsxMask1(maskForComplex(mask2)); +} + +constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { + return VsxDblMask1(mask); +} + +constexpr vbool64 VsxComplexDblMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xF) >> 2; + return VsxDblMask1(mask2); +} + +// constants +namespace at { +namespace vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { +// +constexpr int offset0 = 0; +constexpr int offset16 = 16; + +// #Constants +const vuint8 mask_zero_bits = vuint8{ + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 96, + 64, + 32, + 0}; + +const vuint8 swap_mask = + vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + +const vint32 v0x7f = vec_splats(0x7f); +const vint32 vi_0 = vec_splats((int)(0)); +const vint32 vi_1 = vec_splats((int)1); +const vint32 vi_2 = vec_splats((int)2); +const vint32 vi_4 = vec_splats((int)4); +const vint32 vi_inv1 = 
vec_splats((int)~1); +const vuint32 vu_29 = vec_splats(29u); +const vuint32 vu_23 = vec_splats(23u); + +const vbool32 inv_mant_mask = (vbool32)vec_splats((unsigned int)~0xff800000); +const vbool32 sign_mask = (vbool32)vec_splats((int)0x80000000); +const vbool32 real_mask = vbool32{0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0}; +const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; +const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; +const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; + +const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; +const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; +const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; +const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; +const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0}; + +const vfloat32 zero = vec_splats(0.f); +const vfloat32 half = vec_splats(0.5f); +const vfloat32 one = vec_splats(1.f); +const vfloat32 two = vec_splats(2.0f); +const vfloat32 _4div_pi = vec_splats(1.27323954473516f); +const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u); +const vfloat32 v_minus_inf = + vfloat32{0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u}; +const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff); +const vfloat32 log10e_inv = vec_splats(0.43429448190325176f); +const vfloat32 log2e_inv = vec_splats(1.4426950408889634f); +const vfloat32 log2eB_inv = vec_splats(1.442695036924675f); +const vfloat32 cephes_SQRTHF = vec_splats(0.707106781186547524f); +const vfloat32 coscof_p0 = vec_splats(2.443315711809948E-005f); +const vfloat32 coscof_p1 = vec_splats(-1.388731625493765E-003f); +const vfloat32 coscof_p2 = vec_splats(4.166664568298827E-002f); +const vfloat32 exp_hi = vec_splats(104.f); +const vfloat32 exp_lo = vec_splats(-104.f); +const vfloat32 exp_p0 = vec_splats(0.000198527617612853646278381f); +const vfloat32 exp_p1 = vec_splats((0.00139304355252534151077271f)); +const vfloat32 exp_p2 = 
vec_splats(0.00833336077630519866943359f); +const vfloat32 exp_p3 = vec_splats(0.0416664853692054748535156f); +const vfloat32 exp_p4 = vec_splats(0.166666671633720397949219f); +const vfloat32 exp_p5 = vec_splats(0.5f); +const vfloat32 log_p0 = vec_splats(7.0376836292E-2f); +const vfloat32 log_p1 = vec_splats(-1.1514610310E-1f); +const vfloat32 log_p2 = vec_splats(1.1676998740E-1f); +const vfloat32 log_p3 = vec_splats(-1.2420140846E-1f); +const vfloat32 log_p4 = vec_splats(+1.4249322787E-1f); +const vfloat32 log_p5 = vec_splats(-1.6668057665E-1f); +const vfloat32 log_p6 = vec_splats(+2.0000714765E-1f); +const vfloat32 log_p7 = vec_splats(-2.4999993993E-1f); +const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f); +const vfloat32 log_q1 = vec_splats(-2.12194440e-4f); +const vfloat32 log_q2 = vec_splats(0.693359375f); +const vfloat32 max_logf = vec_splats(88.02969187150841f); +const vfloat32 max_numf = + vec_splats(1.7014117331926442990585209174225846272e38f); +const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u); +const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x0800000u); +const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f); +const vfloat32 minus_cephes_dp2 = vec_splats(-2.4187564849853515625e-4f); +const vfloat32 minus_cephes_dp3 = vec_splats(-3.77489497744594108e-8f); +const vfloat32 negln2f_hi = vec_splats(-0.693145751953125f); +const vfloat32 negln2f_lo = vec_splats(-1.428606765330187045e-06f); +const vfloat32 p0 = vec_splats(2.03721912945E-4f); +const vfloat32 p1 = vec_splats(8.33028376239E-3f); +const vfloat32 p2 = vec_splats(1.66667160211E-1f); +const vfloat32 sincof_p0 = vec_splats(-1.9515295891E-4f); +const vfloat32 sincof_p1 = vec_splats(8.3321608736E-3f); +const vfloat32 sincof_p2 = vec_splats(-1.6666654611E-1f); +const vfloat32 tanh_0p625 = vec_splats(0.625f); +const vfloat32 tanh_half_max = vec_splats(44.014845935754205f); +const vfloat32 tanh_p0 = vec_splats(-5.70498872745E-3f); +const vfloat32 tanh_p1 = vec_splats(2.06390887954E-2f); 
+const vfloat32 tanh_p2 = vec_splats(-5.37397155531E-2f); +const vfloat32 tanh_p3 = vec_splats(1.33314422036E-1f); +const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f); +const vfloat32 vcheck = vec_splats((float)(1LL << 24)); +const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f}; +const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f}; +const vfloat32 sqrt2_2 = vfloat32{ + 0.70710676908493042f, + 0.70710676908493042, + 0.70710676908493042, + 0.70710676908493042}; +const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0}; +const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f}; +const vfloat64 vd_one = vec_splats(1.0); +const vfloat64 vd_zero = vec_splats(0.0); +const vfloat64 vd_log10e_inv = vec_splats(0.43429448190325176); +const vfloat64 vd_log2e_inv = vec_splats(1.4426950408889634); +const vfloat64 vd_imag_one = vfloat64{0.0, 1.0}; +const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; +const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757}; +const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; + +template +Vectorized VsxShiftRightArith( + const Vectorized& a, + const Vectorized& b) { + const Vectorized max_shift(sizeof(T) * CHAR_BIT - std::is_signed_v); + const auto mask = (b < Vectorized(0)) | (b >= max_shift); + const auto shift = Vectorized::blendv(b, max_shift, mask); + return Vectorized{ + vec_sra(a.vec0(), make_vuint(shift.vec0())), + vec_sra(a.vec1(), make_vuint(shift.vec1()))}; +} + +template +Vectorized VsxShiftLeftArith( + const Vectorized& a, + const Vectorized& b) { + const Vectorized max_shift(sizeof(T) * CHAR_BIT); + const auto mask = (b < Vectorized(0)) | (b >= max_shift); + Vectorized ret( + vec_sl(a.vec0(), make_vuint(b.vec0())), + vec_sl(a.vec1(), make_vuint(b.vec1()))); + return Vectorized::blendv(ret, Vectorized(0), mask); +} + +#define DEFINE_SHIFT_FUNCS(operand_type) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator>>( \ + const Vectorized& a, const Vectorized& b) { \ + return VsxShiftRightArith(a, 
b); \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator<<( \ + const Vectorized& a, const Vectorized& b) { \ + return VsxShiftLeftArith(a, b); \ + } + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h new file mode 100644 index 0000000000000000000000000000000000000000..c48ae8c5732d8276a45ac698dedf87f27678d582 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -0,0 +1,2978 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#if defined(__clang__) +#include +#elif defined(__GNUC__) || defined(__GNUG__) +#include +#include +#endif +#include +#include +#include + +namespace at { +namespace vec { + +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +template +constexpr bool is_zarch_implemented() { + return ( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); +} + +template +constexpr bool is_zarch_implemented_quant() { + return ( + std::is_same_v || std::is_same_v || + std::is_same_v); +} + +template +constexpr bool is_zarch_implemented_complex() { + return std::is_same_v> || + std::is_same_v>; +} + +constexpr int offset0 = 0; +constexpr int offset16 = 16; + +template +struct VecBinaryType { + using type __attribute__((vector_size(16))) = uintmax_t; +}; + +template <> +struct VecBinaryType<8> { + using type = 
__attribute__((vector_size(16))) unsigned long long; +}; + +template <> +struct VecBinaryType<4> { + using type = __attribute__((vector_size(16))) unsigned int; +}; + +template <> +struct VecBinaryType<2> { + using type = __attribute__((vector_size(16))) unsigned short; +}; + +template <> +struct VecBinaryType<1> { + using type = __attribute__((vector_size(16))) unsigned char; +}; + +template +struct VecInnerType { + using Type __attribute__((vector_size(16))) = T; + using BinaryType = typename VecBinaryType::type; + using ElementType = T; + static constexpr int size = 16 / sizeof(T); +}; + +// define for int64_t properly for load +template <> +struct VecInnerType { + using Type = __attribute__((vector_size(16))) signed long long; + using ElementType = signed long long; + using BinaryType = typename VecBinaryType::type; + static constexpr int size = 16 / sizeof(signed long long); +}; + +template +using ZSimdVect = typename VecInnerType::Type; +template +using ZSimdVectBinary = typename VecInnerType::BinaryType; +template +using ZSimdVectElement = typename VecInnerType::ElementType; + +constexpr int blendChoiceInner( + const uint64_t mask, + const uint64_t half1 = 0xF, + const uint64_t half2 = 0xF0) { + uint64_t none = 0; + uint64_t both = half1 | half2; + // clamp it between 0 and both + auto res_mask = mask & both; + // return (a._vec0, a._vec1) + if (res_mask == none) + return 0; + // return (b._vec0,b._vec1) + else if (res_mask == both) + return 1; + // return (b._vec0, a._vec1) + else if (res_mask == half1) + return 2; + // return (a._vec0,b._vec1) + else if (res_mask == half2) + return 3; + // return (*_vec0,a._vec1) + else if (res_mask > 0 && res_mask < half1) + return 4; + // return (*_vec0,b._vec1) + else if ((res_mask & half2) == half2) + return 5; + // return (a._vec0,*_vec1) + else if ((res_mask & half1) == 0 && res_mask > half1) + return 6; + // return (b._vec0,*_vec1) + else if ((res_mask & half1) == half1 && res_mask > half1) + return 7; + // return 
(*_vec0,*_vec1) + return 8; +} + +// it can be used to emulate blend faster +template +constexpr int blendChoice(const uint64_t mask) { + static_assert(Z < 1 || Z > 8, "not implemented"); + return blendChoiceInner(mask); +} + +template <> +constexpr int blendChoice<1>(const uint64_t mask) { + return blendChoiceInner(mask, 0x0000FFFF, 0xFFFF0000); +} + +template <> +constexpr int blendChoice<2>(const uint64_t mask) { + return blendChoiceInner(mask, 0x00FF, 0xFF00); +} + +template <> +constexpr int blendChoice<4>(const uint64_t mask) { + return blendChoiceInner(mask, 0xF, 0xF0); +} + +template <> +constexpr int blendChoice<8>(const uint64_t mask) { + // clamp it 0 and 0xF + return blendChoiceInner(mask, 0x3, 0xC); +} + +template +constexpr auto GetMask1(const uint64_t mask) { + return typename VecBinaryType::type{}; +} + +template +constexpr auto GetMask2(const uint64_t mask) { + return typename VecBinaryType::type{}; +} + +template <> +constexpr auto GetMask1<1>(const uint64_t mask) { + constexpr uint8_t t = (int)0xFF; + uint8_t g0 = (mask & 1) * t; + uint8_t g1 = ((mask & 2) >> 1) * t; + uint8_t g2 = ((mask & 4) >> 2) * t; + uint8_t g3 = ((mask & 8) >> 3) * t; + uint8_t g4 = ((mask & 16) >> 4) * t; + uint8_t g5 = ((mask & 32) >> 5) * t; + uint8_t g6 = ((mask & 64) >> 6) * t; + uint8_t g7 = ((mask & 128) >> 7) * t; + uint8_t g8 = ((mask & 256) >> 8) * t; + uint8_t g9 = ((mask & 512) >> 9) * t; + uint8_t g10 = ((mask & 1024) >> 10) * t; + uint8_t g11 = ((mask & 2048) >> 11) * t; + uint8_t g12 = ((mask & 4096) >> 12) * t; + uint8_t g13 = ((mask & 8192) >> 13) * t; + uint8_t g14 = ((mask & 16384) >> 14) * t; + uint8_t g15 = ((mask & 32768) >> 15) * t; + return (typename VecBinaryType<1>::type){ + g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15}; +} + +template <> +constexpr auto GetMask2<1>(const uint64_t mask) { + uint64_t mask2 = (mask & 0xFFFFFFFF) >> 16; + return GetMask1<1>(mask2); +} + +template <> +constexpr auto GetMask1<2>(const uint64_t 
mask) { + constexpr uint16_t t = (int)0xFFFF; + uint16_t g0 = (mask & 1) * t; + uint16_t g1 = ((mask & 2) >> 1) * t; + uint16_t g2 = ((mask & 4) >> 2) * t; + uint16_t g3 = ((mask & 8) >> 3) * t; + uint16_t g4 = ((mask & 16) >> 4) * t; + uint16_t g5 = ((mask & 32) >> 5) * t; + uint16_t g6 = ((mask & 64) >> 6) * t; + uint16_t g7 = ((mask & 128) >> 7) * t; + return (typename VecBinaryType<2>::type){g0, g1, g2, g3, g4, g5, g6, g7}; +} + +template <> +constexpr auto GetMask2<2>(const uint64_t mask) { + uint64_t mask2 = (mask & 0xFFFF) >> 8; + return GetMask1<2>(mask2); +} + +template <> +constexpr auto GetMask1<4>(const uint64_t mask) { + uint32_t g0 = (mask & 1) * 0xffffffff; + uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + return (typename VecBinaryType<4>::type){g0, g1, g2, g3}; +} + +template <> +constexpr auto GetMask2<4>(const uint64_t mask) { + uint64_t mask2 = (mask & 0xFF) >> 4; + return GetMask1<4>(mask2); +} + +template <> +constexpr auto GetMask1<8>(const uint64_t mask) { + uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; + return (typename VecBinaryType<8>::type){g0, g1}; +} + +template <> +constexpr auto GetMask2<8>(const uint64_t mask) { + uint64_t mask2 = (mask & 0xF) >> 2; + return GetMask1<8>(mask2); +} + +template +constexpr int maskForComplex(uint32_t mask) { + return 0; +} + +template <> +constexpr int maskForComplex<8>(uint32_t mask) { + mask = mask & 0xF; + int complex_mask = 0; + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); + if (mask & 4) + complex_mask |= (3 << 4); + if (mask & 8) + complex_mask |= (3 << 6); + return complex_mask; +} + +template <> +constexpr int maskForComplex<16>(uint32_t mask) { + mask = mask & 0x3; + int complex_mask = 0; + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); + return complex_mask; +} + +template > 
+constexpr int blend_choice() { + return 0xAA; +} + +template <> +constexpr int blend_choice>() { + return 0x0A; +} + +constexpr int64_t allbitset(int16_t x) { + int64_t onex = 1; + return (onex << x) - onex; +} + +namespace { /* unnamed namespace */ + +ZSimdVect vec_mergee(ZSimdVect x, ZSimdVect y) { + constexpr ZSimdVectBinary mergee_mask{ + 0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27}; + return vec_perm(x, y, mergee_mask); +} + +ZSimdVect vec_mergee(ZSimdVect x, ZSimdVect y) { + return vec_mergeh(x, y); +} + +ZSimdVect vec_mergeo(ZSimdVect x, ZSimdVect y) { + constexpr ZSimdVectBinary mergeo_mask{ + 4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}; + return vec_perm(x, y, mergeo_mask); +} + +ZSimdVect vec_mergeo(ZSimdVect x, ZSimdVect y) { + return vec_mergel(x, y); +} + +} /* unnamed namespace */ + +// +template +constexpr auto GetBpermZeroMask() { + return ZSimdVectBinary{ + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 96, + 64, + 32, + 0}; +} + +template <> +constexpr auto GetBpermZeroMask() { + return ZSimdVectBinary{ + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 64, + 0}; +} + +constexpr auto GetSwapMaskFloat() { + return ZSimdVectBinary{ + 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; +} + +template +struct is_vec_specialized_for()>> + : std::bool_constant {}; + +template +struct Vectorized()>> { + public: + using value_type = T; + using vtype = ZSimdVect; + using vmaskType = ZSimdVectBinary; + using size_type = int; + // because of gcc inconsistency for int64_t we are obliged to use this, not + // value_type + using ElementType = ZSimdVectElement; + using vinner_data = std::pair; + + private: + vtype _vec0; + vtype _vec1; + + public: + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(ElementType); + } + Vectorized() {} + + C10_ALWAYS_INLINE Vectorized(vtype v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE 
Vectorized(const vinner_data& v) + : _vec0{v.first}, _vec1{v.second} {} + C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(T s) + : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {} + + template + struct LoaduHelper { + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy( + tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); + + return { + vec_xl(offset0, &(tmp_values[0])), + vec_xl(offset16, &(tmp_values[0]))}; + } + }; + + template + struct LoaduHelper { + static Vectorized C10_ALWAYS_INLINE + loadu(const ElementType* ptr, int count = size()) { + if (count == size()) { + return {vec_xl(offset0, ptr), vec_xl(offset16, ptr)}; + } + + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy( + tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); + + return { + vec_xl(offset0, &(tmp_values[0])), + vec_xl(offset16, &(tmp_values[0]))}; + } + }; + + template + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + return LoaduHelper::loadu(ptr, count); + } + + template + static Vectorized C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) { + // load only first 8 bytes + // only intended to be used with uint8_t + return loadu(ptr, 8 / sizeof(ElementType)); + } + + template + struct StoreHelper { + static void C10_ALWAYS_INLINE + store(const Vectorized& vec, U* ptr, int count = size()) { + if (count > 0) { + __at_align__ ElementType tmp_values[size()]; + vec_xst(vec._vec0, offset0, &(tmp_values[0])); + vec_xst(vec._vec1, offset16, &(tmp_values[0])); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + } + } + }; + + template + struct StoreHelper { + static void C10_ALWAYS_INLINE + store(const Vectorized& vec, ElementType* ptr, int count = size()) { + if (count == size()) { + vec_xst(vec._vec0, offset0, ptr); + 
vec_xst(vec._vec1, offset16, ptr); + } else if (count > 0) { + __at_align__ ElementType tmp_values[size()]; + vec_xst(vec._vec0, offset0, &(tmp_values[0])); + vec_xst(vec._vec1, offset16, &(tmp_values[0])); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + } + } + }; + + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { + return StoreHelper::store(*this, ptr, count); + } + + C10_ALWAYS_INLINE const vtype& vec0() const { + return _vec0; + } + + C10_ALWAYS_INLINE const vtype& vec1() const { + return _vec1; + } + + C10_ALWAYS_INLINE vinner_data data() const { + return std::make_pair<>(_vec0, _vec1); + } + + C10_ALWAYS_INLINE operator vinner_data() const { + return data(); + } + + C10_ALWAYS_INLINE const vmaskType vecb0() const { + return (vmaskType)_vec0; + } + C10_ALWAYS_INLINE const vmaskType vecb1() const { + return (vmaskType)_vec1; + } + + static Vectorized C10_ALWAYS_INLINE blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return { + vec_sel(a._vec0, b._vec0, mask.vecb0()), + vec_sel(a._vec1, b._vec1, mask.vecb1())}; + } + + template = 0> + C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4) + : _vec0{s1, s2}, _vec1{s3, s4} {} + + template = 0> + C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8) + : _vec0{s1, s2, s3, s4}, _vec1{s5, s6, s7, s8} {} + + template = 0> + C10_ALWAYS_INLINE Vectorized( + T s1, + T s2, + T s3, + T s4, + T s5, + T s6, + T s7, + T s8, + T s9, + T s10, + T s11, + T s12, + T s13, + T s14, + T s15, + T s16) + : _vec0{s1, s2, s3, s4, s5, s6, s7, s8}, + _vec1{s9, s10, s11, s12, s13, s14, s15, s16} {} + + template = 0> + C10_ALWAYS_INLINE Vectorized( + T s1, + T s2, + T s3, + T s4, + T s5, + T s6, + T s7, + T s8, + T s9, + T s10, + T s11, + T s12, + T s13, + T s14, + T s15, + T s16, + T s17, + T s18, + T s19, + T s20, + T s21, + T s22, + T s23, + T s24, + T s25, + T s26, + T s27, + T s28, + T s29, + T s30, + T s31, + T s32) 
+ : _vec0{s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16}, + _vec1{ + s17, + s18, + s19, + s20, + s21, + s22, + s23, + s24, + s25, + s26, + s27, + s28, + s29, + s30, + s31, + s32} {} + + template + static std::enable_if_t> arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized(base, base + step, base + 2 * step, base + 3 * step); + } + + template + static std::enable_if_t> arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + + template + static std::enable_if_t> arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + + template + static std::enable_if_t> arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + + // blend section + template + static std::enable_if_t(mask) == 0, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static 
std::enable_if_t(mask) == 1, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + return b; + } + + template + static std::enable_if_t(mask) == 2, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t(mask) == 3, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t(mask) == 4, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + const vmaskType mask_1st = GetMask1(mask); + return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t(mask) == 5, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + const vmaskType mask_1st = GetMask1(mask); + return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t(mask) == 6, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + const vmaskType mask_2nd = GetMask2(mask); + // generated masks + return {a._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t(mask) == 7, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + const vmaskType mask_2nd = GetMask2(mask); + // generated masks + return {b._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t(mask) == 8, Vectorized> + C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { + const vmaskType mask_1st = GetMask1(mask); + const vmaskType mask_2nd = GetMask2(mask); + return { + (vtype)vec_sel(a._vec0, b._vec0, mask_1st), + (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static inline std::enable_if_t<(Z >= C), Vectorized> set_inner( + const Vectorized& a, + const Vectorized& b, + size_t count) { + return b; + } + + template + static 
inline std::enable_if_t<(Z < C), Vectorized> set_inner( + const Vectorized& a, + const Vectorized& b, + size_t count) { + if (count == Z) + return blend(a, b); + else + return set_inner(a, b, count); + } + + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + if (count == 0) + return a; + return set_inner<1, size()>(a, b, count); + } + + const ElementType& operator[](int idx) const = delete; + ElementType& operator[](int idx) = delete; + + Vectorized _not() const { + return {(vtype)vec_nor(vecb0(), vecb0()), (vtype)vec_nor(vecb1(), vecb1())}; + } + + Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const { + return (*this == other) & Vectorized((T)1.0); + } + Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const { + return (*this != other) & Vectorized((T)1.0); + } + Vectorized C10_ALWAYS_INLINE gt(const Vectorized& other) const { + return (*this > other) & Vectorized((T)1.0); + } + Vectorized C10_ALWAYS_INLINE ge(const Vectorized& other) const { + return (*this >= other) & Vectorized((T)1.0); + } + Vectorized C10_ALWAYS_INLINE lt(const Vectorized& other) const { + return (*this < other) & Vectorized((T)1.0); + } + Vectorized C10_ALWAYS_INLINE le(const Vectorized& other) const { + return (*this <= other) & Vectorized((T)1.0); + } + + template , int> = 0> + Vectorized C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + template , int> = 0> + Vectorized C10_ALWAYS_INLINE abs() const { + return {_vec0, _vec1}; + } + + Vectorized C10_ALWAYS_INLINE neg() const { + return {-_vec0, -_vec1}; + } + + Vectorized isnan() const { + auto x = *this; + auto ret = (x == x); + return ret._not(); + } + + bool has_inf_nan() const { + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } + + 
template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized angle() const { + auto tmp = blendv( + Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); + return blendv(tmp, *this, isnan()); + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized angle() const { + return blendv( + Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); + } + + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized{0}; + } + Vectorized conj() const { + return *this; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + int zero_mask() const { + auto cmp = (*this == Vectorized(0)); + constexpr auto mask_zero_bits = GetBpermZeroMask(); + ZSimdVectBinary result0 = + vec_bperm_u128((ZSimdVectBinary)cmp.vecb0(), mask_zero_bits); + ZSimdVectBinary result1 = + vec_bperm_u128((ZSimdVectBinary)cmp.vecb1(), mask_zero_bits); + return (result0[0] | (result1[0] << (size() / 2))); + } + + Vectorized C10_ALWAYS_INLINE floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE round() const { + return {vec_round(_vec0), vec_round(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE rint() const { + return {vec_rint(_vec0), vec_rint(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vectorized C10_ALWAYS_INLINE frac() const { + return *this - trunc(); + } + + Vectorized C10_ALWAYS_INLINE sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE reciprocal() const { + return Vectorized((T)1) / (*this); + } + Vectorized C10_ALWAYS_INLINE rsqrt() const { + return sqrt().reciprocal(); + } + + template , int> = 0> + inline Vectorized mapOrdinary(float (*const f)(float)) const { + float a00 = f(_vec0[0]); + float a01 = f(_vec0[1]); + float a02 = f(_vec0[2]); + float a03 = 
f(_vec0[3]); + float a10 = f(_vec1[0]); + float a11 = f(_vec1[1]); + float a12 = f(_vec1[2]); + float a13 = f(_vec1[3]); + return Vectorized{a00, a01, a02, a03, a10, a11, a12, a13}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + inline Vectorized mapOrdinary(double (*const f)(double)) const { + return Vectorized(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1])); + } + + template , int> = 0> + inline Vectorized mapOrdinary( + float (*const f)(float, float), + const Vectorized& b) const { + float a00 = f(_vec0[0], b._vec0[0]); + float a01 = f(_vec0[1], b._vec0[1]); + float a02 = f(_vec0[2], b._vec0[2]); + float a03 = f(_vec0[3], b._vec0[3]); + float a10 = f(_vec1[0], b._vec1[0]); + float a11 = f(_vec1[1], b._vec1[1]); + float a12 = f(_vec1[2], b._vec1[2]); + float a13 = f(_vec1[3], b._vec1[3]); + return Vectorized{a00, a01, a02, a03, a10, a11, a12, a13}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + inline Vectorized mapOrdinary( + double (*const f)(double, double), + const Vectorized& b) const { + return Vectorized( + f(_vec0[0], b._vec0[0]), + f(_vec0[1], b._vec0[1]), + f(_vec1[0], b._vec1[0]), + f(_vec1[1], b._vec1[1])); + } + + template < + typename FloatOp, + typename DoubleOp, + typename U = T, + std::enable_if_t, int> = 0> + inline Vectorized mapSleef(FloatOp f, DoubleOp d) const { + vtype a0 = f(_vec0); + vtype a1 = f(_vec1); + return Vectorized{a0, a1}; + } + + template < + typename FloatOp, + typename DoubleOp, + typename U = T, + std::enable_if_t, int> = 0> + inline Vectorized mapSleef(FloatOp f, DoubleOp d) const { + return Vectorized(d(_vec0), d(_vec1)); + } + + template < + typename FloatOp, + typename DoubleOp, + typename U = T, + std::enable_if_t, int> = 0> + inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b) + const { + vtype a0 = f(_vec0, b._vec0); + vtype a1 = f(_vec1, b._vec1); + return Vectorized{a0, a1}; + } + + template < + typename FloatOp, + typename DoubleOp, + typename U = 
T, + std::enable_if_t, int> = 0> + inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b) + const { + return Vectorized(d(_vec0, b._vec0), d(_vec1, b._vec1)); + } + + Vectorized acos() const { + return mapSleef(Sleef_acosf4_u10, Sleef_acosd2_u10); + } + Vectorized asin() const { + return mapSleef(Sleef_asinf4_u10, Sleef_asind2_u10); + } + Vectorized atan() const { + return mapSleef(Sleef_atanf4_u10, Sleef_atand2_u10); + } + Vectorized atanh() const { + return mapSleef(Sleef_atanhf4_u10, Sleef_atanhd2_u10); + } + + Vectorized erf() const { + return mapSleef(Sleef_erff4_u10, Sleef_erfd2_u10); + } + Vectorized erfc() const { + return mapSleef(Sleef_erfcf4_u15, Sleef_erfcd2_u15); + } + + Vectorized exp() const { + return mapSleef(Sleef_expf4_u10, Sleef_expd2_u10); + } + Vectorized exp2() const { + return mapSleef(Sleef_exp2f4_u10, Sleef_exp2d2_u10); + } + Vectorized expm1() const { + return mapSleef(Sleef_expm1f4_u10, Sleef_expm1d2_u10); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fexp_u20() const { + return exp(); + } + + Vectorized log() const { + return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10); + } + Vectorized log2() const { + return mapSleef(Sleef_log2f4_u10, Sleef_log2d2_u10); + } + Vectorized log10() const { + return mapSleef(Sleef_log10f4_u10, Sleef_log10d2_u10); + } + Vectorized log1p() const { + return mapSleef(Sleef_log1pf4_u10, Sleef_log1pd2_u10); + } + + Vectorized sin() const { + return mapSleef(Sleef_sinf4_u10, Sleef_sind2_u10); + } + Vectorized sinh() const { + return mapSleef(Sleef_sinhf4_u10, Sleef_sinhd2_u10); + } + Vectorized cos() const { + return mapSleef(Sleef_cosf4_u10, Sleef_cosd2_u10); + } + Vectorized cosh() const { + return mapSleef(Sleef_coshf4_u10, Sleef_coshd2_u10); + } + + Vectorized tan() const { + return mapSleef(Sleef_tanf4_u10, Sleef_tand2_u10); + } + Vectorized tanh() const { + return mapSleef(Sleef_tanhf4_u10, Sleef_tanhd2_u10); + } + + Vectorized lgamma() const { + return 
mapSleef(Sleef_lgammaf4_u10, Sleef_lgammad2_u10); + } + + Vectorized atan2(const Vectorized& b) const { + return mapSleef(Sleef_atan2f4_u10, Sleef_atan2d2_u10, b); + } + Vectorized copysign(const Vectorized& sign) const { + return mapSleef(Sleef_copysignf4, Sleef_copysignd2, sign); + } + Vectorized fmod(const Vectorized& q) const { + return mapSleef(Sleef_fmodf4, Sleef_fmodd2, q); + } + + Vectorized hypot(const Vectorized& b) const { + return mapSleef(Sleef_hypotf4_u05, Sleef_hypotd2_u05, b); + } + + Vectorized pow(const Vectorized& b) const { + return mapSleef(Sleef_powf4_u10, Sleef_powd2_u10, b); + } + + Vectorized nextafter(const Vectorized& b) const { + return mapSleef(Sleef_nextafterf4, Sleef_nextafterd2, b); + } + + Vectorized erfinv() const { + return mapOrdinary(calc_erfinv); + } + + Vectorized digamma() const { + return mapOrdinary(calc_digamma); + } + + Vectorized igamma(const Vectorized& x) const { + return mapOrdinary(calc_igamma, x); + } + + Vectorized igammac(const Vectorized& x) const { + return mapOrdinary(calc_igammac, x); + } + + Vectorized i0() const { + return mapOrdinary(calc_i0); + } + + Vectorized i0e() const { + return mapOrdinary(calc_i0e); + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized minimum(const Vectorized& other) const { + return {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; + } + + /* Propagates NaN if either input is a NaN. */ + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized minimum(const Vectorized& other) const { + Vectorized tmp = { + vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; + tmp = blendv(tmp, *this, isnan()); + return blendv(tmp, other, other.isnan()); + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized maximum(const Vectorized& other) const { + return {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; + } + + /* Propagates NaN if either input is a NaN. 
*/ + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized maximum(const Vectorized& other) const { + Vectorized tmp = { + vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; + tmp = blendv(tmp, *this, isnan()); + return blendv(tmp, other, other.isnan()); + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized clamp_min(const Vectorized& min) const { + return {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)}; + } + + /* Keeps NaN if actual value is NaN */ + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized clamp_min(const Vectorized& min) const { + Vectorized tmp = {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)}; + return blendv(tmp, *this, isnan()); + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized clamp_max(const Vectorized& max) const { + return {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)}; + } + + /* Keeps NaN if actual value is NaN */ + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized clamp_max(const Vectorized& max) const { + Vectorized tmp = {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)}; + return blendv(tmp, *this, isnan()); + } + + template , int> = 0> + Vectorized swapped() const { + auto swap_mask = GetSwapMaskFloat(); + vtype v0 = vec_perm(_vec0, _vec0, swap_mask); + vtype v1 = vec_perm(_vec1, _vec1, swap_mask); + return {v0, v1}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized swapped() const { + vtype v0 = {_vec0[1], _vec0[0]}; + vtype v1 = {_vec1[1], _vec1[0]}; + return {v0, v1}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + static Vectorized mergee(Vectorized& first, Vectorized& second) { + return { + vec_mergee(first._vec0, second._vec0), + vec_mergee(first._vec1, second._vec1)}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + static Vectorized mergeo(Vectorized& first, Vectorized& second) { + return { + 
vec_mergeo(first._vec0, second._vec0), + vec_mergeo(first._vec1, second._vec1)}; + } + + static Vectorized horizontal_add_perm( + Vectorized& first, + Vectorized& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.swapped(); // 2perm + auto second_perm = second.swapped(); // 2perm + // summ + auto first_ret = first + first_perm; // 2add + auto second_ret = second + second_perm; // 2 add + // now lets choose evens + return mergee(first_ret, second_ret); // 2 mergee's + } + + static Vectorized horizontal_sub_perm( + Vectorized& first, + Vectorized& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.swapped(); // 2perm + auto second_perm = second.swapped(); // 2perm + // summ + auto first_ret = first - first_perm; // 2sub + auto second_ret = second - second_perm; // 2 sub + // now lets choose evens + return mergee(first_ret, second_ret); // 2 mergee's + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized mergee() const { + return {vec_mergee(_vec0, _vec0), vec_mergee(_vec1, _vec1)}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized mergeo() const { + return {vec_mergeo(_vec0, _vec0), vec_mergeo(_vec1, _vec1)}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized to_vec_float_helper() const { + int32_t values[8] = { + _vec0[0], + _vec0[1], + _vec0[2], + _vec0[3], + _vec0[4], + _vec0[5], + _vec0[6], + _vec0[7], + }; + + return Vectorized{ + values[0], + values[1], + values[2], + values[3], + values[4], + values[5], + values[6], + values[7]}; + } + + template < + typename U = T, + std::enable_if_t, int> = 0> + Vectorized to_vec_uint8_helper() const { + // helper function for float to uint8_t conversion + uint8_t values[8] = { + static_cast(_vec0[0]), + 
static_cast(_vec0[1]), + static_cast(_vec0[2]), + static_cast(_vec0[3]), + static_cast(_vec1[0]), + static_cast(_vec1[1]), + static_cast(_vec1[2]), + static_cast(_vec1[3]), + }; + + return Vectorized{ + values[0], values[1], values[2], values[3], values[4], values[5], + values[6], values[7], 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, + }; + } +}; + +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() + b.vec0(), a.vec1() + b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() - b.vec0(), a.vec1() - b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() * b.vec0(), a.vec1() * b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator/( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() & b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() & b.vecb1())}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() | b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() | b.vecb1())}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() ^ b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() ^ b.vecb1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + 
vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())} \ + ._not(); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpgt(a.vec0(), b.vec0()), vec_cmpgt(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpge(a.vec0(), b.vec0()), vec_cmpge(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmplt(a.vec0(), b.vec0()), vec_cmplt(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())}; \ + } + +ZVECTOR_OPERATORS(float) +ZVECTOR_OPERATORS(double) +ZVECTOR_OPERATORS(int8_t) +ZVECTOR_OPERATORS(uint8_t) +ZVECTOR_OPERATORS(uint16_t) +ZVECTOR_OPERATORS(int16_t) +ZVECTOR_OPERATORS(int32_t) +ZVECTOR_OPERATORS(int64_t) + +#undef ZVECTOR_OPERATORS + +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator<<( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr Vectorized::ElementType max_shift = \ + sizeof(Vectorized::ElementType) * CHAR_BIT; \ + \ + Vectorized::ElementType a_array[Vectorized::size()]; \ + Vectorized::ElementType b_array[Vectorized::size()]; \ + Vectorized::ElementType c_array[Vectorized::size()]; \ + \ + a.store(a_array); \ + b.store(b_array); \ + \ + for (int i = 0; i != Vectorized::size(); i++) { \ + typex shift = b_array[i]; \ + if ((static_cast>(shift) < 0) || \ + (shift >= max_shift)) { \ + c_array[i] = 0; \ + } else { \ + c_array[i] = static_cast>(a_array[i]) \ + << 
shift; \ + } \ + } \ + \ + return Vectorized::loadu(c_array); \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator>>( \ + const Vectorized& a, const Vectorized& b) { \ + /* right shift value to retain sign bit for signed and no bits for \ + * unsigned */ \ + constexpr Vectorized::ElementType max_shift = \ + sizeof(typex) * CHAR_BIT - std::is_signed_v; \ + \ + Vectorized::ElementType a_array[Vectorized::size()]; \ + Vectorized::ElementType b_array[Vectorized::size()]; \ + Vectorized::ElementType c_array[Vectorized::size()]; \ + \ + a.store(a_array); \ + b.store(b_array); \ + \ + for (int i = 0; i != Vectorized::size(); i++) { \ + typex shift = b_array[i]; \ + if ((static_cast>(shift) < 0) || \ + (shift >= max_shift)) { \ + c_array[i] = a_array[i] >> max_shift; \ + } else { \ + c_array[i] = a_array[i] >> shift; \ + } \ + } \ + \ + return Vectorized::loadu(c_array); \ + } \ + \ + template <> \ + inline Vectorized operator~(const Vectorized& a) { \ + return a._not(); \ + } + +ZVECTOR_OPERATORS(int8_t) +ZVECTOR_OPERATORS(uint8_t) +ZVECTOR_OPERATORS(uint16_t) +ZVECTOR_OPERATORS(int16_t) +ZVECTOR_OPERATORS(int32_t) +ZVECTOR_OPERATORS(int64_t) + +#undef ZVECTOR_OPERATORS + +#define DEFINE_MAXMIN_FUNCS(operand_type) \ + template <> \ + Vectorized inline maximum( \ + const Vectorized& a, const Vectorized& b) { \ + return a.maximum(b); \ + } \ + template <> \ + Vectorized inline minimum( \ + const Vectorized& a, const Vectorized& b) { \ + return a.minimum(b); \ + } + +#define DEFINE_CLAMP_MAXMIN_FUNCS(typex) \ + DEFINE_MAXMIN_FUNCS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_min( \ + const Vectorized& a, const Vectorized& min) { \ + return a.clamp_min(min); \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_max( \ + const Vectorized& a, const Vectorized& max) { \ + return a.clamp_max(max); \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + 
return clamp_max(clamp_min(a, min), max); \ + } + +DEFINE_CLAMP_MAXMIN_FUNCS(int8_t) +DEFINE_CLAMP_MAXMIN_FUNCS(uint8_t) +DEFINE_CLAMP_MAXMIN_FUNCS(int16_t) +DEFINE_CLAMP_MAXMIN_FUNCS(int32_t) +DEFINE_CLAMP_MAXMIN_FUNCS(int64_t) +DEFINE_CLAMP_MAXMIN_FUNCS(float) +DEFINE_CLAMP_MAXMIN_FUNCS(double) + +namespace { /* unnamed namespace */ + +#if !defined(vec_float) || __ARCH__ < 13 +#warning \ + "float->int and int->float conversion is simulated. compile for z15 for improved performance" +inline ZSimdVect vec_int_flt(const ZSimdVect x) { + return ZSimdVect{float(x[0]), float(x[1]), float(x[2]), float(x[3])}; +} +inline ZSimdVect vec_flt_int(const ZSimdVect x) { + return ZSimdVect{int(x[0]), int(x[1]), int(x[2]), int(x[3])}; +} +#else +#define vec_int_flt vec_float +#define vec_flt_int vec_signed +#endif + +Vectorized zvec_convert_to_float(const Vectorized& x) { + return {vec_int_flt(x.vec0()), vec_int_flt(x.vec1())}; +} + +Vectorized zvec_convert_to_int(const Vectorized& x) { + return {vec_flt_int(x.vec0()), vec_flt_int(x.vec1())}; +} + +Vectorized zvec_convert_to_float(const Vectorized& x) { + return {vec_double(x.vec0()), vec_double(x.vec1())}; +} + +Vectorized zvec_convert_to_int(const Vectorized& x) { + return {vec_signed(x.vec0()), vec_signed(x.vec1())}; +} + +} /* unnamed namespace */ + +template +Vectorized cast_zvector(const Vectorized& x) { + using cast_type = typename Vectorized::vtype; + return Vectorized{(cast_type)x.vec0(), (cast_type)x.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + __builtin_s390_vfmasb(a.vec0(), b.vec0(), c.vec0()), + __builtin_s390_vfmasb(a.vec1(), b.vec1(), c.vec1())}; +} +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + __builtin_s390_vfmadb(a.vec0(), b.vec0(), c.vec0()), + __builtin_s390_vfmadb(a.vec1(), b.vec1(), c.vec1())}; +} 
+template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} +template <> +Vectorized C10_ALWAYS_INLINE fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +convert_to_int_of_same_size(const Vectorized& src) { + return zvec_convert_to_int(src); +} + +template <> +Vectorized C10_ALWAYS_INLINE +convert_to_int_of_same_size(const Vectorized& src) { + return zvec_convert_to_int(src); +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + // int32_t and float have same size + int64_t i; + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + const int32_t* src_a = src + i; + float* dst_a = dst + i; + auto input_vec = Vectorized::loadu(src_a); + auto output_vec = zvec_convert_to_float(input_vec); + output_vec.store(dst_a); + } + + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int64_t* src, double* dst, int64_t n) { + int64_t i; + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + const int64_t* src_a = src + i; + double* dst_a = dst + i; + auto input_vec = Vectorized::loadu(src_a); + auto output_vec = zvec_convert_to_float(input_vec); + output_vec.store(dst_a); + } + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +#define DEFINE_REINTERPRET_CAST_FUNCS(Fst, Cst) \ + template <> \ + C10_ALWAYS_INLINE Vectorized cast( \ + const Vectorized& src) { \ + return cast_zvector(src); \ + } + +#define 
DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(Fst) \ + DEFINE_REINTERPRET_CAST_FUNCS(Fst, double) \ + DEFINE_REINTERPRET_CAST_FUNCS(Fst, float) \ + DEFINE_REINTERPRET_CAST_FUNCS(Fst, int64_t) \ + DEFINE_REINTERPRET_CAST_FUNCS(Fst, int32_t) \ + DEFINE_REINTERPRET_CAST_FUNCS(Fst, int16_t) + +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t) + +#undef DEFINE_REINTERPRET_CAST_FUNCS + +template +struct unpack_type { + using type = T; +}; +template <> +struct unpack_type { + using type = int16_t; +}; +template <> +struct unpack_type { + using type = int16_t; +}; +template <> +struct unpack_type { + using type = int32_t; +}; + +template +struct pack_type { + using type = T; +}; +template <> +struct pack_type { + using type = int8_t; +}; +template <> +struct pack_type { + using type = int16_t; +}; + +namespace { /* unnamed namespace */ + +template ::type> +std::pair, Vectorized> unpack(const Vectorized& x) { + auto vec0 = vec_unpackh(x.vec0()); + auto vec1 = vec_unpackl(x.vec0()); + auto vec2 = vec_unpackh(x.vec1()); + auto vec3 = vec_unpackl(x.vec1()); + return {Vectorized{vec0, vec1}, Vectorized{vec2, vec3}}; +} + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function") +template <> +std::pair, Vectorized> unpack( + const Vectorized& x) { + using typeX = typename Vectorized::vtype; + typeX vec0 = vec_unpackh(x.vec0()); + typeX vec1 = vec_unpackl(x.vec0()); + typeX vec2 = vec_unpackh(x.vec1()); + typeX vec3 = vec_unpackl(x.vec1()); + // auto mask = Vectorized(0xFF); + // vec0 = vec0 & mask; + // vec1 = vec1 & mask; + // vec2 = vec2 & mask; + // vec3 = vec3 & mask; + return { + cast_zvector(Vectorized{vec0, vec1}), + cast_zvector(Vectorized{vec2, vec3})}; +} +C10_DIAGNOSTIC_POP() + +template ::type> +Vectorized pack(const Vectorized& first, const Vectorized& second) { + auto vec0 = 
vec_packs(first.vec0(), first.vec1()); + auto vec1 = vec_packs(second.vec0(), second.vec1()); + return Vectorized{vec0, vec1}; +} + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function") +template <> +Vectorized pack( + const Vectorized& first, + const Vectorized& second) { + auto vec0 = vec_packsu(first.vec0(), first.vec1()); + auto vec1 = vec_packsu(second.vec0(), second.vec1()); + return Vectorized{vec0, vec1}; +} +C10_DIAGNOSTIC_POP() + +} /* unnamed namespace */ + +//////////////////////////////////QUANT/////////////////////////////////////////// +template +struct is_vec_specialized_for< + T, + std::enable_if_t()>> + : std::bool_constant {}; + +template +struct Vectorized()>> { + public: + using value_type = typename T::underlying; + using vtype = ZSimdVect; + using vmaskType = ZSimdVectBinary; + using vinner_type = Vectorized; + using size_type = int; + + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(value_type); + } + + static constexpr int float_num_vecs() { + return size() / Vectorized::size(); + } + static constexpr int int_num_vecs() { + return float_num_vecs(); + } + using float_vec_return_type = std::array, float_num_vecs()>; + using int_vec_return_type = + std::array, int_num_vecs()>; + + private: + vinner_type _vec; + + public: + Vectorized() {} + + explicit C10_ALWAYS_INLINE Vectorized(vinner_type v) : _vec{v} {} + Vectorized(const T& val) : _vec(val.val_) {} + + C10_ALWAYS_INLINE const vinner_type& vec() const { + return _vec; + } + + template + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + return Vectorized{vinner_type::loadu(ptr, count)}; + } + + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { + _vec.store(ptr, count); + } + + Vectorized relu(Vectorized zero_point) const { + return Vectorized{_vec.maximum(zero_point._vec)}; + } + + Vectorized relu6(Vectorized zero_point, Vectorized q_six) const { + auto ret_max = _vec.maximum(zero_point._vec); + auto 
ret_min = ret_max.minimum(q_six._vec); + return Vectorized{ret_min}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 1, int> = 0> + int_vec_return_type widening_subtract(Vectorized b) const { + return {*this - b}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 1, int> = 0> + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + auto float_val = zvec_convert_to_float(_vec); + return {fmadd(scale, float_val, scale_zp_premul)}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 1, int> = 0> + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + auto float_val = zvec_convert_to_float(_vec); + return {(float_val - zero_point) * scale}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 1, int> = 0> + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + Vectorized vecf = rhs[0]; + vecf = vecf * Vectorized(inverse_scale); + vecf = vecf.rint() + Vectorized((float)(zero_point)); + auto veci = zvec_convert_to_int(vecf); + + return Vectorized{veci}; + } + + template < + typename U = T, + std::enable_if_t::int_num_vecs() == 1, int> = 0> + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + Vectorized vi = inp[0]; + auto vecf = zvec_convert_to_float(vi.vec()); + vecf = vecf * Vectorized(multiplier); + vecf = vecf.rint(); + auto veci = zvec_convert_to_int(vecf) + Vectorized(zero_point); + + return Vectorized{veci}; + } + + template < + typename U = T, + std::enable_if_t::int_num_vecs() == 4, int> = 0> + int_vec_return_type widening_subtract(Vectorized b) const { + auto ret16 = unpack(_vec); + auto ret16B = unpack(b.vec()); + auto ret32_0 = unpack(ret16.first); + auto ret32_1 = unpack(ret16.second); + auto ret32B_0 = 
unpack(ret16B.first); + auto ret32B_1 = unpack(ret16B.second); + + return { + Vectorized(ret32_0.first - ret32B_0.first), + Vectorized(ret32_0.second - ret32B_0.second), + Vectorized(ret32_1.first - ret32B_1.first), + Vectorized(ret32_1.second - ret32B_1.second)}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 4, int> = 0> + float_vec_return_type C10_ALWAYS_INLINE dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + // unpacking unsigned as signed + auto ret16 = unpack(_vec); + auto ret32_0 = unpack(ret16.first); + auto ret32_1 = unpack(ret16.second); + + auto vecf_0 = zvec_convert_to_float(ret32_0.first); + auto vecf_1 = zvec_convert_to_float(ret32_0.second); + + auto vecf_2 = zvec_convert_to_float(ret32_1.first); + auto vecf_3 = zvec_convert_to_float(ret32_1.second); + return { + fmadd(scale, vecf_0, scale_zp_premul), + fmadd(scale, vecf_1, scale_zp_premul), + fmadd(scale, vecf_2, scale_zp_premul), + fmadd(scale, vecf_3, scale_zp_premul)}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 4, int> = 0> + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + // unpacking unsigned as signed + auto ret16 = unpack(_vec); + auto ret32_0 = unpack(ret16.first); + auto ret32_1 = unpack(ret16.second); + + auto vecf_0 = zvec_convert_to_float(ret32_0.first); + auto vecf_1 = zvec_convert_to_float(ret32_0.second); + + auto vecf_2 = zvec_convert_to_float(ret32_1.first); + auto vecf_3 = zvec_convert_to_float(ret32_1.second); + + return { + (vecf_0 - zero_point) * scale, + (vecf_1 - zero_point) * scale, + (vecf_2 - zero_point) * scale, + (vecf_3 - zero_point) * scale}; + } + + template < + typename U = T, + std::enable_if_t::float_num_vecs() == 4, int> = 0> + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + auto vec_inverse = Vectorized(inverse_scale); + auto 
vec_zero_point = Vectorized((float)zero_point); + + auto vecf0 = rhs[0]; + auto vecf2 = rhs[1]; + auto vecf4 = rhs[2]; + auto vecf6 = rhs[3]; + + vecf0 = vecf0 * vec_inverse; + vecf2 = vecf2 * vec_inverse; + vecf4 = vecf4 * vec_inverse; + vecf6 = vecf6 * vec_inverse; + + vecf0 = vecf0.rint() + vec_zero_point; + vecf2 = vecf2.rint() + vec_zero_point; + vecf4 = vecf4.rint() + vec_zero_point; + vecf6 = vecf6.rint() + vec_zero_point; + + auto veci0 = zvec_convert_to_int(vecf0); + auto veci2 = zvec_convert_to_int(vecf2); + auto veci4 = zvec_convert_to_int(vecf4); + auto veci6 = zvec_convert_to_int(vecf6); + + auto vecshi0 = pack(veci0, veci2); + auto vecshi2 = pack(veci4, veci6); + auto ret = pack(vecshi0, vecshi2); + + return Vectorized{ret}; + } + + template < + typename U = T, + std::enable_if_t::int_num_vecs() == 4, int> = 0> + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + Vectorized vec_multiplier = Vectorized(multiplier); + Vectorized vec_zero_point = Vectorized(zero_point); + + Vectorized vi0 = inp[0]; + Vectorized vi1 = inp[1]; + Vectorized vi2 = inp[2]; + Vectorized vi3 = inp[3]; + + auto vecf0 = zvec_convert_to_float(vi0.vec()); + auto vecf2 = zvec_convert_to_float(vi1.vec()); + + auto vecf4 = zvec_convert_to_float(vi2.vec()); + auto vecf6 = zvec_convert_to_float(vi3.vec()); + + vecf0 = vecf0 * vec_multiplier; + vecf2 = vecf2 * vec_multiplier; + + vecf4 = vecf4 * vec_multiplier; + vecf6 = vecf6 * vec_multiplier; + + vecf0 = vecf0.rint(); + vecf2 = vecf2.rint(); + vecf4 = vecf4.rint(); + vecf6 = vecf6.rint(); + + auto veci0 = zvec_convert_to_int(vecf0); + auto veci2 = zvec_convert_to_int(vecf2); + auto veci4 = zvec_convert_to_int(vecf4); + auto veci6 = zvec_convert_to_int(vecf6); + + veci0 = veci0 + vec_zero_point; + veci2 = veci2 + vec_zero_point; + + veci4 = veci4 + vec_zero_point; + veci6 = veci6 + vec_zero_point; + + auto vecshi0 = pack(veci0, veci2); + auto vecshi2 = pack(veci4, 
veci6); + + auto ret = pack(vecshi0, vecshi2); + + return Vectorized{ret}; + } + + Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const { + return Vectorized{_vec.eq(other._vec)}; + } + Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const { + return Vectorized{_vec.ne(other._vec)}; + } + Vectorized C10_ALWAYS_INLINE gt(const Vectorized& other) const { + return Vectorized{_vec.gt(other._vec)}; + } + Vectorized C10_ALWAYS_INLINE ge(const Vectorized& other) const { + return Vectorized{_vec.ge(other._vec)}; + } + Vectorized C10_ALWAYS_INLINE lt(const Vectorized& other) const { + return Vectorized{_vec.lt(other._vec)}; + } + Vectorized C10_ALWAYS_INLINE le(const Vectorized& other) const { + return Vectorized{_vec.le(other._vec)}; + } + + Vectorized clamp_min(const Vectorized& min) const { + return Vectorized{_vec.clamp_min(min._vec)}; + } + + Vectorized clamp_max(const Vectorized& max) const { + return Vectorized{_vec.clamp_max(max._vec)}; + } + + Vectorized minimum(const Vectorized& other) const { + return Vectorized{_vec.minimum(other._vec)}; + } + + Vectorized maximum(const Vectorized& other) const { + return Vectorized{_vec.maximum(other._vec)}; + } +}; + +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() + b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() - b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() * b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator/( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() / b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return 
Vectorized{a.vec() & b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() | b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() ^ b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() == b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() != b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() > b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() >= b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() < b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() <= b.vec()}; \ + } + +ZVECTOR_OPERATORS(c10::qint32) +ZVECTOR_OPERATORS(c10::qint8) +ZVECTOR_OPERATORS(c10::quint8) + +#undef ZVECTOR_OPERATORS + +DEFINE_CLAMP_MAXMIN_FUNCS(c10::quint8) +DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint8) +DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint32) + +template +constexpr auto real_mask() { + return (ZSimdVect)ZSimdVectBinary{0xFFFFFFFF, 0, 0xFFFFFFFF, 0}; +} + +template <> +constexpr auto real_mask() { + return (ZSimdVect)ZSimdVectBinary{0xFFFFFFFFFFFFFFFF, 0}; +} + +template +constexpr auto image_mask() { + return (ZSimdVect)ZSimdVectBinary{0, 0xFFFFFFFF, 0, 0xFFFFFFFF}; +} + +template <> +constexpr auto image_mask() { + return (ZSimdVect)ZSimdVectBinary{0, 0xFFFFFFFFFFFFFFFF}; +} + +template +constexpr auto rsign_mask() { + return ZSimdVect{-0.f, 0.f, -0.f, 
0.f}; +} + +template <> +constexpr auto rsign_mask() { + return ZSimdVect{-0.0, 0.f}; +} + +template +constexpr auto isign_mask() { + return ZSimdVect{0.0, -0.f, 0.0, -0.f}; +} + +template <> +constexpr auto isign_mask() { + return ZSimdVect{0.0, -0.0}; +} + +template +constexpr auto image_one() { + return ZSimdVect{0, 1.f, 0, 1.f}; +} + +template <> +constexpr auto image_one() { + return ZSimdVect{0.0, 1.0}; +} + +template +constexpr auto pi_half() { + return ZSimdVect{(float)(M_PI / 2.0), 0.f, (float)(M_PI / 2.0), 0.f}; +} + +template <> +constexpr auto pi_half() { + return ZSimdVect{M_PI / 2.0, 0.0}; +} + +template +constexpr auto image_half() { + return ZSimdVect{0, 0.5f, 0, 0.5f}; +} + +template <> +constexpr auto image_half() { + return ZSimdVect{0.0, 0.5}; +} + +template +constexpr U log2e_inv() { + return static_cast(1.4426950408889634); +} + +template +constexpr U log10e_inv() { + return static_cast(0.43429448190325176); +} + +template +struct is_vec_specialized_for< + T, + std::enable_if_t()>> + : std::bool_constant {}; + +template +struct Vectorized()>> { + public: + using underline_type = decltype(std::declval().imag()); + using value_type = T; + using vtype = ZSimdVect; + using vmaskType = ZSimdVectBinary; + using vinner_type = Vectorized; + using size_type = int; + using vinner_data = typename Vectorized::vinner_data; + + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(value_type); + } + + private: + vinner_type _vec; + + public: + Vectorized() {} + + C10_ALWAYS_INLINE Vectorized(const vinner_data& v) + : _vec{v.first, v.second} {} + + template = 0> + C10_ALWAYS_INLINE Vectorized(T s1, T s2) + : _vec{s1.real(), s1.imag(), s2.real(), s2.imag()} {} + + template = 0> + C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4) + : _vec{ + s1.real(), + s1.imag(), + s2.real(), + s2.imag(), + s3.real(), + s3.imag(), + s4.real(), + s4.imag()} {} + + template = 0> + C10_ALWAYS_INLINE Vectorized(T s) : Vectorized(s, s) {} + + template = 0> + 
C10_ALWAYS_INLINE Vectorized(T s) : Vectorized(s, s, s, s) {} + + C10_ALWAYS_INLINE operator vinner_type() const { + return _vec; + } + + C10_ALWAYS_INLINE const vinner_type& vec() const { + return _vec; + } + + C10_ALWAYS_INLINE operator vinner_data() const { + return _vec.data(); + } + + C10_ALWAYS_INLINE vinner_data data() const { + return _vec.data(); + } + + template + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + return Vectorized{vinner_type::loadu(ptr, 2 * count)}; + } + + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { + return _vec.store(ptr, 2 * count); + } + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // convert std::complex index mask to V index mask: xy -> xxyy + vinner_type vmask = mask.vec(); + auto mask_complex = vinner_type( + vec_mergeh(vmask.vec0(), vmask.vec0()), + vec_mergeh(vmask.vec1(), vmask.vec1())); + return Vectorized{vinner_type::blendv(a.vec(), b.vec(), mask_complex)}; + } + + template + static auto C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + constexpr int mask_complex = maskForComplex(mask); + return Vectorized{ + vinner_type::template blend(a.vec(), b.vec())}; + } + + template + static std::enable_if_t> arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized(base, base + step); + } + + template + static std::enable_if_t> arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + value_type(2) * step, + base + value_type(3) * step); + } + + template + static inline std::enable_if_t<(Z >= C), Vectorized> set_inner( + const Vectorized& a, + const Vectorized& b, + size_t count) { + return b; + } + + template + static inline std::enable_if_t<(Z < C), Vectorized> set_inner( + const Vectorized& a, + const Vectorized& b, + size_t count) { + if (count == Z) + return blend(a, b); + else + return set_inner(a, b, count); + } + 
+ static Vectorized set( + const Vectorized& a, + const Vectorized& b, + size_t count = size()) { + if (count == 0) + return a; + return set_inner<1, size()>(a, b, count); + } + + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + Vectorized mapOrdinary(T (*const f)(const T&)) const { + auto v0 = _vec.vec0(); + auto v1 = _vec.vec1(); + return Vectorized{ + f(T(v0[0], v0[1])), + f(T(v0[2], v0[3])), + f(T(v1[0], v1[1])), + f(T(v1[2], v1[3]))}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + Vectorized mapOrdinary(T (*const f)(const T&)) const { + auto v0 = _vec.vec0(); + auto v1 = _vec.vec1(); + return Vectorized{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + Vectorized mapOrdinary(T (*const f)(T)) const { + auto v0 = _vec.vec0(); + auto v1 = _vec.vec1(); + return Vectorized{ + f(T(v0[0], v0[1])), + f(T(v0[2], v0[3])), + f(T(v1[0], v1[1])), + f(T(v1[2], v1[3]))}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + Vectorized mapOrdinary(T (*const f)(T)) const { + auto v0 = _vec.vec0(); + auto v1 = _vec.vec1(); + return Vectorized{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + inline Vectorized mapOrdinary( + T (*const f)(const T&, const T&), + const Vectorized& b) const { + auto v0 = _vec.vec0(); + auto v1 = _vec.vec1(); + auto bvec = b.vec(); + auto b0 = bvec.vec0(); + auto b1 = bvec.vec1(); + T a00 = f(T(v0[0], v0[1]), T(b0[0], b0[1])); + T a01 = f(T(v0[2], v0[3]), T(b0[2], b0[3])); + T a02 = f(T(v1[0], v1[1]), T(b1[0], b1[1])); + T a03 = f(T(v1[2], v1[3]), T(b1[2], b1[3])); + return Vectorized{a00, a01, a02, a03}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + inline Vectorized mapOrdinary( + T (*const f)(const T&, const T&), + const 
Vectorized& b) const { + auto v0 = _vec.vec0(); + auto v1 = _vec.vec1(); + auto bvec = b.vec(); + auto b0 = bvec.vec0(); + auto b1 = bvec.vec1(); + U a00 = f(U(v0[0], v0[1]), U(b0[0], b0[1])); + U a01 = f(U(v1[0], v1[1]), U(b1[0], b1[1])); + return Vectorized{a00, a01}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + static typename Vectorized::vinner_type real_neg( + const typename Vectorized::vinner_type& a) { + const auto swap_mask = ZSimdVectBinary{ + 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}; + + auto a_neg = a.neg(); + vtype v0 = vec_perm(a_neg.vec0(), a.vec0(), swap_mask); + vtype v1 = vec_perm(a_neg.vec1(), a.vec1(), swap_mask); + return {v0, v1}; + } + + template < + typename U = T, + std::enable_if_t>::value, int> = 0> + static typename Vectorized::vinner_type real_neg( + const typename Vectorized::vinner_type& a) { + auto a_neg = a.neg(); + vtype v0 = {a_neg.vec0()[0], a.vec0()[1]}; + vtype v1 = {a_neg.vec1()[0], a.vec1()[1]}; + return {v0, v1}; + } + + Vectorized angle2_() const { + auto b_a = _vec.swapped(); // b a + return Vectorized{_vec.atan2(b_a).swapped()}; + } + + Vectorized angle() const { + return angle2_().real(); + } + + Vectorized atan() const { + // atan(x) = i/2 * ln((i + z)/(i - z)) + auto ione = Vectorized{vinner_type(image_one())}; + auto sum = ione + *this; + auto sub = ione - *this; + auto ln = (sum / sub).log(); // ln((i + z)/(i - z)) + return ln * + Vectorized{vinner_type(image_half())}; // i/2*ln() + } + + Vectorized atanh() const { + return mapOrdinary(std::atanh); + } + + Vectorized asin() const { + // asin(x) + // = -i*ln(iz + sqrt(1 -z^2)) + // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) +#if 1 + vinner_type cnj = conj().vec(); + vinner_type b_a = cnj.swapped(); + vinner_type ab = cnj * b_a; + vinner_type im = ab + ab; + vinner_type val_2 = _vec * _vec; + vinner_type val_2_swapped = val_2.swapped(); + vinner_type re = 
vinner_type::horizontal_sub_perm(val_2, val_2_swapped); + re = vinner_type(static_cast(1)) - re; + constexpr int blend_mask = + blend_choice(); // 0x0A for complex , 0xAA for complex + vinner_type blendx = vinner_type::template blend(re, im); + auto root = Vectorized(blendx).sqrt(); + auto ln = Vectorized(Vectorized(b_a) + root).log(); + return Vectorized(ln.vec().swapped()).conj(); +#else + return mapOrdinary(std::asin); +#endif + } + + Vectorized acos() const { + // acos(x) = pi/2 - asin(x) + return Vectorized(vinner_type(pi_half())) - asin(); + } + + Vectorized sin() const { + return mapOrdinary(std::sin); + } + Vectorized sinh() const { + return mapOrdinary(std::sinh); + } + Vectorized cos() const { + return mapOrdinary(std::cos); + } + Vectorized cosh() const { + return mapOrdinary(std::cosh); + } + Vectorized ceil() const { + return Vectorized{_vec.ceil()}; + } + Vectorized floor() const { + return Vectorized{_vec.floor()}; + } + Vectorized neg() const { + return Vectorized(_vec.neg()); + } + Vectorized round() const { + return Vectorized{_vec.round()}; + } + Vectorized tan() const { + return mapOrdinary(std::tan); + } + Vectorized tanh() const { + return mapOrdinary(std::tanh); + } + Vectorized trunc() const { + return Vectorized{_vec.trunc()}; + } + + Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const { + auto eq = _vec.eq(other._vec); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + auto real = eq & vinner_type(real_mask()); + auto imag = (eq & vinner_type(image_mask())).swapped(); + return Vectorized{real & imag}; + } + Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const { + auto ne = _vec.ne(other._vec); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + auto real = ne & vinner_type(real_mask()); + auto imag = (ne & vinner_type(image_mask())).swapped(); + 
return Vectorized{real | imag}; + } + + Vectorized real() const { + return Vectorized(_vec & vinner_type(real_mask())); + } + Vectorized imag_() const { + return Vectorized(_vec & vinner_type(image_mask())); + } + Vectorized imag() const { + return Vectorized{ + (_vec & vinner_type(image_mask())).swapped()}; + } + + Vectorized conj() const { + return Vectorized(_vec ^ vinner_type(isign_mask())); + } + + vinner_data abs_2_() const { + auto a = _vec * _vec; + a = a + a.swapped(); + return a.mergee().data(); + } + + static T abs_helper(const T& value) { + return T(std::abs(value)); + } + + Vectorized abs() const { + return mapOrdinary(abs_helper); + } + + Vectorized exp() const { + return mapOrdinary(std::exp); + } + + Vectorized exp2() const { + return mapOrdinary(exp2_impl); + } + + Vectorized expm1() const { + return mapOrdinary(std::expm1); + } + + Vectorized log() const { + return mapOrdinary(std::log); + } + + Vectorized log2() const { + // log2eB_inv + auto ret = log(); + return Vectorized{ret._vec * vinner_type(log2e_inv())}; + } + + Vectorized log10() const { + auto ret = log(); + return Vectorized{ret._vec * vinner_type(log10e_inv())}; + } + + Vectorized log1p() const { + return mapOrdinary(std::log1p); + } + + Vectorized sgn() const { + return mapOrdinary(at::native::sgn_impl); + } + + Vectorized pow(const Vectorized& exp) const { + return mapOrdinary(std::pow, exp); + } + + Vectorized sqrt() const { + return mapOrdinary(std::sqrt); + } + + Vectorized reciprocal() const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() = c/abs_2() + // im = (bc - ad)/abs_2() = d/abs_2() + vinner_type c_d = _vec ^ vinner_type(isign_mask()); + vinner_type abs = abs_2_(); + return Vectorized{c_d / abs}; + } + + Vectorized rsqrt() const { + return sqrt().reciprocal(); + } + + Vectorized lt(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized le(const Vectorized& other) const { + TORCH_CHECK(false, "not 
supported for complex numbers"); + } + + Vectorized gt(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized ge(const Vectorized& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } +}; + +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() + b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() - b.vec()}; \ + } \ + \ + template <> \ + Vectorized inline operator*( \ + const Vectorized& a, const Vectorized& b) { \ + /* (a + bi) * (c + di) = (ac - bd) + (ad + bc)i */ \ + Vectorized::vinner_type bv = b.vec(); \ + \ + /* this is more z arch friendly than simulating horizontal from x86 */ \ + Vectorized::vinner_type vi = bv.mergeo(); \ + Vectorized::vinner_type vr = bv.mergee(); \ + vi = vi ^ \ + Vectorized::vinner_type( \ + rsign_mask::underline_type>()); \ + Vectorized::vinner_type ret = a.vec() * vr; \ + Vectorized::vinner_type vx_swapped = a.vec().swapped(); \ + ret = fmadd(vx_swapped, vi, ret); \ + \ + return Vectorized{ret}; \ + } \ + \ + template <> \ + Vectorized inline operator/( \ + const Vectorized& a, const Vectorized& b) { \ + /* Unfortunately, this breaks some tests */ \ + /* Implement it like it's done for avx2 */ \ + auto fabs_cd = b.vec().abs(); /* |c| |d| */ \ + auto fabs_dc = fabs_cd.swapped(); /* |d| |c| */ \ + auto scale = Vectorized::vinner_type{1.0} / \ + maximum(fabs_cd, fabs_dc); /* 1/sc 1/sc */ \ + auto a2 = a.vec() * scale; /* a/sc b/sc */ \ + auto b2 = b.vec() * scale; /* c/sc d/sc */ \ + auto acbd2 = a2 * b2; /* ac/sc^2 bd/sc^2 */ \ + \ + auto dc2 = b2.swapped(); /* d/sc c/sc */ \ + dc2 = Vectorized::real_neg(dc2); /* -d/|c,d| c/sc */ \ + auto adbc2 = a2 * dc2; /* -ad/sc^2 bc/sc^2 */ \ + auto sum1 = acbd2 + acbd2.swapped(); /* (ac+bd)/sc^2 
(ac+bd)/sc^2 */ \ + auto sum2 = adbc2 + adbc2.swapped(); /* (bc-ad)/sc^2 (bc-ad)/sc^2 */ \ + auto res2 = Vectorized::vinner_type::mergee( \ + sum1, sum2); /* (ac+bd)/sc^2 (bc-ad)/sc^2 */ \ + \ + /* get the denominator */ \ + Vectorized::vinner_type denom2 = \ + Vectorized{b2}.abs_2_(); /* (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 */ \ + res2 = res2 / denom2; \ + return Vectorized{res2}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() & b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() | b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() ^ b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() == b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() != b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } + +ZVECTOR_OPERATORS(c10::complex) +ZVECTOR_OPERATORS(c10::complex) + +#undef ZVECTOR_OPERATORS + +template = 0> +std::pair, Vectorized> inline inner_interleave2( + const Vectorized& a, + 
const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3} + // b = {b0, b1, b2, b3} + using vtype = typename Vectorized::vtype; + vtype ab00 = {a.vec0()[0], b.vec0()[0]}; + vtype ab11 = {a.vec0()[1], b.vec0()[1]}; + vtype ab2_00 = {a.vec1()[0], b.vec1()[0]}; + vtype ab2_11 = {a.vec1()[1], b.vec1()[1]}; + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + return std::make_pair( + Vectorized{ab00, ab11}, Vectorized{ab2_00, ab2_11}); +} + +template = 0> +std::pair, Vectorized> inline inner_deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + using vtype = typename Vectorized::vtype; + vtype aa01 = {a.vec0()[0], a.vec1()[0]}; + vtype aa23 = {b.vec0()[0], b.vec1()[0]}; + + vtype bb_01 = {a.vec0()[1], a.vec1()[1]}; + vtype bb_23 = {b.vec0()[1], b.vec1()[1]}; + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + return std::make_pair(Vectorized{aa01, aa23}, Vectorized{bb_01, bb_23}); +} + +template = 0> +std::pair, Vectorized> inline inner_interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3,, a4, a5, a6, a7} + // b = {b0, b1, b2, b3,, b4, b5, b6, b7} + using vtype = typename Vectorized::vtype; + vtype ab0011 = vec_mergeh(a.vec0(), b.vec0()); + vtype ab2233 = vec_mergel(a.vec0(), b.vec0()); + + vtype ab2_0011 = vec_mergeh(a.vec1(), b.vec1()); + vtype ab2_2233 = vec_mergel(a.vec1(), b.vec1()); + // group cols crossing lanes: + // return {a0, b0, a1, b1,, a2, b2, a3, b3} + // {a4, b4, a5, b5,, a6, b6, a7, b7} + + return std::make_pair( + Vectorized{ab0011, ab2233}, Vectorized{ab2_0011, ab2_2233}); +} + +template = 0> +std::pair, Vectorized> inline inner_deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1,, a2, b2, a3, b3} + // b = {a4, b4, a5, b5,, a6, b6, a7, b7} + using vtype = typename Vectorized::vtype; + // {a0,a2,b0,b2} {a1,a3,b1,b3} + vtype a0a2b0b2 = vec_mergeh(a.vec0(), 
a.vec1()); + vtype a1a3b1b3 = vec_mergel(a.vec0(), a.vec1()); + + vtype aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3); + vtype bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3); + + vtype a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1()); + vtype a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1()); + + vtype aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2); + vtype bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2); + + // it could be done with vec_perm ,too + // swap lanes: + // return {a0, a1, a2, a3,, a4, a5, a6, a7} + // {b0, b1, b2, b3,, b4, b5, b6, b7} + + return std::make_pair( + Vectorized{aa0123, aa0123_2}, Vectorized{bb0123, bb0123_2}); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + return inner_interleave2(a, b); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + return inner_interleave2(a, b); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + return inner_interleave2(a, b); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + return inner_interleave2(a, b); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + return inner_deinterleave2(a, b); +} + +template <> +std::pair, Vectorized> inline deinterleave2< + int32_t>(const Vectorized& a, const Vectorized& b) { + return inner_deinterleave2(a, b); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + return inner_deinterleave2(a, b); +} + +template <> +std::pair, Vectorized> inline deinterleave2< + int64_t>(const Vectorized& a, const Vectorized& b) { + return inner_deinterleave2(a, b); +} + +template +std::enable_if_t< + std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(const Vectorized& src) { + // Note: this function only convert inputs number of 
elements equal to + // at::vec::Vectorized.size() Only handle first 64 bits + auto vec_int = src.to_vec_float_helper(); + + return zvec_convert_to_float(vec_int); +} + +template +std::enable_if_t< + std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(const Vectorized& src) { + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + auto vec_int = clamp( + zvec_convert_to_int(src), + Vectorized(min_val), + Vectorized(max_val)); + + return vec_int.to_vec_uint8_helper(); +} + +#undef DEFINE_CLAMP_MAXMIN_FUNCS +#undef DEFINE_MAXMIN_FUNCS +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h new file mode 100644 index 0000000000000000000000000000000000000000..c0250e40e3a7ecb2dfdf5ce4da5e2f22289b1a83 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h @@ -0,0 +1,414 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include + +// clang-format off +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// clang-format on + +#include +#include +#include +#include +#include + +namespace at { +namespace vec { + +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) { + stream << val.val_; + return stream; +} +inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) { + stream << static_cast(val.val_); + return stream; +} +inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) { + stream << static_cast(val.val_); + return stream; +} + +template +std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { + T buf[Vectorized::size()]; + vec.store(buf); + stream << "vec["; + for (int i = 0; i != Vectorized::size(); i++) { + if (i != 0) { + stream << ", "; + } + stream << buf[i]; + } + stream << ']'; + return stream; +} + +#if defined(CPU_CAPABILITY_AVX512) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +inline Vectorized cast(const Vectorized& src) { + return _mm512_castpd_ps(src); +} + +template <> +inline Vectorized cast(const Vectorized& src) { + return _mm512_castps_pd(src); +} + +template <> +inline Vectorized cast(const Vectorized& src) { + return _mm512_castsi512_ps(src); +} + +template <> +inline Vectorized cast( + const Vectorized& src) { + return _mm512_castsi512_pd(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. 
+template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex) { + return _mm512_i64gather_pd(vindex, base_addr, scale); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex) { + return _mm512_i32gather_ps(vindex, base_addr, scale); +} +#endif +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex, + Vectorized& mask) { + auto all_ones = _mm512_castsi512_pd(_mm512_set1_epi64(0xFFFFFFFFFFFFFFFF)); + auto mask_ = _mm512_cmp_pd_mask(all_ones, mask.values, _CMP_EQ_OQ); + return _mm512_mask_i64gather_pd(src, mask_, vindex, base_addr, scale); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex, + Vectorized& mask) { + auto all_ones = _mm512_castsi512_ps(_mm512_set1_epi32(0xFFFFFFFF)); + auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); + return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); +} +#endif +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm512_cvtpd_epi64(src); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm512_cvttps_epi32(src); +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + return _mm512_cvtepi64_pd(src); +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + return _mm512_cvtepi32_ps(src); +} + +// 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a3, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + __m512i idx1 = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4); + return std::make_pair( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, + // a15} b = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, + // b14, b15} + // + // return: + // {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} + // {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, + // b15} + __m512i idx1 = + _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + return std::make_pair( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + // output: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + // The members of indices have been written in binary format for better + // understandability + __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = 
_mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); + + return std::make_pair( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} + // b = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, + // a15, b15} + // output: + // return {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, + // a15} + // {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, + // b15} + __m512i idx1 = _mm512_set_epi32( + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + return std::make_pair( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm512_permutexvar_ps(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + return _mm512_permutexvar_pd(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + return _mm512_permutexvar_epi64(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm512_permutexvar_epi32(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = _mm512_set_epi16( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, 
+ 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31); + return _mm512_permutexvar_epi16(mask, v); +} + +inline __m512i flip8(const __m512i& v) { + const __m512i mask1 = _mm512_set_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); + const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + auto reversed_vec = _mm512_shuffle_epi8(v, mask1); + return _mm512_permutexvar_epi64(mask2, reversed_vec); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +inline Vectorized operator&&( + const Vectorized& self, + const Vectorized& other) { + const __m512i* self_ = reinterpret_cast(self.as_bytes()); + const __m512i* other_ = reinterpret_cast(other.as_bytes()); + __m512i out = _mm512_and_si512(*self_, *other_); + Vectorized ret; + // We do not have a constructor that takes __m512i, so we need to memcpy + std::memcpy(ret, &out, ret.size() * sizeof(bool)); + return ret; +} + +#endif // defined(CPU_CAPABILITY_AVX512) + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..44a632b3fb6ef40b766b95446efd36d3e4d72657 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -0,0 +1,1947 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include + +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) + +#ifndef SLEEF_CONST +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define SLEEF_CONST const +#else +#define SLEEF_CONST +#endif +#define SLEEF_CONST_OLD SLEEF_CONST +#else +#define SLEEF_CONST_OLD +#endif + +// bfloat16 conversion +static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { + o = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16)); +} + +static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) { + __m256i lo = _mm512_extracti32x8_epi32(a, 0); + __m256i hi = _mm512_extracti32x8_epi32(a, 1); + cvtbf16_fp32(lo, o1); + cvtbf16_fp32(hi, o2); +} + +static inline __m256i cvtfp32_bf16(const __m512& src) { + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = 
_mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +} + +static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) { + __m512i lo = _mm512_castps_si512(a); + __m512i hi = _mm512_castps_si512(b); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_lo = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + auto mask_hi = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_lo = _mm512_and_si512(_mm512_srli_epi32(lo, 16), ones); + auto t_hi = _mm512_and_si512(_mm512_srli_epi32(hi, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_lo = _mm512_add_epi32(t_lo, vec_bias); + t_hi = _mm512_add_epi32(t_hi, vec_bias); + // input += rounding_bias; + t_lo = _mm512_add_epi32(t_lo, lo); + t_hi = _mm512_add_epi32(t_hi, hi); + // input = input >> 16; + t_lo = _mm512_srli_epi32(t_lo, 16); + t_hi = _mm512_srli_epi32(t_hi, 16); + // Check NaN before converting back to bf16 + t_lo = _mm512_mask_blend_epi32(mask_lo, nan, t_lo); + t_hi = _mm512_mask_blend_epi32(mask_hi, nan, t_hi); + + t_lo = _mm512_packus_epi32( + t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] + __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + return _mm512_permutexvar_epi64(idx, t_lo); +} + +static inline __m512i merge_compare_result(const __m512& a, const __m512& b) { + __m512i lo = _mm512_castps_si512(a); + __m512i hi = _mm512_castps_si512(b); + lo = _mm512_srli_epi32(lo, 16); + hi = _mm512_srli_epi32(hi, 16); + auto out = _mm512_packus_epi32(lo, hi); + __m512i idx = 
_mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + return _mm512_permutexvar_epi64(idx, out); +} + +// float16 conversion +static inline void cvtfp16_fp32(const __m256i& a, __m512& o) { + o = _mm512_cvtph_ps(a); +} + +static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) { + __m256i lo = _mm512_extracti32x8_epi32(a, 0); + __m256i hi = _mm512_extracti32x8_epi32(a, 1); + cvtfp16_fp32(lo, o1); + cvtfp16_fp32(hi, o2); +} + +static inline __m256i cvtfp32_fp16(const __m512& src) { + return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { + __m256i lo = + _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m256i hi = + _mm512_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m512 t_lo = _mm512_castsi512_ps(_mm512_castsi256_si512(lo)); + __m256 t_hi = _mm256_castsi256_ps(hi); + return _mm512_castps_si512(_mm512_insertf32x8(t_lo, t_hi, 1)); +} + +// dtype conversion between float16/bfloat16 and float32 +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m256i& a, __m512& o); +template <> +inline void cvt_to_fp32(const __m256i& a, __m512& o) { + cvtbf16_fp32(a, o); +} +template <> +inline void cvt_to_fp32(const __m256i& a, __m512& o) { + cvtfp16_fp32(a, o); +} + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2); +template <> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) { + cvtbf16_fp32(a, o1, o2); +} +template <> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) { + cvtfp16_fp32(a, o1, o2); +} + +template < + typename T, + bool is_compare_op = false, + typename std::enable_if_t, int> = 0> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b); +template <> +inline __m512i cvt_from_fp32( + const __m512& a, + const __m512& b) { + return cvtfp32_bf16(a, b); 
+} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return merge_compare_result(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return cvtfp32_fp16(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return cvtfp32_fp16(a, b); +} + +template +class Vectorized16 { + static_assert( + is_reduced_floating_point_v, + "Support only float16 and bfloat16."); + + private: + __m512i values; + + public: + using value_type = uint16_t; + using size_type = int; + static constexpr size_type size() { + return 32; + } + Vectorized16() { + values = _mm512_setzero_si512(); + } + Vectorized16(__m512i v) : values(v) {} + Vectorized16(T val) { + value_type uw = val.x; + values = _mm512_set1_epi16(uw); + } + Vectorized16( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32) { + values = _mm512_set_epi16( + val32.x, + val31.x, + val30.x, + val29.x, + val28.x, + val27.x, + val26.x, + val25.x, + val24.x, + val23.x, + val22.x, + val21.x, + val20.x, + val19.x, + val18.x, + val17.x, + val16.x, + val15.x, + val14.x, + val13.x, + val12.x, + val11.x, + val10.x, + val9.x, + val8.x, + val7.x, + val6.x, + val5.x, + val4.x, + val3.x, + val2.x, + val1.x); + } + operator __m512i() const { + return values; + } + T& operator[](int idx) = delete; + const T& operator[](int idx) const = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + return _mm512_cmpeq_epi16_mask(values, _mm512_set1_epi16(0)); + } + static Vectorized loadu(const void* ptr, int16_t count = size()) { + if (count == size()) + return 
_mm512_loadu_si512(reinterpret_cast(ptr)); + + __mmask32 mask = (1ULL << count) - 1; + return _mm512_maskz_loadu_epi16(mask, ptr); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + __mmask32 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi16(ptr, mask, values); + } + } + template + static Vectorized blend(const Vectorized& a, const Vectorized& b) { + return _mm512_mask_blend_epi16(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi16(0xFFFF); + auto mask_ = _mm512_cmp_epi16_mask(mask, all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_epi16(mask_, a.values, b.values); + } + template + static Vectorized arange( + T base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return 
blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + case 16: + return blend<65535>(a, b); + case 17: + return blend<131071>(a, b); + case 18: + return blend<262143>(a, b); + case 19: + return blend<524287>(a, b); + case 20: + return blend<1048575>(a, b); + case 21: + return blend<2097151>(a, b); + case 22: + return blend<4194303>(a, b); + case 23: + return blend<8388607>(a, b); + case 24: + return blend<16777215>(a, b); + case 25: + return blend<33554431>(a, b); + case 26: + return blend<67108863>(a, b); + case 27: + return blend<134217727>(a, b); + case 28: + return blend<268435455>(a, b); + case 29: + return blend<536870911>(a, b); + case 30: + return blend<1073741823>(a, b); + case 31: + return blend<2147483647>(a, b); + } + return b; + } +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wignored-qualifiers" + + Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + const auto o1 = vop(lo); + const auto o2 = vop(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized isnan() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + __mmask16 lo_mask, hi_mask; + __m512 zero = _mm512_set1_ps(0.0); + __m512i zeroi = _mm512_castps_si512(zero); + lo_mask = _mm512_cmp_ps_mask(lo, zero, _CMP_UNORD_Q); + lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF)); + hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q); + hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF)); + return merge_compare_result(lo, hi); + } +#pragma clang diagnostic pop + Vectorized abs() const { + return _mm512_andnot_si512(_mm512_set1_epi16(0x8000), values); + } + Vectorized angle() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto angle_lambda = 
[](__m512 values) { + const auto zero_vec = _mm512_set1_ps(0.f); + const auto nan_vec = _mm512_set1_ps(NAN); + const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ); + const auto non_nan_mask_vec = _mm512_mask_set1_epi32( + _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); + const auto nan_mask = _mm512_cmp_ps_mask( + _mm512_castsi512_ps(non_nan_mask_vec), zero_vec, _CMP_EQ_OQ); + const auto pi = _mm512_set1_ps(c10::pi); + + const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm512_mask_blend_ps(neg_mask, zero_vec, pi); + angle = _mm512_mask_blend_ps(nan_mask, angle, nan_vec); + return angle; + }; + auto o1 = angle_lambda(lo); + auto o2 = angle_lambda(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_epi16(0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return map(Sleef_acosf16_u10); + } + Vectorized acosh() const { + return map(Sleef_acoshf16_u10); + } + Vectorized asin() const { + return map(Sleef_asinf16_u10); + } + Vectorized asinh() const { + return map(Sleef_asinhf16_u10); + } + Vectorized atan() const { + return map(Sleef_atanf16_u10); + } + Vectorized atanh() const { + return map(Sleef_atanhf16_u10); + } + Vectorized atan2(const Vectorized& b) const { + __m512 lo, hi; + __m512 b1, b2; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(b.values, b1, b2); + auto o1 = Sleef_atan2f16_u10(lo, b1); + auto o2 = Sleef_atan2f16_u10(hi, b2); + return cvt_from_fp32(o1, o2); + } + Vectorized copysign(const Vectorized& sign) const { + // copy sign bit (0x8000) from sign and remaining bits from values + __m512i mask_value = _mm512_set1_epi32(~0x80008000); + __m512i mask_signbit = _mm512_set1_epi32(0x80008000); + return Vectorized(_mm512_or_si512( + _mm512_and_si512(values, mask_value), + _mm512_and_si512(sign, mask_signbit))); + } + Vectorized erf() const { + return map(Sleef_erff16_u10); + 
} + Vectorized erfc() const { + return map(Sleef_erfcf16_u15); + } + Vectorized erfinv() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm512_storeu_ps(reinterpret_cast(tmp1), lo); + _mm512_storeu_ps(reinterpret_cast(tmp2), hi); + for (int64_t i = 0; i < size() / 2; i++) { + tmp1[i] = calc_erfinv(tmp1[i]); + tmp2[i] = calc_erfinv(tmp2[i]); + } + auto o1 = _mm512_loadu_ps(tmp1); + auto o2 = _mm512_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized exp() const { + return map(Sleef_expf16_u10); + } + Vectorized exp2() const { + return map(Sleef_exp2f16_u10); + } + Vectorized expm1() const { + return map(Sleef_expm1f16_u10); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const { + __m512 x_lo, x_hi; + cvt_to_fp32(values, x_lo, x_hi); + __m512 q_lo, q_hi; + cvtbf16_fp32(q.values, q_lo, q_hi); + auto o1 = Sleef_fmodf16(x_lo, q_lo); + auto o2 = Sleef_fmodf16(x_hi, q_hi); + return cvt_from_fp32(o1, o2); + } + Vectorized hypot(const Vectorized& b) const { + __m512 lo, hi; + __m512 b1, b2; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(b.values, b1, b2); + auto o1 = Sleef_hypotf16_u05(lo, b1); + auto o2 = Sleef_hypotf16_u05(hi, b2); + return cvt_from_fp32(o1, o2); + } + Vectorized i0() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm512_storeu_ps(reinterpret_cast(tmp1), lo); + _mm512_storeu_ps(reinterpret_cast(tmp2), hi); + for (int64_t i = 0; i < size() / 2; i++) { + tmp1[i] = calc_i0(tmp1[i]); + tmp2[i] = calc_i0(tmp2[i]); + } + auto o1 = _mm512_loadu_ps(tmp1); + auto o2 = _mm512_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized i0e() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + constexpr auto sz = size(); + __at_align__ float tmp1[sz / 2], tmp2[sz / 2]; + _mm512_storeu_ps(reinterpret_cast(tmp1), 
lo); + _mm512_storeu_ps(reinterpret_cast(tmp2), hi); + + for (auto i = decltype(sz){0}; i < sz / 2; i++) { + tmp1[i] = calc_i0e(tmp1[i]); + tmp2[i] = calc_i0e(tmp2[i]); + } + const auto o1 = _mm512_loadu_ps(tmp1); + const auto o2 = _mm512_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized digamma() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + constexpr auto sz = size(); + __at_align__ float tmp1[sz / 2], tmp2[sz / 2]; + _mm512_storeu_ps(reinterpret_cast(tmp1), lo); + _mm512_storeu_ps(reinterpret_cast(tmp2), hi); + + for (auto i = decltype(sz){0}; i < sz / 2; i++) { + tmp1[i] = calc_digamma(tmp1[i]); + tmp2[i] = calc_digamma(tmp2[i]); + } + const auto o1 = _mm512_loadu_ps(tmp1); + const auto o2 = _mm512_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized igamma(const Vectorized& x) const { + __m512 lo, hi; + __m512 xlo, xhi; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(x.values, xlo, xhi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm512_storeu_ps(reinterpret_cast(tmp1), lo); + _mm512_storeu_ps(reinterpret_cast(tmp2), hi); + __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2]; + _mm512_storeu_ps(reinterpret_cast(tmpx1), xlo); + _mm512_storeu_ps(reinterpret_cast(tmpx2), xhi); + for (int64_t i = 0; i < size() / 2; ++i) { + tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]); + tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]); + } + auto o1 = _mm512_loadu_ps(tmp1); + auto o2 = _mm512_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + + Vectorized igammac(const Vectorized& x) const { + __m512 lo, hi; + __m512 xlo, xhi; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(x.values, xlo, xhi); + __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; + _mm512_storeu_ps(reinterpret_cast(tmp1), lo); + _mm512_storeu_ps(reinterpret_cast(tmp2), hi); + __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2]; + _mm512_storeu_ps(reinterpret_cast(tmpx1), xlo); + _mm512_storeu_ps(reinterpret_cast(tmpx2), xhi); + for (int64_t i = 0; i < size() / 
2; ++i) { + tmp1[i] = calc_igammac(tmp1[i], tmpx1[i]); + tmp2[i] = calc_igammac(tmp2[i], tmpx2[i]); + } + auto o1 = _mm512_loadu_ps(tmp1); + auto o2 = _mm512_loadu_ps(tmp2); + return cvt_from_fp32(o1, o2); + } + Vectorized log() const { + return map(Sleef_logf16_u10); + } + Vectorized log2() const { + return map(Sleef_log2f16_u10); + } + Vectorized log10() const { + return map(Sleef_log10f16_u10); + } + Vectorized log1p() const { + return map(Sleef_log1pf16_u10); + } + Vectorized sin() const { + return map(Sleef_sinf16_u10); + } + Vectorized sinh() const { + return map(Sleef_sinhf16_u10); + } + Vectorized cos() const { + return map(Sleef_cosf16_u10); + } + Vectorized cosh() const { + return map(Sleef_coshf16_u10); + } + Vectorized ceil() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm512_ceil_ps(lo); + auto o2 = _mm512_ceil_ps(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized floor() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm512_floor_ps(lo); + auto o2 = _mm512_floor_ps(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized neg() const { + return _mm512_xor_si512(values, _mm512_set1_epi16(0x8000)); + } + Vectorized round() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = _mm512_roundscale_ps( + lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + auto o2 = _mm512_roundscale_ps( + hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return cvt_from_fp32(o1, o2); + } + Vectorized tan() const { + return map(Sleef_tanf16_u10); + } + Vectorized tanh() const { + return map(Sleef_tanhf16_u10); + } + Vectorized trunc() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto o1 = + _mm512_roundscale_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + auto o2 = + _mm512_roundscale_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + return cvt_from_fp32(o1, o2); + } + Vectorized lgamma() const { + return map(Sleef_lgammaf16_u10); + } + Vectorized sqrt() const { + __m512 lo, hi; + 
cvt_to_fp32(values, lo, hi); + auto o1 = _mm512_sqrt_ps(lo); + auto o2 = _mm512_sqrt_ps(hi); + return cvt_from_fp32(o1, o2); + } + Vectorized reciprocal() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto ones = _mm512_set1_ps(1); + auto o1 = _mm512_div_ps(ones, lo); + auto o2 = _mm512_div_ps(ones, hi); + return cvt_from_fp32(o1, o2); + } + Vectorized rsqrt() const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + auto ones = _mm512_set1_ps(1); + auto o1 = _mm512_div_ps(ones, _mm512_sqrt_ps(lo)); + auto o2 = _mm512_div_ps(ones, _mm512_sqrt_ps(hi)); + return cvt_from_fp32(o1, o2); + } + Vectorized pow(const Vectorized& b) const { + __m512 lo, hi; + __m512 b1, b2; + cvt_to_fp32(values, lo, hi); + cvt_to_fp32(b.values, b1, b2); + auto o1 = Sleef_powf16_u10(lo, b1); + auto o2 = Sleef_powf16_u10(hi, b2); + return cvt_from_fp32(o1, o2); + } + + private: + template + Vectorized inline binary_compare(const VectorizedType& b, Op op) const { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + cvt_to_fp32(values, a_lo, a_hi); + cvt_to_fp32(b.values, b_lo, b_hi); + auto o1 = op(a_lo, b_lo); + auto o2 = op(a_hi, b_hi); + return cvt_from_fp32(o1, o2); + } + + public: + Vectorized inline operator>(const Vectorized& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + Vectorized inline operator<(const Vectorized& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + Vectorized inline operator>=(const Vectorized& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + 
return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + Vectorized inline operator<=(const Vectorized& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + Vectorized inline operator==(const Vectorized16& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + Vectorized inline operator!=(const Vectorized16& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } +}; + +template +static inline Vectorized binary_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + cvt_to_fp32(__m512i(a), a_lo, a_hi); + cvt_to_fp32(__m512i(b), b_lo, b_hi); + auto o1 = op(a_lo, b_lo); + auto o2 = op(a_hi, b_hi); + return cvt_from_fp32(o1, o2); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: + using Vectorized16::Vectorized16; + + using value_type = BFloat16; + + Vectorized frac() const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const 
__m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_si512(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + cvtbf16_fp32(__m512i(a), a_lo, a_hi); + cvtbf16_fp32(__m512i(b), b_lo, b_hi); + auto max_lo = _mm512_max_ps(a_lo, b_lo); + auto max_hi = _mm512_max_ps(a_hi, b_hi); + auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q); + auto nan_lo = _mm512_castsi512_ps(_mm512_set1_epi32(nan_lo_mask)); + auto nan_hi = _mm512_castsi512_ps(_mm512_set1_epi32(nan_hi_mask)); + // Exploit the fact that all-ones is a NaN. + auto o1 = _mm512_or_ps(max_lo, nan_lo); + auto o2 = _mm512_or_ps(max_hi, nan_hi); + return cvtfp32_bf16(o1, o2); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + __m512i zero_vec = _mm512_set1_epi32(0); + cvtbf16_fp32(__m512i(a), a_lo, a_hi); + cvtbf16_fp32(__m512i(b), b_lo, b_hi); + auto min_lo = _mm512_min_ps(a_lo, b_lo); + auto min_hi = _mm512_min_ps(a_hi, b_hi); + auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q); + auto nan_lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF)); + auto nan_hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF)); + // Exploit the fact that all-ones is a NaN. 
+ auto o1 = _mm512_or_ps(min_lo, nan_lo); + auto o2 = _mm512_or_ps(min_hi, nan_hi); + return cvtfp32_bf16(o1, o2); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + __m512 a_lo, a_hi; + __m512 min_lo, min_hi; + __m512 max_lo, max_hi; + cvtbf16_fp32(__m512i(a), a_lo, a_hi); + cvtbf16_fp32(__m512i(min), min_lo, min_hi); + cvtbf16_fp32(__m512i(max), max_lo, max_hi); + auto o1 = _mm512_min_ps(max_lo, _mm512_max_ps(min_lo, a_lo)); + auto o2 = _mm512_min_ps(max_hi, _mm512_max_ps(min_hi, a_hi)); + return cvtfp32_bf16(o1, o2); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + __m512 a_lo, a_hi; + __m512 max_lo, max_hi; + cvtbf16_fp32(__m512i(a), a_lo, a_hi); + cvtbf16_fp32(__m512i(max), max_lo, max_hi); + auto o1 = _mm512_min_ps(max_lo, a_lo); + auto o2 = _mm512_min_ps(max_hi, a_hi); + return cvtfp32_bf16(o1, o2); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + __m512 a_lo, a_hi; + __m512 min_lo, min_hi; + cvtbf16_fp32(__m512i(a), a_lo, a_hi); + cvtbf16_fp32(__m512i(min), min_lo, min_hi); + auto o1 = _mm512_max_ps(min_lo, a_lo); + auto o2 = _mm512_max_ps(min_hi, a_hi); + return cvtfp32_bf16(o1, o2); +} + +template <> +inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); + _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +inline void convert(const float* src, BFloat16* dst, int64_t n) { + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m512 a = _mm512_loadu_ps(&src[i]); + __m512 b = _mm512_loadu_ps(&src[i 
+ 16]); + + __m512i bf = cvtfp32_bf16(a, b); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +inline void convert(const double* src, BFloat16* dst, int64_t n) { + auto load_float = [](const double* src) -> __m512 { + // Load one float vector from an array of doubles + __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); + __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); + return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1); + }; + + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m512 a = load_float(&src[i]); + __m512 b = load_float(&src[i + 16]); + + __m512i bf = cvtfp32_bf16(a, b); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + __m512 c_lo, c_hi; + cvtbf16_fp32(__m512i(a), a_lo, a_hi); + cvtbf16_fp32(__m512i(b), b_lo, b_hi); + cvtbf16_fp32(__m512i(c), c_lo, c_hi); + auto o1 = _mm512_fmadd_ps(a_lo, b_lo, c_lo); + auto o2 = _mm512_fmadd_ps(a_hi, b_hi, c_hi); + return cvtfp32_bf16(o1, o2); +} + +static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { + __m512i r[8]; + // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 + // e10e11 e12e13 e14e15 b0-b15 f0-f15 c0-c15 g0-g15 d0-d15 h0-h15 i0-i15 + // m0-m15 j0-j15 n0-n15 k0-k15 o0-o15 l0-l15 p0-p15 +#ifndef __msvc_cl__ +#pragma unroll(4) +#endif + for (int i = 0; i < 4; i++) { + r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01); + r[i + 4] = + _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); + } + + // u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11 e0e1 f0f1 e2e3 f2f3 e8e9 + // f8f9 e10e11 f10f11 u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15 e4e5 + // 
f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15 u2: c0c1 d0d1 c2c3 d2d3 c8c9 + // d8d9 c10c11 d10d11 g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11 u3: c4c5 + // d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 + // h12h13 g14g15 h14h15 i j m n k l o p +#ifndef __msvc_cl__ +#pragma unroll(4) +#endif + for (int i = 0; i < 8; i += 2) { + u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]); + u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]); + } + + // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 + // g8g9 h8h9 r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 + // g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 + // c12c13 d12d13 r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 r4: i j k + // l m n o p + r[0] = _mm512_unpacklo_epi64(u[0], u[2]); + r[1] = _mm512_unpackhi_epi64(u[0], u[2]); + r[2] = _mm512_unpacklo_epi64(u[1], u[3]); + r[3] = _mm512_unpackhi_epi64(u[1], u[3]); + r[4] = _mm512_unpacklo_epi64(u[4], u[6]); + r[5] = _mm512_unpackhi_epi64(u[4], u[6]); + r[6] = _mm512_unpacklo_epi64(u[5], u[7]); + r[7] = _mm512_unpackhi_epi64(u[5], u[7]); + + __m512i const1 = _mm512_set_epi32( + 0x00370035, + 0x00330031, + 0x00270025, + 0x00230021, + 0x00170015, + 0x00130011, + 0x00070005, + 0x00030001, + 0x00360034, + 0x00320030, + 0x00260024, + 0x00220020, + 0x00160014, + 0x00120010, + 0x00060004, + 0x00020000); + __m512i const2 = _mm512_set_epi32( + 0x003f003d, + 0x003b0039, + 0x002f002d, + 0x002b0029, + 0x001f001d, + 0x001b0019, + 0x000f000d, + 0x000b0009, + 0x003e003c, + 0x003a0038, + 0x002e002c, + 0x002a0028, + 0x001e001c, + 0x001a0018, + 0x000e000c, + 0x000a0008); + // merge values from two regs + // 0-- 1-- + // 8-- 9-- + // 2-- 3-- + // 10-- 11-- + // 4-- 5-- + // 12-- 13-- + // 6-- 7-- + // 14-- 15-- +#ifndef __msvc_cl__ +#pragma unroll(4) +#endif + for (int i = 0; i < 4; i++) { + u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]); + u[i + 4] = _mm512_permutex2var_epi16(r[i], 
const2, r[i + 4]); + } +} + +// TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16 +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +template <> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + __m256i t[16]; + // load from src to registers + // a: a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 + // b: b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 + // c: c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // d: d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + // e: e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 e10 e11 e12 e13 e14 e15 + // f: f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 + // g: g0 g1 g2 g3 g4 g5 g6 g7 g8 g9 g10 g11 g12 g13 g14 g15 + // h: h0 h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12 h13 h14 h15 + // i: i0 i1 i2 i3 i4 i5 i6 i7 i8 i9 i10 i11 i12 i13 i14 i15 + // j: j0 j1 j2 j3 j4 j5 j6 j7 j8 j9 j10 j11 j12 j13 j14 j15 + // k: k0 k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11 k12 k13 k14 k15 + // l: l0 l1 l2 l3 l4 l5 l6 l7 l8 l9 l10 l11 l12 l13 l14 l15 + // m: m0 m1 m2 m3 m4 m5 m6 m7 m8 m9 m10 m11 m12 m13 m14 m15 + // n: n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15 + // o: o0 o1 o2 o3 o4 o5 o6 o7 o8 o9 o10 o11 o12 o13 o14 o15 + // p: p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 +#ifndef __msvc_cl__ +#pragma unroll(16) +#endif + for (int i = 0; i < 16; i++) { + t[i] = + _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); + } + + __m512i u[8]; + _transpose_mxn_half_16_16(t, u); + +#ifndef __msvc_cl__ +#pragma unroll(8) +#endif + for (int i = 0; i < 8; i++) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x01)); + } +} + +// Code referred to FBGEMM: +// 
https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +template <> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { + __m256i t[16]; + // load from src to registers + // Same matrix indices as above transpose_mxn +#ifndef __msvc_cl__ +#pragma unroll(16) +#endif + for (int i = 0; i < 16; i++) { + t[i] = + _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); + } + + __m512i u[8]; + _transpose_mxn_half_16_16(t, u); + +#ifndef __msvc_cl__ +#pragma unroll(8) +#endif + for (int i = 0; i < 8; i++) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x01)); + } +} + +static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { + // t[0]: 0 32 1 33 2 34 3 35 8 40 9 41 10 42 11 43 16 ... 59 + // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63 + // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123 + // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127 + // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 + // ... 187 t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 + // 175 148 ... 191 t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 + // 234 203 235 208 ... 251 t[7]: 196 228 197 229 198 230 199 231 204 236 205 + // 237 206 238 207 239 212 ... 255 t[8]: 256 288 257 289 258 290 259 291 264 + // 296 265 297 266 298 267 299 272 ... 315 t[9]: 260 292 261 293 262 294 263 + // 295 268 300 269 301 270 302 271 303 276 ... 319 t[10]: 320 352 321 353 322 + // 354 323 355 328 360 329 361 330 362 331 363 336 ... 379 t[11]: 324 356 325 + // 357 326 358 327 359 332 364 333 365 334 366 335 367 340 ... 383 t[12]: 384 + // 416 385 417 386 418 387 419 392 424 393 425 394 426 395 427 400 ... 
443 + // t[13]: 388 420 389 421 390 422 391 423 396 428 397 429 398 430 399 431 404 + // ... 447 t[14]: 448 480 449 481 450 482 451 483 456 488 457 489 458 490 459 + // 491 464 ... 507 t[15]: 452 484 453 485 454 486 455 487 460 492 461 493 462 + // 494 463 495 468 ... 511 t[16]: 512 544 513 545 514 546 515 547 520 552 521 + // 553 522 554 523 555 528 ... 571 + // ... + // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 + // 980 ... 1023 +#ifndef __msvc_cl__ +#pragma unroll(16) +#endif + for (int i = 0; i < 16; ++i) { + d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]); + d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]); + } + + // t[0]: 0 32 64 96 1 33 65 97 8 40 72 104 9 41 73 105 16 ... 121 + // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123 + // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125 + // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127 + // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 + // ... 249 t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 + // 235 146 ... 251 t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 + // 173 205 237 148 ... 253 t[7]: 134 166 198 230 135 167 199 231 142 174 206 + // 238 143 175 207 239 150 ... 255 t[8]: 256 288 320 352 257 289 321 353 264 + // 296 328 360 265 297 329 361 272 ... 377 t[9]: 258 290 322 354 259 291 323 + // 355 266 298 330 362 267 299 331 363 274 ... 379 t[10]: 260 292 324 356 261 + // 293 325 357 268 300 332 364 269 301 333 365 276 ... 381 t[11]: 262 294 326 + // 358 263 295 327 359 270 302 334 366 271 303 335 367 278 ... 383 t[12]: 384 + // 416 448 480 385 417 449 481 392 424 456 488 393 425 457 489 400 ... 505 + // t[13]: 386 418 450 482 387 419 451 483 394 426 458 490 395 427 459 491 402 + // ... 507 t[14]: 388 420 452 484 389 421 453 485 396 428 460 492 397 429 461 + // 493 404 ... 
509 t[15]: 390 422 454 486 391 423 455 487 398 430 462 494 399 + // 431 463 495 406 ... 511 t[16]: 512 544 576 608 513 545 577 609 520 552 584 + // 616 521 553 585 617 528 ... 633 + // ... + // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 + // 918 ... 1023 +#ifndef __msvc_cl__ +#pragma unroll(8) +#endif + for (int i = 0; i < 8; ++i) { + r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]); + r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]); + r[i * 4 + 2] = _mm512_unpacklo_epi32(d[i * 4 + 1], d[i * 4 + 3]); + r[i * 4 + 3] = _mm512_unpackhi_epi32(d[i * 4 + 1], d[i * 4 + 3]); + } + + // t[0]: 0 32 64 96 128 160 192 224 8 40 72 104 136 168 200 232 16 ... 248 + // t[1]: 1 33 65 97 129 161 193 225 9 41 73 105 137 169 201 233 17 ... 249 + // t[2]: 2 34 66 98 130 162 194 226 10 42 74 106 138 170 202 234 18 ... 250 + // t[3]: 3 35 67 99 131 163 195 227 11 43 75 107 139 171 203 235 19 ... 251 + // t[4]: 4 36 68 100 132 164 196 228 12 44 76 108 140 172 204 236 20 ... 252 + // t[5]: 5 37 69 101 133 165 197 229 13 45 77 109 141 173 205 237 21 ... 253 + // t[6]: 6 38 70 102 134 166 198 230 14 46 78 110 142 174 206 238 22 ... 254 + // t[7]: 7 39 71 103 135 167 199 231 15 47 79 111 143 175 207 239 23 ... 255 + // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 + // ... 504 t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 + // 489 273 ... 505 t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 + // 426 458 490 274 ... 506 t[11]: 259 291 323 355 387 419 451 483 267 299 331 + // 363 395 427 459 491 275 ... 507 t[12]: 260 292 324 356 388 420 452 484 268 + // 300 332 364 396 428 460 492 276 ... 508 t[13]: 261 293 325 357 389 421 453 + // 485 269 301 333 365 397 429 461 493 277 ... 509 t[14]: 262 294 326 358 390 + // 422 454 486 270 302 334 366 398 430 462 494 278 ... 510 t[15]: 263 295 327 + // 359 391 423 455 487 271 303 335 367 399 431 463 495 279 ... 
511 t[16]: 512 + // 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 + // ... + // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 + // ... 1023 +#ifndef __msvc_cl__ +#pragma unroll(4) +#endif + for (int i = 0; i < 4; ++i) { + d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]); + d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]); + d[i * 8 + 2] = _mm512_unpacklo_epi64(r[i * 8 + 1], r[i * 8 + 5]); + d[i * 8 + 3] = _mm512_unpackhi_epi64(r[i * 8 + 1], r[i * 8 + 5]); + d[i * 8 + 4] = _mm512_unpacklo_epi64(r[i * 8 + 2], r[i * 8 + 6]); + d[i * 8 + 5] = _mm512_unpackhi_epi64(r[i * 8 + 2], r[i * 8 + 6]); + d[i * 8 + 6] = _mm512_unpacklo_epi64(r[i * 8 + 3], r[i * 8 + 7]); + d[i * 8 + 7] = _mm512_unpackhi_epi64(r[i * 8 + 3], r[i * 8 + 7]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 16 ... 496 + // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497 + // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498 + // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499 + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... + // 500 t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 + // ... 501 t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 + // 22 ... 502 t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 + // 487 23 ... 503 t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 + // 456 488 24 ... 504 t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 + // 425 457 489 25 ... 505 t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 + // 394 426 458 490 26 ... 506 t[11]: 11 43 75 107 139 171 203 235 267 299 331 + // 363 395 427 459 491 27 ... 507 t[12]: 12 44 76 108 140 172 204 236 268 300 + // 332 364 396 428 460 492 28 ... 508 t[13]: 13 45 77 109 141 173 205 237 269 + // 301 333 365 397 429 461 493 29 ... 
509 t[14]: 14 46 78 110 142 174 206 238 + // 270 302 334 366 398 430 462 494 30 ... 510 t[15]: 15 47 79 111 143 175 207 + // 239 271 303 335 367 399 431 463 495 31 ... 511 t[16]: 512 544 576 608 640 + // 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 + // ... + // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 + // ... 1023 + __m512i const1 = _mm512_set_epi64( + 0x000000000000000d, + 0x000000000000000c, + 0x0000000000000005, + 0x0000000000000004, + 0x0000000000000009, + 0x0000000000000008, + 0x0000000000000001, + 0x0000000000000000); + __m512i const2 = _mm512_set_epi64( + 0x000000000000000f, + 0x000000000000000e, + 0x0000000000000007, + 0x0000000000000006, + 0x000000000000000b, + 0x000000000000000a, + 0x0000000000000003, + 0x0000000000000002); +#ifndef __msvc_cl__ +#pragma unroll(8) +#endif + for (int i = 0; i < 8; ++i) { + r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/ const1, d[i + 8]); + r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/ const2, d[i + 8]); + r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const1, d[i + 24]); + r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const2, d[i + 24]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544 + // ... 992 t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 + // 513 545 ... 993 t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 + // 450 482 514 546 ... 994 t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 + // 387 419 451 483 515 547 ... 995 t[4]: 4 36 68 100 132 164 196 228 260 292 + // 324 356 388 420 452 484 516 548 ... 996 t[5]: 5 37 69 101 133 165 197 229 + // 261 293 325 357 389 421 453 485 517 549 ... 997 t[6]: 6 38 70 102 134 166 + // 198 230 262 294 326 358 390 422 454 486 518 550 ... 998 t[7]: 7 39 71 103 + // 135 167 199 231 263 295 327 359 391 423 455 487 519 551 ... 999 t[8]: 8 40 + // 72 104 136 168 200 232 264 296 328 360 392 424 456 488 520 552 ... 
1000 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 521 553 + // ... 1001 t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 + // 490 522 554 ... 1002 t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 + // 395 427 459 491 523 555 ... 1003 t[12]: 12 44 76 108 140 172 204 236 268 + // 300 332 364 396 428 460 492 524 556 ... 1004 t[13]: 13 45 77 109 141 173 + // 205 237 269 301 333 365 397 429 461 493 525 557 ... 1005 t[14]: 14 46 78 + // 110 142 174 206 238 270 302 334 366 398 430 462 494 526 558 ... 1006 t[15]: + // 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 527 559 ... + // 1007 t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 + // 528 560 ... 1008 + // ... + // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 + // ... 1023 + __m512i const3 = _mm512_set_epi64( + 0x000000000000000b, + 0x000000000000000a, + 0x0000000000000009, + 0x0000000000000008, + 0x0000000000000003, + 0x0000000000000002, + 0x0000000000000001, + 0x0000000000000000); + __m512i const4 = _mm512_set_epi64( + 0x000000000000000f, + 0x000000000000000e, + 0x000000000000000d, + 0x000000000000000c, + 0x0000000000000007, + 0x0000000000000006, + 0x0000000000000005, + 0x0000000000000004); +#ifndef __msvc_cl__ +#pragma unroll(16) +#endif + for (int i = 0; i < 16; ++i) { + d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/ const3, r[i + 16]); + d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/ const4, r[i + 16]); + } +} + +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6 +template <> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst, + int M, + int N) { + // load from src + TORCH_CHECK( + M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); + __m512i r[32]; + int i; + if (N == 32) { + for (i = 0; i < M; ++i) { + r[i] = _mm512_loadu_si512(&src[i * ld_src]); + } + 
} else { + __mmask32 src_mask = (1 << N) - 1; + for (i = 0; i < M; ++i) { + r[i] = _mm512_maskz_loadu_epi16(src_mask, &src[i * ld_src]); + } + } + for (; i < 32; ++i) { + r[i] = _mm512_setzero_si512(); + } + + __m512i d[32]; + _transpose_mxn_half_32_32(r, d); + + // store to dst + if (M == 32) { + for (i = 0; i < N; ++i) { + _mm512_storeu_si512(&dst[i * ld_dst], d[i]); + } + } else { + __mmask32 dst_mask = (1 << M) - 1; + for (i = 0; i < N; ++i) { + _mm512_mask_storeu_epi16(&dst[i * ld_dst], dst_mask, d[i]); + } + } +} + +template < + typename T, + int M, + int N, + typename std::enable_if_t< + std::is_same_v && + ((M <= 32 && M != 16) || (N <= 32 && N != 16)), + int> = 0> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +template <> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst, + int M, + int N) { + TORCH_CHECK(M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); + // load from src + __m512i r[32]; + int i; + if (N == 32) { + for (i = 0; i < M; ++i) { + r[i] = _mm512_loadu_si512(&src[i * ld_src]); + } + } else { + __mmask32 src_mask = (1 << N) - 1; + for (i = 0; i < M; ++i) { + r[i] = _mm512_maskz_loadu_epi16(src_mask, &src[i * ld_src]); + } + } + for (; i < 32; ++i) { + r[i] = _mm512_setzero_si512(); + } + + __m512i d[32]; + _transpose_mxn_half_32_32(r, d); + + // store to dst + if (M == 32) { + for (i = 0; i < N; ++i) { + _mm512_storeu_si512(&dst[i * ld_dst], d[i]); + } + } else { + __mmask32 dst_mask = (1 << M) - 1; + for (i = 0; i < N; ++i) { + _mm512_mask_storeu_epi16(&dst[i * ld_dst], dst_mask, d[i]); + } + } +} + +template < + typename T, + int M, + int N, + typename std::enable_if_t< + std::is_same_v && + ((M <= 32 && M != 16) || (N <= 32 && N != 16)), + int> = 0> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, 
ld_dst, M, N); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: + using Vectorized16::Vectorized16; + + using value_type = Half; + + Vectorized frac() const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} + +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_si512(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & 
Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + cvtfp16_fp32(__m512i(a), a_lo, a_hi); + cvtfp16_fp32(__m512i(b), b_lo, b_hi); + auto max_lo = _mm512_max_ps(a_lo, b_lo); + auto max_hi = _mm512_max_ps(a_hi, b_hi); + auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q); + auto nan_lo = _mm512_castsi512_ps(_mm512_set1_epi32(nan_lo_mask)); + auto nan_hi = _mm512_castsi512_ps(_mm512_set1_epi32(nan_hi_mask)); + // Exploit the fact that all-ones is a NaN. + auto o1 = _mm512_or_ps(max_lo, nan_lo); + auto o2 = _mm512_or_ps(max_hi, nan_hi); + return cvtfp32_fp16(o1, o2); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + __m512i zero_vec = _mm512_set1_epi32(0); + cvtfp16_fp32(__m512i(a), a_lo, a_hi); + cvtfp16_fp32(__m512i(b), b_lo, b_hi); + auto min_lo = _mm512_min_ps(a_lo, b_lo); + auto min_hi = _mm512_min_ps(a_hi, b_hi); + auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q); + auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q); + auto nan_lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF)); + auto nan_hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF)); + // Exploit the fact that all-ones is a NaN. + auto o1 = _mm512_or_ps(min_lo, nan_lo); + auto o2 = _mm512_or_ps(min_hi, nan_hi); + return cvtfp32_fp16(o1, o2); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + __m512 a_lo, a_hi; + __m512 min_lo, min_hi; + __m512 max_lo, max_hi; + cvtfp16_fp32(__m512i(a), a_lo, a_hi); + cvtfp16_fp32(__m512i(min), min_lo, min_hi); + cvtfp16_fp32(__m512i(max), max_lo, max_hi); + auto o1 = _mm512_min_ps(max_lo, _mm512_max_ps(min_lo, a_lo)); + auto o2 = _mm512_min_ps(max_hi, _mm512_max_ps(min_hi, a_hi)); + return cvtfp32_fp16(o1, o2); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + __m512 a_lo, a_hi; + __m512 max_lo, max_hi; + cvtfp16_fp32(__m512i(a), a_lo, a_hi); + cvtfp16_fp32(__m512i(max), max_lo, max_hi); + auto o1 = _mm512_min_ps(max_lo, a_lo); + auto o2 = _mm512_min_ps(max_hi, a_hi); + return cvtfp32_fp16(o1, o2); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + __m512 a_lo, a_hi; + __m512 min_lo, min_hi; + cvtfp16_fp32(__m512i(a), a_lo, a_hi); + cvtfp16_fp32(__m512i(min), min_lo, min_hi); + auto o1 = _mm512_max_ps(min_lo, a_lo); + auto o2 = _mm512_max_ps(min_hi, a_hi); + return cvtfp32_fp16(o1, o2); 
+} + +template <> +inline void convert(const Half* src, Half* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); + _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +inline void convert(const float* src, Half* dst, int64_t n) { + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m512 a = _mm512_loadu_ps(&src[i]); + __m512 b = _mm512_loadu_ps(&src[i + 16]); + + __m512i bf = cvtfp32_fp16(a, b); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +inline void convert(const double* src, Half* dst, int64_t n) { + auto load_float = [](const double* src) -> __m512 { + // Load one float vector from an array of doubles + __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); + __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); + return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1); + }; + + int64_t i; + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { + __m512 a = load_float(&src[i]); + __m512 b = load_float(&src[i + 16]); + + __m512i bf = cvtfp32_fp16(a, b); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf); + } + for (; i < n; i++) { + dst[i] = c10::convert(src[i]); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + __m512 a_lo, a_hi; + __m512 b_lo, b_hi; + __m512 c_lo, c_hi; + cvtfp16_fp32(__m512i(a), a_lo, a_hi); + cvtfp16_fp32(__m512i(b), b_lo, b_hi); + cvtfp16_fp32(__m512i(c), c_lo, c_hi); + auto o1 = _mm512_fmadd_ps(a_lo, b_lo, c_lo); + auto o2 = _mm512_fmadd_ps(a_hi, b_hi, c_hi); + return cvtfp32_fp16(o1, o2); +} + 
+#define CONVERT_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + __m512 o1, o2; \ + cvt_to_fp32(__m512i(a), o1, o2); \ + return std::make_tuple(o1, o2); \ + } \ + \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + return cvt_from_fp32(__m512(a), __m512(b)); \ + } +CONVERT_VECTORIZED_INIT(BFloat16, bfloat16) +CONVERT_VECTORIZED_INIT(Half, half) + +#else // defined(CPU_CAPABILITY_AVX512) + +#define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr2); \ + for (const auto k : c10::irange(K)) { \ + arr[k] = c10::convert(arr2[k]); \ + } \ + return std::make_tuple( \ + Vectorized::loadu(arr), \ + Vectorized::loadu(arr + Vectorized::size())); \ + } \ + \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr); \ + b.store(arr + Vectorized::size()); \ + for (const auto k : c10::irange(K)) { \ + arr2[k] = c10::convert(arr[k]); \ + } \ + return Vectorized::loadu(arr2); \ + } +CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) +CONVERT_NON_VECTORIZED_INIT(Half, half) + +#endif // defined(CPU_CAPABILITY_AVX512) + +#if defined(CPU_CAPABILITY_AVX512) +#define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ + __m512 out_values; \ + cvt_to_fp32(values, out_values); \ + out = out_values; \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + auto vec = Vectorized::loadu(data); \ + __m512 out1_values, out2_values; \ + cvt_to_fp32(vec, 
out1_values, out2_values); \ + out1 = out1_values; \ + out2 = out2_values; \ + } +LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) +LOAD_FP32_VECTORIZED_INIT(Half, fp16) + +#else // defined(CPU_CAPABILITY_AVX512) +#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ + for (const auto k : c10::irange(Vectorized::size())) { \ + values[k] = data[k]; \ + } \ + out = Vectorized::loadu(values); \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + load_fp32_from_##name(data, out1); \ + data += Vectorized::size(); \ + load_fp32_from_##name(data, out2); \ + } +LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) +LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) + +#endif +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h new file mode 100644 index 0000000000000000000000000000000000000000..0779363c788634d77d10dd700b7c203cae2c206d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -0,0 +1,661 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) + +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m512d values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: + using value_type = c10::complex; + using size_type = int; + static constexpr size_type size() { + return 4; + } + Vectorized() { + values = _mm512_setzero_pd(); + } + Vectorized(__m512d v) : values(v) {} + Vectorized(c10::complex val) { + double real_value = val.real(); + double imag_value = val.imag(); + values = _mm512_setr_pd( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4) { + values = _mm512_setr_pd( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag()); + } + operator __m512d() const { + return values; + } + template + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy + // NOLINTNEXTLINE(clang-diagnostic-warning) + switch (mask) { + case 0: + return a; + case 1: + return _mm512_mask_blend_pd( + 0x03, a.values, b.values); // b0000 0001 = b0000 0011 + case 2: + return _mm512_mask_blend_pd( + 0x0C, a.values, b.values); // b0000 0010 = b0000 1100 + case 3: + return _mm512_mask_blend_pd( + 0x0F, a.values, b.values); // b0000 0011 = b0000 1111 + case 4: + return _mm512_mask_blend_pd( + 0x30, a.values, b.values); // b0000 0100 = b0011 0000 + case 5: + return _mm512_mask_blend_pd( + 0x33, a.values, b.values); // b0000 0101 = b0011 
0011 + case 6: + return _mm512_mask_blend_pd( + 0x3C, a.values, b.values); // b0000 0110 = b0011 1100 + case 7: + return _mm512_mask_blend_pd( + 0x3F, a.values, b.values); // b0000 0111 = b0011 1111 + case 8: + return _mm512_mask_blend_pd( + 0xC0, a.values, b.values); // b0000 1000 = b1100 0000 + case 9: + return _mm512_mask_blend_pd( + 0xC3, a.values, b.values); // b0000 1001 = b1100 0011 + case 10: + return _mm512_mask_blend_pd( + 0xCC, a.values, b.values); // b0000 1010 = b1100 1100 + case 11: + return _mm512_mask_blend_pd( + 0xCF, a.values, b.values); // b0000 1011 = b1100 1111 + case 12: + return _mm512_mask_blend_pd( + 0xF0, a.values, b.values); // b0000 1100 = b1111 0000 + case 13: + return _mm512_mask_blend_pd( + 0xF3, a.values, b.values); // b0000 1101 = b1111 0011 + case 14: + return _mm512_mask_blend_pd( + 0xFC, a.values, b.values); // b0000 1110 = b1111 1100 + case 15: + return _mm512_mask_blend_pd( + 0xFF, a.values, b.values); // b0000 1111 = b1111 1111 + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm512_unpacklo_pd(mask.values, mask.values); + auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mmask = _mm512_cmp_epi64_mask( + _mm512_castpd_si512(mask_), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mmask, a.values, b.values); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + c10::complex(1) * step, + base + c10::complex(2) * step, + base + c10::complex(3) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vectorized> loadu( + const void* ptr, + 
int64_t count = size()) { + if (count == size()) + return _mm512_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(2 * size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(c10::complex)); + return _mm512_load_pd(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_pd(reinterpret_cast(ptr), values); + } else if (count > 0) { + double tmp_values[2 * size()]; + _mm512_storeu_pd(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); + } + } + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + // AVX512 doesn't have horizontal add & horizontal sub instructions. + // TODO: hadd_pd() & hsub_pd() may have scope for improvement. 
+ static inline __m512d hadd_pd(__m512d a, __m512d b) { + __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); + return _mm512_add_pd( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); + } + static inline __m512d hsub_pd(__m512d a, __m512d b) { + __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); + return _mm512_sub_pd( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); + } + __m512d abs_2_() const { + auto val_2 = _mm512_mul_pd(values, values); // a*a b*b + return hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b + } + __m512d abs_() const { + auto real = _mm512_movedup_pd(values); // real real + // movehdup_pd does not exist... + auto imag = _mm512_permute_pd(values, 0xff); // imag imag + return Sleef_hypotd8_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm512_and_pd(abs_(), real_mask); // abs 0 + } + __m512d angle_() const { + // angle = atan2(b/a) + auto b_a = _mm512_permute_pd(values, 0x55); // b a + return Sleef_atan2d8_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + auto angle = _mm512_permute_pd(angle_(), 0x55); // angle 90-angle + return _mm512_and_pd(angle, real_mask); // angle 0 + } + Vectorized> sgn() const { + auto abs = abs_(); + auto zero = _mm512_setzero_pd(); + auto mask = _mm512_cmp_pd_mask(abs, 
zero, _CMP_EQ_OQ); + auto div = _mm512_div_pd(values, abs); + return _mm512_mask_blend_pd(mask, div, zero); + } + __m512d real_() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm512_and_pd(values, real_mask); + } + Vectorized> real() const { + return real_(); + } + __m512d imag_() const { + const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF)); + return _mm512_and_pd(values, imag_mask); + } + Vectorized> imag() const { + return _mm512_permute_pd(imag_(), 0x55); // b a + } + __m512d conj_() const { + const __m512d sign_mask = + _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + return _mm512_xor_pd(values, sign_mask); // a -b + } + Vectorized> conj() const { + return conj_(); + } + Vectorized> log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. + return map(std::log); + } + Vectorized> log2() const { + const __m512d log2_ = _mm512_set1_pd(std::log(2)); + return _mm512_div_pd(log(), log2_); + } + Vectorized> log10() const { + const __m512d log10_ = _mm512_set1_pd(std::log(10)); + return _mm512_div_pd(log(), log10_); + } + Vectorized> log1p() const { + return map(std::log1p); + } + Vectorized> asin() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // // asin(x) + // // = -i*ln(iz + sqrt(1 -z^2)) + // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + // const __m512d one = _mm512_set1_pd(1); + + // auto conj = conj_(); + // auto b_a = _mm512_permute_pd(conj, 0x55); //-b a + // auto ab = _mm512_mul_pd(conj, b_a); //-ab + // -ab auto im = _mm512_add_pd(ab, ab); //-2ab -2ab + + // auto val_2 = _mm512_mul_pd(values, values); // a*a + // b*b auto re = hsub_pd(val_2, _mm512_permute_pd(val_2, 0x55)); // a*a-b*b + // b*b-a*a re = _mm512_sub_pd(one, re); + + // auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt(); + // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_pd(b_a, root)).log(); + // //ln(iz + sqrt()) return Vectorized(_mm512_permute_pd(ln.values, + // 0x55)).conj(); //-i*ln() + return map(std::asin); + } + Vectorized> acos() const { + // acos(x) = pi/2 - asin(x) + constexpr auto pi_2d = c10::pi / 2; + const __m512d pi_2 = + _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0); + return _mm512_sub_pd(pi_2, asin()); + } + Vectorized> atan() const; + Vectorized> atanh() const { + return map(std::atanh); + } + Vectorized> exp() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expd8_u10(values); //exp(a) exp(b) exp = + // _mm512_mask_blend_pd(0xAA, exp, _mm512_permute_pd(exp, 0x55)); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosd8_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm512_mask_blend_pd(0xAA, + // _mm512_permute_pd(sin_cos.y, 0x55), + // sin_cos.x); //cos(b) + // sin(b) + // return _mm512_mul_pd(exp, cos_sin); + return map(std::exp); + } + Vectorized> exp2() const { + // Use identity 2**x = exp(log(2) * x) + const __m512d ln_2 = _mm512_set1_pd(c10::ln_2); + Vectorized> scaled_values = + _mm512_mul_pd(values, ln_2); + return scaled_values.exp(); + } + Vectorized> expm1() const { + return map(std::expm1); + } + Vectorized> sin() const { + return map(std::sin); + } + Vectorized> sinh() const { + return map(std::sinh); + } + Vectorized> cos() const { + return map(std::cos); + } + Vectorized> cosh() const { + return map(std::cosh); + } + Vectorized> ceil() const { + return _mm512_ceil_pd(values); + } + Vectorized> floor() const { + return _mm512_floor_pd(values); + } + Vectorized> neg() const { + auto zero = _mm512_setzero_pd(); + return _mm512_sub_pd(zero, values); + } + Vectorized> round() const { + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized> tan() const { + return map(std::tan); + } + Vectorized> tanh() const { + return map(std::tanh); + } + Vectorized> trunc() const { + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized> sqrt() const { + return map(std::sqrt); + } + Vectorized> reciprocal() const; + Vectorized> rsqrt() const { + return sqrt().reciprocal(); + } + Vectorized> pow( + const Vectorized>& exp) const { + __at_align__ c10::complex x_tmp[size()]; + __at_align__ c10::complex y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (const auto i : c10::irange(size())) { + x_tmp[i] = std::pow(x_tmp[i], 
y_tmp[i]); + } + return loadu(x_tmp); + } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized> operator==( + const Vectorized>& other) const { + auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF)); + } + Vectorized> operator!=( + const Vectorized>& other) const { + auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF)); + } + Vectorized> operator<( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_add_pd(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_sub_pd(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m512d sign_mask = + _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm512_mul_pd(a, b); // ac bd + + auto d_c = _mm512_permute_pd(b, 0x55); // d c + d_c = _mm512_xor_pd(sign_mask, d_c); // d -c + auto ad_bc = _mm512_mul_pd(a, d_c); // ad -bc + 
+ auto ret = Vectorized>::hsub_pd( + ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // auto mask = _mm512_set1_pd(-0.f); + // auto fabs_cd = _mm512_andnot_pd(mask, b); // |c| |d| + // auto fabs_dc = _mm512_permute_pd(fabs_cd, 0x55); // |d| |c| + // auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc)); // 1/sc + // 1/sc auto a2 = _mm512_mul_pd(a, scale); // a/sc b/sc auto b2 = + // _mm512_mul_pd(b, scale); // c/sc d/sc auto acbd2 = + // _mm512_mul_pd(a2, b2); + + // const __m512d sign_mask = _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); auto dc2 = _mm512_permute_pd(b2, 0x55); // d/sc c/sc + // dc2 = _mm512_xor_pd(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm512_mul_pd(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = Vectorized>::hadd_pd(acbd2, adbc2); + // //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm512_div_pd(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return _mm512_loadu_pd(reinterpret_cast(out)); +} + +// reciprocal. Implement this here so we can use multiplication. +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+ // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); auto c_d = _mm512_xor_pd(sign_mask, values); //c -d + // return _mm512_div_pd(c_d, abs_2_()); + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = c10::complex(1) / tmp[i]; + } + return loadu(tmp); +} + +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m512d i = _mm512_setr_pd(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm512_setr_pd(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm512_add_pd(i, values)); // a + // 1+b auto sub = Vectorized(_mm512_sub_pd(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() + return map(std::atan); +} + +template <> +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { + auto zero_vec = _mm512_set1_epi64(0); + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_LT_OQ); + auto max = _mm512_mask_blend_pd(mask, a, b); + // Exploit the fact that all-ones is a NaN. 
+ auto isnan_mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_UNORD_Q); + auto isnan = _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF); + return _mm512_or_pd(max, _mm512_castsi512_pd(isnan)); +} + +template <> +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { + auto zero_vec = _mm512_set1_epi64(0); + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_GT_OQ); + auto min = _mm512_mask_blend_pd(mask, a, b); + // Exploit the fact that all-ones is a NaN. + auto isnan_mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_UNORD_Q); + auto isnan = _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF); + return _mm512_or_pd(min, _mm512_castsi512_pd(isnan)); +} + +template <> +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_and_pd(a, b); +} + +template <> +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_or_pd(a, b); +} + +template <> +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_xor_pd(a, b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm512_set1_pd(1.0)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm512_set1_pd(1.0)); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h new file mode 100644 index 0000000000000000000000000000000000000000..59fce4ea931c3671dfe3c87387a524bcc6666690 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -0,0 +1,1229 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) + +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m512 values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: + using value_type = c10::complex; + using size_type = int; + static constexpr size_type size() { + return 8; + } + Vectorized() { + values = _mm512_setzero_ps(); + } + Vectorized(__m512 v) : values(v) {} + Vectorized(c10::complex val) { + float real_value = val.real(); + float imag_value = val.imag(); + values = _mm512_setr_ps( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4, + c10::complex val5, + c10::complex val6, + c10::complex val7, + c10::complex val8) { + values = 
_mm512_setr_ps( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag(), + val5.real(), + val5.imag(), + val6.real(), + val6.imag(), + val7.real(), + val7.imag(), + val8.real(), + val8.imag()); + } + operator __m512() const { + return values; + } + template + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy + static_assert(mask > -1 && mask < 256, "Unexpected mask value"); + // The compiler would hopefully convert this switch condition + // into a jump table + switch (mask) { + case 0: + return a; + case 1: + return _mm512_mask_blend_ps(0x03, a.values, b.values); + case 2: + return _mm512_mask_blend_ps(0x0C, a.values, b.values); + case 3: + return _mm512_mask_blend_ps(0x0F, a.values, b.values); + case 4: + return _mm512_mask_blend_ps(0x30, a.values, b.values); + case 5: + return _mm512_mask_blend_ps(0x33, a.values, b.values); + case 6: + return _mm512_mask_blend_ps(0x3C, a.values, b.values); + case 7: + return _mm512_mask_blend_ps(0x3F, a.values, b.values); + case 8: + return _mm512_mask_blend_ps(0xC0, a.values, b.values); + case 9: + return _mm512_mask_blend_ps(0xC3, a.values, b.values); + case 10: + return _mm512_mask_blend_ps(0xCC, a.values, b.values); + case 11: + return _mm512_mask_blend_ps(0xCF, a.values, b.values); + case 12: + return _mm512_mask_blend_ps(0xF0, a.values, b.values); + case 13: + return _mm512_mask_blend_ps(0xF3, a.values, b.values); + case 14: + return _mm512_mask_blend_ps(0xFC, a.values, b.values); + case 15: + return _mm512_mask_blend_ps(0xFF, a.values, b.values); + case 16: + return _mm512_mask_blend_ps(0x300, a.values, b.values); + case 17: + return _mm512_mask_blend_ps(0x303, a.values, b.values); + case 18: + return _mm512_mask_blend_ps(0x30C, a.values, b.values); + case 19: + return _mm512_mask_blend_ps(0x30F, a.values, b.values); + case 20: + return _mm512_mask_blend_ps(0x330, 
a.values, b.values); + case 21: + return _mm512_mask_blend_ps(0x333, a.values, b.values); + case 22: + return _mm512_mask_blend_ps(0x33C, a.values, b.values); + case 23: + return _mm512_mask_blend_ps(0x33F, a.values, b.values); + case 24: + return _mm512_mask_blend_ps(0x3C0, a.values, b.values); + case 25: + return _mm512_mask_blend_ps(0x3C3, a.values, b.values); + case 26: + return _mm512_mask_blend_ps(0x3CC, a.values, b.values); + case 27: + return _mm512_mask_blend_ps(0x3CF, a.values, b.values); + case 28: + return _mm512_mask_blend_ps(0x3F0, a.values, b.values); + case 29: + return _mm512_mask_blend_ps(0x3F3, a.values, b.values); + case 30: + return _mm512_mask_blend_ps(0x3FC, a.values, b.values); + case 31: + return _mm512_mask_blend_ps(0x3FF, a.values, b.values); + case 32: + return _mm512_mask_blend_ps(0xC00, a.values, b.values); + case 33: + return _mm512_mask_blend_ps(0xC03, a.values, b.values); + case 34: + return _mm512_mask_blend_ps(0xC0C, a.values, b.values); + case 35: + return _mm512_mask_blend_ps(0xC0F, a.values, b.values); + case 36: + return _mm512_mask_blend_ps(0xC30, a.values, b.values); + case 37: + return _mm512_mask_blend_ps(0xC33, a.values, b.values); + case 38: + return _mm512_mask_blend_ps(0xC3C, a.values, b.values); + case 39: + return _mm512_mask_blend_ps(0xC3F, a.values, b.values); + case 40: + return _mm512_mask_blend_ps(0xCC0, a.values, b.values); + case 41: + return _mm512_mask_blend_ps(0xCC3, a.values, b.values); + case 42: + return _mm512_mask_blend_ps(0xCCC, a.values, b.values); + case 43: + return _mm512_mask_blend_ps(0xCCF, a.values, b.values); + case 44: + return _mm512_mask_blend_ps(0xCF0, a.values, b.values); + case 45: + return _mm512_mask_blend_ps(0xCF3, a.values, b.values); + case 46: + return _mm512_mask_blend_ps(0xCFC, a.values, b.values); + case 47: + return _mm512_mask_blend_ps(0xCFF, a.values, b.values); + case 48: + return _mm512_mask_blend_ps(0xF00, a.values, b.values); + case 49: + return 
_mm512_mask_blend_ps(0xF03, a.values, b.values); + case 50: + return _mm512_mask_blend_ps(0xF0C, a.values, b.values); + case 51: + return _mm512_mask_blend_ps(0xF0F, a.values, b.values); + case 52: + return _mm512_mask_blend_ps(0xF30, a.values, b.values); + case 53: + return _mm512_mask_blend_ps(0xF33, a.values, b.values); + case 54: + return _mm512_mask_blend_ps(0xF3C, a.values, b.values); + case 55: + return _mm512_mask_blend_ps(0xF3F, a.values, b.values); + case 56: + return _mm512_mask_blend_ps(0xFC0, a.values, b.values); + case 57: + return _mm512_mask_blend_ps(0xFC3, a.values, b.values); + case 58: + return _mm512_mask_blend_ps(0xFCC, a.values, b.values); + case 59: + return _mm512_mask_blend_ps(0xFCF, a.values, b.values); + case 60: + return _mm512_mask_blend_ps(0xFF0, a.values, b.values); + case 61: + return _mm512_mask_blend_ps(0xFF3, a.values, b.values); + case 62: + return _mm512_mask_blend_ps(0xFFC, a.values, b.values); + case 63: + return _mm512_mask_blend_ps(0xFFF, a.values, b.values); + case 64: + return _mm512_mask_blend_ps(0x3000, a.values, b.values); + case 65: + return _mm512_mask_blend_ps(0x3003, a.values, b.values); + case 66: + return _mm512_mask_blend_ps(0x300C, a.values, b.values); + case 67: + return _mm512_mask_blend_ps(0x300F, a.values, b.values); + case 68: + return _mm512_mask_blend_ps(0x3030, a.values, b.values); + case 69: + return _mm512_mask_blend_ps(0x3033, a.values, b.values); + case 70: + return _mm512_mask_blend_ps(0x303C, a.values, b.values); + case 71: + return _mm512_mask_blend_ps(0x303F, a.values, b.values); + case 72: + return _mm512_mask_blend_ps(0x30C0, a.values, b.values); + case 73: + return _mm512_mask_blend_ps(0X30C3, a.values, b.values); + case 74: + return _mm512_mask_blend_ps(0x30CC, a.values, b.values); + case 75: + return _mm512_mask_blend_ps(0x30CF, a.values, b.values); + case 76: + return _mm512_mask_blend_ps(0x30F0, a.values, b.values); + case 77: + return _mm512_mask_blend_ps(0x30F3, a.values, b.values); + 
case 78: + return _mm512_mask_blend_ps(0x30FC, a.values, b.values); + case 79: + return _mm512_mask_blend_ps(0x30FF, a.values, b.values); + case 80: + return _mm512_mask_blend_ps(0x3300, a.values, b.values); + case 81: + return _mm512_mask_blend_ps(0X3303, a.values, b.values); + case 82: + return _mm512_mask_blend_ps(0x330C, a.values, b.values); + case 83: + return _mm512_mask_blend_ps(0x330F, a.values, b.values); + case 84: + return _mm512_mask_blend_ps(0x3330, a.values, b.values); + case 85: + return _mm512_mask_blend_ps(0x3333, a.values, b.values); + case 86: + return _mm512_mask_blend_ps(0x333C, a.values, b.values); + case 87: + return _mm512_mask_blend_ps(0X333F, a.values, b.values); + case 88: + return _mm512_mask_blend_ps(0x33C0, a.values, b.values); + case 89: + return _mm512_mask_blend_ps(0x33C3, a.values, b.values); + case 90: + return _mm512_mask_blend_ps(0x33CC, a.values, b.values); + case 91: + return _mm512_mask_blend_ps(0x33CF, a.values, b.values); + case 92: + return _mm512_mask_blend_ps(0x33F0, a.values, b.values); + case 93: + return _mm512_mask_blend_ps(0x33F3, a.values, b.values); + case 94: + return _mm512_mask_blend_ps(0x33FC, a.values, b.values); + case 95: + return _mm512_mask_blend_ps(0x33FF, a.values, b.values); + case 96: + return _mm512_mask_blend_ps(0X3C00, a.values, b.values); + case 97: + return _mm512_mask_blend_ps(0x3C03, a.values, b.values); + case 98: + return _mm512_mask_blend_ps(0x3C0C, a.values, b.values); + case 99: + return _mm512_mask_blend_ps(0x3C0F, a.values, b.values); + case 100: + return _mm512_mask_blend_ps(0x3C30, a.values, b.values); + case 101: + return _mm512_mask_blend_ps(0x3C33, a.values, b.values); + case 102: + return _mm512_mask_blend_ps(0x3C3C, a.values, b.values); + case 103: + return _mm512_mask_blend_ps(0x3C3F, a.values, b.values); + case 104: + return _mm512_mask_blend_ps(0x3CC0, a.values, b.values); + case 105: + return _mm512_mask_blend_ps(0x3CC3, a.values, b.values); + case 106: + return 
_mm512_mask_blend_ps(0x3CCC, a.values, b.values); + case 107: + return _mm512_mask_blend_ps(0x3CCF, a.values, b.values); + case 108: + return _mm512_mask_blend_ps(0x3CF0, a.values, b.values); + case 109: + return _mm512_mask_blend_ps(0x3CF3, a.values, b.values); + case 110: + return _mm512_mask_blend_ps(0x3CFC, a.values, b.values); + case 111: + return _mm512_mask_blend_ps(0x3CFF, a.values, b.values); + case 112: + return _mm512_mask_blend_ps(0x3F00, a.values, b.values); + case 113: + return _mm512_mask_blend_ps(0x3F03, a.values, b.values); + case 114: + return _mm512_mask_blend_ps(0x3F0C, a.values, b.values); + case 115: + return _mm512_mask_blend_ps(0x3F0F, a.values, b.values); + case 116: + return _mm512_mask_blend_ps(0x3F30, a.values, b.values); + case 117: + return _mm512_mask_blend_ps(0x3F33, a.values, b.values); + case 118: + return _mm512_mask_blend_ps(0x3F3C, a.values, b.values); + case 119: + return _mm512_mask_blend_ps(0x3F3F, a.values, b.values); + case 120: + return _mm512_mask_blend_ps(0x3FC0, a.values, b.values); + case 121: + return _mm512_mask_blend_ps(0x3FC3, a.values, b.values); + case 122: + return _mm512_mask_blend_ps(0x3FCC, a.values, b.values); + case 123: + return _mm512_mask_blend_ps(0x3FCF, a.values, b.values); + case 124: + return _mm512_mask_blend_ps(0x3FF0, a.values, b.values); + case 125: + return _mm512_mask_blend_ps(0x3FF3, a.values, b.values); + case 126: + return _mm512_mask_blend_ps(0x3FFC, a.values, b.values); + case 127: + return _mm512_mask_blend_ps(0x3FFF, a.values, b.values); + case 128: + return _mm512_mask_blend_ps(0xC000, a.values, b.values); + case 129: + return _mm512_mask_blend_ps(0xC003, a.values, b.values); + case 130: + return _mm512_mask_blend_ps(0xC00C, a.values, b.values); + case 131: + return _mm512_mask_blend_ps(0xC00F, a.values, b.values); + case 132: + return _mm512_mask_blend_ps(0xC030, a.values, b.values); + case 133: + return _mm512_mask_blend_ps(0xC033, a.values, b.values); + case 134: + return 
_mm512_mask_blend_ps(0xC03C, a.values, b.values); + case 135: + return _mm512_mask_blend_ps(0xC03F, a.values, b.values); + case 136: + return _mm512_mask_blend_ps(0xC0C0, a.values, b.values); + case 137: + return _mm512_mask_blend_ps(0xC0C3, a.values, b.values); + case 138: + return _mm512_mask_blend_ps(0xC0CC, a.values, b.values); + case 139: + return _mm512_mask_blend_ps(0xC0CF, a.values, b.values); + case 140: + return _mm512_mask_blend_ps(0xC0F0, a.values, b.values); + case 141: + return _mm512_mask_blend_ps(0xC0F3, a.values, b.values); + case 142: + return _mm512_mask_blend_ps(0xC0FC, a.values, b.values); + case 143: + return _mm512_mask_blend_ps(0xC0FF, a.values, b.values); + case 144: + return _mm512_mask_blend_ps(0xC300, a.values, b.values); + case 145: + return _mm512_mask_blend_ps(0xC303, a.values, b.values); + case 146: + return _mm512_mask_blend_ps(0xC30C, a.values, b.values); + case 147: + return _mm512_mask_blend_ps(0xC30F, a.values, b.values); + case 148: + return _mm512_mask_blend_ps(0xC330, a.values, b.values); + case 149: + return _mm512_mask_blend_ps(0xC333, a.values, b.values); + case 150: + return _mm512_mask_blend_ps(0xC33C, a.values, b.values); + case 151: + return _mm512_mask_blend_ps(0xC33F, a.values, b.values); + case 152: + return _mm512_mask_blend_ps(0xC3C0, a.values, b.values); + case 153: + return _mm512_mask_blend_ps(0xC3C3, a.values, b.values); + case 154: + return _mm512_mask_blend_ps(0xC3CC, a.values, b.values); + case 155: + return _mm512_mask_blend_ps(0xC3CF, a.values, b.values); + case 156: + return _mm512_mask_blend_ps(0xC3F0, a.values, b.values); + case 157: + return _mm512_mask_blend_ps(0xC3F3, a.values, b.values); + case 158: + return _mm512_mask_blend_ps(0xC3FC, a.values, b.values); + case 159: + return _mm512_mask_blend_ps(0xC3FF, a.values, b.values); + case 160: + return _mm512_mask_blend_ps(0xCC00, a.values, b.values); + case 161: + return _mm512_mask_blend_ps(0xCC03, a.values, b.values); + case 162: + return 
_mm512_mask_blend_ps(0xCC0C, a.values, b.values); + case 163: + return _mm512_mask_blend_ps(0xCC0F, a.values, b.values); + case 164: + return _mm512_mask_blend_ps(0xCC30, a.values, b.values); + case 165: + return _mm512_mask_blend_ps(0xCC33, a.values, b.values); + case 166: + return _mm512_mask_blend_ps(0xCC3C, a.values, b.values); + case 167: + return _mm512_mask_blend_ps(0xCC3F, a.values, b.values); + case 168: + return _mm512_mask_blend_ps(0xCCC0, a.values, b.values); + case 169: + return _mm512_mask_blend_ps(0xCCC3, a.values, b.values); + case 170: + return _mm512_mask_blend_ps(0xCCCC, a.values, b.values); + case 171: + return _mm512_mask_blend_ps(0xCCCF, a.values, b.values); + case 172: + return _mm512_mask_blend_ps(0xCCF0, a.values, b.values); + case 173: + return _mm512_mask_blend_ps(0xCCF3, a.values, b.values); + case 174: + return _mm512_mask_blend_ps(0xCCFC, a.values, b.values); + case 175: + return _mm512_mask_blend_ps(0xCCFF, a.values, b.values); + case 176: + return _mm512_mask_blend_ps(0xCF00, a.values, b.values); + case 177: + return _mm512_mask_blend_ps(0xCF03, a.values, b.values); + case 178: + return _mm512_mask_blend_ps(0xCF0C, a.values, b.values); + case 179: + return _mm512_mask_blend_ps(0xCF0F, a.values, b.values); + case 180: + return _mm512_mask_blend_ps(0xCF30, a.values, b.values); + case 181: + return _mm512_mask_blend_ps(0xCF33, a.values, b.values); + case 182: + return _mm512_mask_blend_ps(0xCF3C, a.values, b.values); + case 183: + return _mm512_mask_blend_ps(0xCF3F, a.values, b.values); + case 184: + return _mm512_mask_blend_ps(0xCFC0, a.values, b.values); + case 185: + return _mm512_mask_blend_ps(0xCFC3, a.values, b.values); + case 186: + return _mm512_mask_blend_ps(0xCFCC, a.values, b.values); + case 187: + return _mm512_mask_blend_ps(0xCFCF, a.values, b.values); + case 188: + return _mm512_mask_blend_ps(0xCFF0, a.values, b.values); + case 189: + return _mm512_mask_blend_ps(0xCFF3, a.values, b.values); + case 190: + return 
_mm512_mask_blend_ps(0xCFFC, a.values, b.values); + case 191: + return _mm512_mask_blend_ps(0xCFFF, a.values, b.values); + case 192: + return _mm512_mask_blend_ps(0xF000, a.values, b.values); + case 193: + return _mm512_mask_blend_ps(0xF003, a.values, b.values); + case 194: + return _mm512_mask_blend_ps(0xF00C, a.values, b.values); + case 195: + return _mm512_mask_blend_ps(0xF00F, a.values, b.values); + case 196: + return _mm512_mask_blend_ps(0xF030, a.values, b.values); + case 197: + return _mm512_mask_blend_ps(0xF033, a.values, b.values); + case 198: + return _mm512_mask_blend_ps(0xF03C, a.values, b.values); + case 199: + return _mm512_mask_blend_ps(0xF03F, a.values, b.values); + case 200: + return _mm512_mask_blend_ps(0XF0C0, a.values, b.values); + case 201: + return _mm512_mask_blend_ps(0xF0C3, a.values, b.values); + case 202: + return _mm512_mask_blend_ps(0xF0CC, a.values, b.values); + case 203: + return _mm512_mask_blend_ps(0xF0CF, a.values, b.values); + case 204: + return _mm512_mask_blend_ps(0xF0F0, a.values, b.values); + case 205: + return _mm512_mask_blend_ps(0xF0F3, a.values, b.values); + case 206: + return _mm512_mask_blend_ps(0xF0FC, a.values, b.values); + case 207: + return _mm512_mask_blend_ps(0xF0FF, a.values, b.values); + case 208: + return _mm512_mask_blend_ps(0XF300, a.values, b.values); + case 209: + return _mm512_mask_blend_ps(0xF303, a.values, b.values); + case 210: + return _mm512_mask_blend_ps(0xF30C, a.values, b.values); + case 211: + return _mm512_mask_blend_ps(0xF30F, a.values, b.values); + case 212: + return _mm512_mask_blend_ps(0xF330, a.values, b.values); + case 213: + return _mm512_mask_blend_ps(0xF333, a.values, b.values); + case 214: + return _mm512_mask_blend_ps(0XF33C, a.values, b.values); + case 215: + return _mm512_mask_blend_ps(0xF33F, a.values, b.values); + case 216: + return _mm512_mask_blend_ps(0xF3C0, a.values, b.values); + case 217: + return _mm512_mask_blend_ps(0xF3C3, a.values, b.values); + case 218: + return 
_mm512_mask_blend_ps(0xF3CC, a.values, b.values); + case 219: + return _mm512_mask_blend_ps(0xF3CF, a.values, b.values); + case 220: + return _mm512_mask_blend_ps(0xF3F0, a.values, b.values); + case 221: + return _mm512_mask_blend_ps(0xF3F3, a.values, b.values); + case 222: + return _mm512_mask_blend_ps(0xF3FC, a.values, b.values); + case 223: + return _mm512_mask_blend_ps(0XF3FF, a.values, b.values); + case 224: + return _mm512_mask_blend_ps(0xFC00, a.values, b.values); + case 225: + return _mm512_mask_blend_ps(0xFC03, a.values, b.values); + case 226: + return _mm512_mask_blend_ps(0xFC0C, a.values, b.values); + case 227: + return _mm512_mask_blend_ps(0xFC0F, a.values, b.values); + case 228: + return _mm512_mask_blend_ps(0xFC30, a.values, b.values); + case 229: + return _mm512_mask_blend_ps(0xFC33, a.values, b.values); + case 230: + return _mm512_mask_blend_ps(0xFC3C, a.values, b.values); + case 231: + return _mm512_mask_blend_ps(0xFC3F, a.values, b.values); + case 232: + return _mm512_mask_blend_ps(0xFCC0, a.values, b.values); + case 233: + return _mm512_mask_blend_ps(0xFCC3, a.values, b.values); + case 234: + return _mm512_mask_blend_ps(0xFCCC, a.values, b.values); + case 235: + return _mm512_mask_blend_ps(0xFCCF, a.values, b.values); + case 236: + return _mm512_mask_blend_ps(0xFCF0, a.values, b.values); + case 237: + return _mm512_mask_blend_ps(0xFCF3, a.values, b.values); + case 238: + return _mm512_mask_blend_ps(0xFCFC, a.values, b.values); + case 239: + return _mm512_mask_blend_ps(0xFCFF, a.values, b.values); + case 240: + return _mm512_mask_blend_ps(0xFF00, a.values, b.values); + case 241: + return _mm512_mask_blend_ps(0xFF03, a.values, b.values); + case 242: + return _mm512_mask_blend_ps(0xFF0C, a.values, b.values); + case 243: + return _mm512_mask_blend_ps(0xFF0F, a.values, b.values); + case 244: + return _mm512_mask_blend_ps(0xFF30, a.values, b.values); + case 245: + return _mm512_mask_blend_ps(0xFF33, a.values, b.values); + case 246: + return 
_mm512_mask_blend_ps(0xFF3C, a.values, b.values); + case 247: + return _mm512_mask_blend_ps(0xFF3F, a.values, b.values); + case 248: + return _mm512_mask_blend_ps(0xFFC0, a.values, b.values); + case 249: + return _mm512_mask_blend_ps(0xFFC3, a.values, b.values); + case 250: + return _mm512_mask_blend_ps(0xFFCC, a.values, b.values); + case 251: + return _mm512_mask_blend_ps(0xFFCF, a.values, b.values); + case 252: + return _mm512_mask_blend_ps(0xFFF0, a.values, b.values); + case 253: + return _mm512_mask_blend_ps(0xFFF3, a.values, b.values); + case 254: + return _mm512_mask_blend_ps(0xFFFC, a.values, b.values); + default: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm512_unpacklo_ps(mask.values, mask.values); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask_), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mmask, a.values, b.values); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + step, + base + c10::complex(2) * step, + base + c10::complex(3) * step, + base + c10::complex(4) * step, + base + c10::complex(5) * step, + base + c10::complex(6) * step, + base + c10::complex(7) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return 
_mm512_loadu_ps(reinterpret_cast(ptr)); + + __at_align__ float tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(2 * size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(c10::complex)); + return _mm512_load_ps(tmp_values); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_ps(reinterpret_cast(ptr), values); + } else if (count > 0) { + float tmp_values[2 * size()]; + _mm512_storeu_ps(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); + } + } + // AVX512 doesn't have horizontal add & horizontal sub instructions. + // TODO: hadd_pd() & hsub_pd() may have scope for improvement. 
+ static inline __m512 hadd_ps(__m512 a, __m512 b) { + __m512i idx1 = _mm512_set_epi32( + 30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); + return _mm512_add_ps( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); + } + static inline __m512 hsub_ps(__m512 a, __m512 b) { + __m512i idx1 = _mm512_set_epi32( + 30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); + return _mm512_sub_ps( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); + } + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + __m512 abs_2_() const { + auto val_2 = _mm512_mul_ps(values, values); // a*a b*b + auto ret = hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b + return ret; + } + __m512 abs_() const { + auto real = _mm512_moveldup_ps(values); // real real + auto imag = _mm512_movehdup_ps(values); // imag imag + return Sleef_hypotf16_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm512_and_ps(abs_(), real_mask); // abs 0 + } + __m512 angle_() const { + // angle = atan2(b/a) + auto b_a = _mm512_permute_ps(values, 0xB1); // b a + return Sleef_atan2f16_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() 
const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle 90-angle + return _mm512_and_ps(angle, real_mask); // angle 0 + } + Vectorized> sgn() const { + auto abs = abs_(); + auto zero = _mm512_setzero_ps(); + auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); + auto div = _mm512_div_ps(values, abs); + return _mm512_mask_blend_ps(mask, div, zero); + } + __m512 real_() const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm512_and_ps(values, real_mask); + } + Vectorized> real() const { + return real_(); + } + __m512 imag_() const { + const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF)); + return _mm512_and_ps(values, imag_mask); + } + Vectorized> imag() const { + return _mm512_permute_ps(imag_(), 0xB1); // b a + } + __m512 conj_() const { + const __m512 sign_mask = _mm512_setr_ps( + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0); + return _mm512_xor_ps(values, sign_mask); // a -b + } + Vectorized> conj() const { + return conj_(); + } + Vectorized> log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + Vectorized> log2() const { + const __m512 log2_ = _mm512_set1_ps(std::log(2)); + return _mm512_div_ps(log(), log2_); + } + Vectorized> log10() const { + const __m512 log10_ = _mm512_set1_ps(std::log(10)); + return _mm512_div_ps(log(), log10_); + } + Vectorized> log1p() const { + return map(std::log1p); + } + Vectorized> asin() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. + // // asin(x) + // // = -i*ln(iz + sqrt(1 -z^2)) + // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + // const __m512 one = _mm512_set1_ps(1); + + // auto conj = conj_(); + // auto b_a = _mm512_permute_ps(conj, 0xB1); //-b a + // auto ab = _mm512_mul_ps(conj, b_a); //-ab + // -ab auto im = _mm512_add_ps(ab, ab); //-2ab -2ab + + // auto val_2 = _mm512_mul_ps(values, values); // a*a + // b*b auto re = hsub_ps(val_2, _mm512_permute_ps(val_2, 0xB1)); // a*a-b*b + // b*b-a*a re = _mm512_sub_ps(one, re); + + // auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt(); + // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_ps(b_a, root)).log(); + // //ln(iz + sqrt()) return Vectorized(_mm512_permute_ps(ln.values, + // 0xB1)).conj(); //-i*ln() + return map(std::asin); + } + Vectorized> acos() const { + return map(std::acos); + } + Vectorized> atan() const; + Vectorized> atanh() const { + return map(std::atanh); + } + Vectorized> exp() const { + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expf16_u10(values); //exp(a) exp(b) exp = + // _mm512_mask_blend_ps(0xAAAA, exp, _mm512_permute_ps(exp, 0xB1)); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosf16_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm512_mask_blend_ps(0xAAAA, + // _mm512_permute_ps(sin_cos.y, 0xB1), + // sin_cos.x); //cos(b) + // sin(b) + // return _mm512_mul_ps(exp, cos_sin); + return map(std::exp); + } + Vectorized> exp2() const { + // Use identity 2**x = exp(log(2) * x) + const __m512 ln_2 = _mm512_set1_ps(c10::ln_2); + Vectorized> scaled_values = _mm512_mul_ps(values, ln_2); + return scaled_values.exp(); + } + Vectorized> expm1() const { + return map(std::expm1); + } + Vectorized> sin() const { + return map(std::sin); + } + Vectorized> sinh() const { + return map(std::sinh); + } + Vectorized> cos() const { + return map(std::cos); + } + Vectorized> cosh() const { + return map(std::cosh); + } + Vectorized> ceil() const { + return _mm512_ceil_ps(values); + } + Vectorized> floor() const { + return _mm512_floor_ps(values); + } + Vectorized> neg() const { + auto zero = _mm512_setzero_ps(); + return _mm512_sub_ps(zero, values); + } + Vectorized> round() const { + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized> tan() const { + return map(std::tan); + } + Vectorized> tanh() const { + return map(std::tanh); + } + Vectorized> trunc() const { + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized> sqrt() const { + return map(std::sqrt); + } + Vectorized> reciprocal() const; + Vectorized> rsqrt() const { + return sqrt().reciprocal(); + } + Vectorized> pow( + const Vectorized>& exp) const { + __at_align__ c10::complex x_tmp[size()]; + __at_align__ c10::complex y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (const auto i : c10::irange(size())) { + x_tmp[i] = std::pow(x_tmp[i], 
y_tmp[i]); + } + return loadu(x_tmp); + } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized> operator==( + const Vectorized>& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); + } + Vectorized> operator!=( + const Vectorized>& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); + } + Vectorized> operator<( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_add_ps(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_sub_ps(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m512 sign_mask = _mm512_setr_ps( + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0); + auto ac_bd = _mm512_mul_ps(a, b); // ac bd + + auto d_c = _mm512_permute_ps(b, 0xB1); // d c + d_c = _mm512_xor_ps(sign_mask, d_c); 
// d -c + auto ad_bc = _mm512_mul_ps(a, d_c); // ad -bc + + auto ret = Vectorized>::hsub_ps( + ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // auto mask = _mm512_set1_ps(-0.f); + // auto fabs_cd = _mm512_andnot_ps(mask, b); // |c| |d| + // auto fabs_dc = _mm512_permute_ps(fabs_cd, 0xB1); // |d| |c| + // auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc)); // 1/sc + // 1/sc auto a2 = _mm512_mul_ps(a, scale); // a/sc b/sc auto b2 = + // _mm512_mul_ps(b, scale); // c/sc d/sc auto acbd2 = + // _mm512_mul_ps(a2, b2); + + // const __m512 sign_mask = _mm512_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0, + // -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); + // auto dc2 = _mm512_permute_ps(b2, 0xB1); // d/sc c/sc + // dc2 = _mm512_xor_ps(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm512_mul_ps(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = Vectorized>::hadd_ps(acbd2, adbc2); + // //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm512_div_ps(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return _mm512_loadu_ps(reinterpret_cast(out)); +} + +// reciprocal. Implement this here so we can use multiplication. +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+ // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0, + // 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); + // auto c_d = _mm512_xor_ps(sign_mask, values); //c -d + // return _mm512_div_ps(c_d, abs_2_()); + __at_align__ c10::complex tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = c10::complex(1) / tmp[i]; + } + return loadu(tmp); +} + +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m512 i = _mm512_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + // 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm512_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5, + // 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm512_add_ps(i, values)); // a + // 1+b auto sub = Vectorized(_mm512_sub_ps(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() + return map(std::atan); +} + +template <> +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { + auto zero_vector = _mm512_set1_epi32(0); + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_LT_OQ); + auto max = _mm512_mask_blend_ps(mask, a, b); + // Exploit the fact that all-ones is a NaN. 
+ auto isnan_mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_UNORD_Q); + auto isnan = _mm512_mask_set1_epi32(zero_vector, isnan_mask, 0xFFFFFFFF); + return _mm512_or_ps(max, _mm512_castsi512_ps(isnan)); +} + +template <> +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { + auto zero_vector = _mm512_set1_epi32(0); + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + auto mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_GT_OQ); + auto min = _mm512_mask_blend_ps(mask, a, b); + // Exploit the fact that all-ones is a NaN. + auto isnan_mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_UNORD_Q); + auto isnan = _mm512_mask_set1_epi32(zero_vector, isnan_mask, 0xFFFFFFFF); + return _mm512_or_ps(min, _mm512_castsi512_ps(isnan)); +} + +template <> +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_and_ps(a, b); +} + +template <> +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_or_ps(a, b); +} + +template <> +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_xor_ps(a, b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm512_set1_ps(1.0f)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm512_set1_ps(1.0f)); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_convert.h new file mode 100644 index 0000000000000000000000000000000000000000..44d8b70fa3c512d3b30557631b7cfed674252df9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_convert.h @@ -0,0 +1,345 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + __m512 value; + cvtbf16_fp32(_mm512_castsi512_si256(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + __m512 value; + cvtfp16_fp32(_mm512_castsi512_si256(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = _mm512_castsi256_si512(cvtfp32_bf16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = convert_float_bfloat16(src[0], src[1]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + std::tie(result[0], result[1]) = convert_bfloat16_float(src[0]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + 
VectorizedN result; + result[0] = _mm512_castsi256_si512(cvtfp32_fp16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + result[0] = convert_float_half(src[0], src[1]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + std::tie(result[0], result[1]) = convert_half_float(src[0]); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm512_cvtepi64_ps(src[0]); + auto high = _mm512_cvtepi64_ps(src[1]); + return Vectorized( + _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm512_cvt_roundps_epi64( + _mm512_castps512_ps256(src[0]), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + result[1] = _mm512_cvt_roundps_epi64( + _mm512_extractf32x8_ps(src[0], 1), + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm512_cvtepi64_epi32(src[0]); + auto high = _mm512_cvtepi64_epi32(src[1]); + return Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(src[0])); + result[1] = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(src[0], 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_castsi512_si128(src[0]); + return Vectorized(_mm512_cvtepi8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static 
inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_castsi512_si128(src[0]); + return Vectorized(_mm512_cvtepu8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + return Vectorized(_mm512_cvttps_epi32(src[0])); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + return Vectorized(_mm512_cvtepi32_ps(src[0])); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src256 = _mm512_castsi512_si256(src[0]); + return Vectorized(_mm512_cvtepu8_epi16(src256)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_cvtepi32_epi8(src[0]); + return Vectorized(_mm512_castsi128_si512(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src256 = _mm512_cvtepi16_epi8(src[0]); + return Vectorized(_mm512_castsi256_si512(src256)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + src_t, + 1, + typename std::enable_if_t< + (is_reduced_floating_point_v && is_8bit_integer_v) || + (is_reduced_floating_point_v && is_8bit_integer_v), + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN tmp_fp32 = VecConvert::apply(src); + return VecConvert::apply(tmp_fp32); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 2, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + at::vec::Vectorized vec1 = convert_float_to_int8(src[0]); + at::vec::Vectorized vec2 = convert_float_to_int8(src[1]); + __m128 lane2 = _mm512_castps512_ps128(_mm512_castsi512_ps(vec2)); + __m512 result = _mm512_insertf32x4( + _mm512_castsi512_ps(vec1), + lane2, + 1); // Insert lane2 into the second 128-bit lane + return at::vec::Vectorized(_mm512_castps_si512(result)); + } +}; + 
+template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_float_to_int8(src[0]); + } +}; + +template +struct VecConvert< + float, + 2, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + __m512i src2 = + _mm512_castsi128_si512(_mm_castps_si128(_mm512_extractf32x4_ps( + _mm512_castsi512_ps(src[0]), 1) // Extract the second 128-bit lane + )); + return VectorizedN( + convert_int8_to_float(src[0]), + convert_int8_to_float(src2)); + } +}; + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_to_float(src[0]); + } +}; + +template +struct VecConvert< + dst_t, + 1, + int64_t, + 2, + std::enable_if_t< + std::is_same_v || std::is_same_v>> { + static inline VectorizedN apply( + const VectorizedN& src) { + return VecConvert::apply( + VecConvert::apply(src)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + at::vec::Vectorized src = src_n[0]; + __m128i res128 = cvtfp32_fp8e4m3(src); + return at::vec::Vectorized(_mm512_castsi128_si512(res128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + // cvt first 16x8 bits from Float8_e4m3fn to float + at::vec::Vectorized src = src_n[0]; + __m512 result; + cvtfp8e4m3_fp32(_mm512_castsi512_si128(src), result); + return at::vec::Vectorized(result); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + at::vec::Vectorized src = src_n[0]; + __m128i res128 = cvtfp32_fp8e5m2(src); + return at::vec::Vectorized(_mm512_castsi128_si512(res128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + // cvt first 16x8 
bits from Float8_e5m2 to float + at::vec::Vectorized src = src_n[0]; + __m512 result; + cvtfp8e5m2_fp32(_mm512_castsi512_si128(src), result); + return at::vec::Vectorized(result); + } +}; + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ca121d301df6c9fb71b0eef28a9efe8fd03f8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h @@ -0,0 +1,571 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#if (defined(CPU_CAPABILITY_AVX512)) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: + // values needs to be public for compilation with clang + // as vec512.h uses it + __m512d values; + using value_type = double; + using size_type = int; + static constexpr size_type size() { + return 8; + } + Vectorized() { + values = _mm512_setzero_pd(); + } + Vectorized(__m512d v) : values(v) {} + Vectorized(double val) { + values = _mm512_set1_pd(val); + } + Vectorized( + double val1, + double val2, + double val3, + double val4, + double val5, + double val6, + double val7, + double val8) { + values = _mm512_setr_pd(val1, val2, val3, val4, val5, val6, val7, val8); + } + operator __m512d() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mask_blend_pd(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mmask = _mm512_cmp_epi64_mask( + _mm512_castpd_si512(mask.values), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mmask, a.values, b.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: 
+ return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return _mm512_loadu_pd(reinterpret_cast(ptr)); + + __mmask8 mask = (1ULL << count) - 1; + return _mm512_maskz_loadu_pd(mask, ptr); + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_pd(reinterpret_cast(ptr), values); + } else if (count > 0) { + __mmask8 mask = (1ULL << count) - 1; + _mm512_mask_storeu_pd(reinterpret_cast(ptr), mask, values); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + __mmask8 cmp = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_EQ_OQ); + return static_cast(cmp); + } + Vectorized isnan() const { + auto cmp_mask = + _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_UNORD_Q); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + bool has_inf_nan() const { + __m512d self_sub = _mm512_sub_pd(values, values); + return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & + 0x7777777777777777) != 0; + } + Vectorized map(double (*const f)(double)) const { + __at_align__ double tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = _mm512_set1_pd(-0.f); + return _mm512_andnot_pd(mask, values); + } + Vectorized angle() const { + const auto zero_vec = _mm512_castsi512_pd(zero_vector); + const auto nan_vec = _mm512_set1_pd(NAN); + const auto not_nan_mask = _mm512_cmp_pd_mask(values, values, _CMP_EQ_OQ); 
+ const auto not_nan = + _mm512_mask_set1_epi64(zero_vector, not_nan_mask, 0xFFFFFFFFFFFFFFFF); + const auto nan_mask = + _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan), zero_vec, _CMP_EQ_OQ); + const auto pi = _mm512_set1_pd(c10::pi); + + const auto neg_mask = _mm512_cmp_pd_mask(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm512_mask_blend_pd(neg_mask, zero_vec, pi); + angle = _mm512_mask_blend_pd(nan_mask, angle, nan_vec); + return angle; + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_pd(0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return Vectorized(Sleef_acosd8_u10(values)); + } + Vectorized acosh() const { + return Vectorized(Sleef_acoshd8_u10(values)); + } + Vectorized asin() const { + return Vectorized(Sleef_asind8_u10(values)); + } + Vectorized asinh() const { + return Vectorized(Sleef_asinhd8_u10(values)); + } + Vectorized atan() const { + return Vectorized(Sleef_atand8_u10(values)); + } + Vectorized atanh() const { + return Vectorized(Sleef_atanhd8_u10(values)); + } + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2d8_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { + return Vectorized(Sleef_copysignd8(values, sign)); + } + Vectorized erf() const { + return Vectorized(Sleef_erfd8_u10(values)); + } + Vectorized erfc() const { + return Vectorized(Sleef_erfcd8_u15(values)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() const { + return Vectorized(Sleef_expd8_u10(values)); + } + Vectorized exp2() const { + return Vectorized(Sleef_exp2d8_u10(values)); + } + Vectorized expm1() const { + return Vectorized(Sleef_expm1d8_u10(values)); + } + Vectorized exp_u20() const { + return exp(); + } + Vectorized fexp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const { + return Vectorized(Sleef_fmodd8(values, q)); + } + Vectorized hypot(const Vectorized& b) 
const { + return Vectorized(Sleef_hypotd8_u05(values, b)); + } + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ double tmp[size()]; + __at_align__ double tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized log() const { + return Vectorized(Sleef_logd8_u10(values)); + } + Vectorized log2() const { + return Vectorized(Sleef_log2d8_u10(values)); + } + Vectorized log10() const { + return Vectorized(Sleef_log10d8_u10(values)); + } + Vectorized log1p() const { + return Vectorized(Sleef_log1pd8_u10(values)); + } + Vectorized sin() const { + return Vectorized(Sleef_sind8_u10(values)); + } + Vectorized sinh() const { + return Vectorized(Sleef_sinhd8_u10(values)); + } + Vectorized cos() const { + return Vectorized(Sleef_cosd8_u10(values)); + } + Vectorized cosh() const { + return Vectorized(Sleef_coshd8_u10(values)); + } + Vectorized ceil() const { + return _mm512_ceil_pd(values); + } + Vectorized floor() const { + return _mm512_floor_pd(values); + } + Vectorized frac() const; + Vectorized neg() const { + return _mm512_xor_pd(_mm512_set1_pd(-0.), values); + } + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterd8(values, b)); + } + Vectorized round() const { + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized tan() const { + return Vectorized(Sleef_tand8_u10(values)); + } + Vectorized tanh() const { + return 
Vectorized(Sleef_tanhd8_u10(values)); + } + Vectorized trunc() const { + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized lgamma() const { + return Vectorized(Sleef_lgammad8_u10(values)); + } + Vectorized sqrt() const { + return _mm512_sqrt_pd(values); + } + Vectorized reciprocal() const { + return _mm512_div_pd(_mm512_set1_pd(1), values); + } + Vectorized rsqrt() const { + return _mm512_div_pd(_mm512_set1_pd(1), _mm512_sqrt_pd(values)); + } + Vectorized pow(const Vectorized& b) const { + return Vectorized(Sleef_powd8_u10(values, b)); + } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const { + auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + + Vectorized operator!=(const Vectorized& other) const { + auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + + Vectorized operator<(const Vectorized& other) const { + auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LT_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + + Vectorized operator<=(const Vectorized& other) const { + auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LE_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + + Vectorized operator>(const Vectorized& other) const { + auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GT_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + + Vectorized operator>=(const Vectorized& other) const { + auto cmp_mask = 
_mm512_cmp_pd_mask(values, other.values, _CMP_GE_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_pd(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_pd(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mul_pd(a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return _mm512_div_pd(a, b); +} + +// frac. Implement this here so we can use subtraction. +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi64(0); + Vectorized max = _mm512_max_pd(a, b); + auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); + // Exploit the fact that all-ones is a NaN. + return _mm512_or_pd(max, isnan); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi64(0); + Vectorized min = _mm512_min_pd(a, b); + auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); + // Exploit the fact that all-ones is a NaN. + return _mm512_or_pd(min, isnan); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return _mm512_min_pd(max, _mm512_max_pd(min, a)); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return _mm512_max_pd(min, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return _mm512_min_pd(max, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_pd(a, b); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_pd(a, b); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_pd(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0); +} + +template <> +inline void convert(const double* src, 
double* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + _mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i)); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fmadd_pd(a, b, c); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fnmadd_pd(a, b, c); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fmsub_pd(a, b, c); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fnmsub_pd(a, b, c); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h new file mode 100644 index 0000000000000000000000000000000000000000..e390db15bfa62b8607ffa72e8bca018e8e1a9432 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h @@ -0,0 +1,945 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + static constexpr __m512i zero_vec{0, 0, 0, 0, 0, 0, 0, 0}; + + public: + __m512 values; + using value_type = float; + using size_type = int; + static constexpr size_type size() { + return 16; + } + Vectorized() { + values = _mm512_setzero_ps(); + } + Vectorized(__m512 v) : values(v) {} + Vectorized(float val) { + values = _mm512_set1_ps(val); + } + Vectorized( + float val1, + float val2, + float val3, + float val4, + float val5, + float val6, + float val7, + float val8, + float val9, + float val10, + float val11, + float val12, + float val13, + float val14, + float val15, + float val16) { + values = _mm512_setr_ps( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + Vectorized(const float (&arr)[16]) + : Vectorized( + arr[0], + arr[1], + arr[2], + arr[3], + arr[4], + arr[5], + arr[6], + arr[7], + arr[8], + arr[9], + arr[10], + arr[11], + arr[12], + arr[13], + arr[14], + arr[15]) {} + operator __m512() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mask_blend_ps(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask.values), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mmask, a.values, b.values); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = 
static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return _mm512_loadu_ps(reinterpret_cast(ptr)); + + __mmask16 mask = (1ULL << count) - 1; + return _mm512_maskz_loadu_ps(mask, ptr); + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + _mm512_storeu_ps(reinterpret_cast(ptr), values); + } else if (count > 0) { + __mmask16 mask = (1ULL << count) - 1; + _mm512_mask_storeu_ps(reinterpret_cast(ptr), mask, values); + } + } + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + __mmask16 cmp = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_EQ_OQ); + return static_cast(cmp); + } + Vectorized isnan() const { + auto mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), 
_CMP_UNORD_Q); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + bool has_inf_nan() const { + __m512 self_sub = _mm512_sub_ps(values, values); + return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & + 0x7777777777777777) != 0; + } + Vectorized map(float (*const f)(float)) const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = _mm512_set1_ps(-0.f); + return _mm512_andnot_ps(mask, values); + } + Vectorized angle() const { + __m512 zero_vec = _mm512_set1_ps(0.f); + const auto nan_vec = _mm512_set1_ps(NAN); + const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ); + const auto not_nan_vec = _mm512_mask_set1_epi32( + _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); + const auto nan_mask = _mm512_cmp_ps_mask( + _mm512_castsi512_ps(not_nan_vec), zero_vec, _CMP_EQ_OQ); + const auto pi = _mm512_set1_ps(c10::pi); + + const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm512_mask_blend_ps(neg_mask, zero_vec, pi); + angle = _mm512_mask_blend_ps(nan_mask, angle, nan_vec); + return angle; + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_ps(0); + } + Vectorized conj() const { + return *this; + } + Vectorized acos() const { + return Vectorized(Sleef_acosf16_u10(values)); + } + Vectorized acosh() const { + return Vectorized(Sleef_acoshf16_u10(values)); + } + Vectorized asin() const { + return Vectorized(Sleef_asinf16_u10(values)); + } + Vectorized asinh() const { + return Vectorized(Sleef_asinhf16_u10(values)); + } + Vectorized atan() const { + return Vectorized(Sleef_atanf16_u10(values)); + } + Vectorized atanh() const { + return Vectorized(Sleef_atanhf16_u10(values)); + } + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2f16_u10(values, b)); + } + 
Vectorized copysign(const Vectorized& sign) const { + return Vectorized(Sleef_copysignf16(values, sign)); + } + Vectorized erf() const { + // constants + const auto neg_zero_vec = _mm512_set1_ps(-0.f); + const auto one_vec = _mm512_set1_ps(1.0f); + const auto p = _mm512_set1_ps(0.3275911f); + const auto p1 = _mm512_set1_ps(0.254829592f); + const auto p2 = _mm512_set1_ps(-0.284496736f); + const auto p3 = _mm512_set1_ps(1.421413741f); + const auto p4 = _mm512_set1_ps(-1.453152027f); + const auto p5 = _mm512_set1_ps(1.061405429f); + // sign(x) + auto sign_mask = _mm512_and_ps(neg_zero_vec, values); + auto abs_vec = _mm512_abs_ps(values); + // t = 1 / (p * abs(x) + 1) + auto tmp0 = _mm512_fmadd_ps(p, abs_vec, one_vec); + auto t = _mm512_div_ps(one_vec, tmp0); + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = _mm512_fmadd_ps(p5, t, p4); + auto tmp2 = _mm512_fmadd_ps(tmp1, t, p3); + auto tmp3 = _mm512_fmadd_ps(tmp2, t, p2); + auto r = _mm512_fmadd_ps(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = _mm512_mul_ps(values, values); + auto neg_pow_2 = _mm512_xor_ps(neg_zero_vec, pow_2); + // auto tmp4 = exp(neg_pow_2); + auto tmp4 = Vectorized(Sleef_expf16_u10(neg_pow_2)); + auto tmp5 = _mm512_xor_ps(neg_zero_vec, tmp4); + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = _mm512_mul_ps(tmp5, t); + auto tmp7 = _mm512_fmadd_ps(tmp6, r, one_vec); + return _mm512_xor_ps(sign_mask, tmp7); + } + Vectorized erfc() const { + return Vectorized(Sleef_erfcf16_u15(values)); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() const { + return Vectorized(Sleef_expf16_u10(values)); + } + Vectorized exp2() const { + return Vectorized(Sleef_exp2f16_u10(values)); + } + Vectorized expm1() const { + return Vectorized(Sleef_expm1f16_u10(values)); + } + Vectorized fexp_u20() const { + const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f); + const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f); + const __m512 vec_c2 
= _mm512_set1_ps(-0.22433836478672356); + const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236); + + const __m512 vec_exp_log2ef = + _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) + + const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2)); + const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f); + + const __m512 vec_ln_flt_min = + _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); + const __m512 vec_ln_flt_max = + _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); + __m512i vec_infinity = _mm512_set1_epi32(0x7F800000); + __m512i vec_zero = _mm512_setzero_epi32(); + + // Fast Exponential Computation on SIMD Architectures + // A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro + // Curioni exp(x) = 2**(x * log2(e)) + // = 2**xi * 2**xf - TIPS we are using the EEEE floating point + // representation with identification to the exponent and the + // mentissa + // 2**xf will be approximated to a polynomial of degree 3 computed with + // Horner method + // mask for the boundary condition + auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS); + auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS); + + // transformation with log2(e) + auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef); + auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src)); + + // compute polynomial using Horner Scheme, for superscalar processor + auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2); + vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1); + vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0); + + vec_src = _mm512_sub_ps(vec_src, vec_res); + // the tips is here, headache in perspective + auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b); + // headache bis - we loose precision with the cast but it "fits", but ok + // after f32 -> f16 later + __m512i casted_integer = _mm512_cvttps_epi32(tmp); + // boundary condition, lower than the min -> 0 + 
casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero); + // boundary condition, larger than the max -> +oo + casted_integer = + _mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity); + // final interpretation to float + return _mm512_castsi512_ps(casted_integer); + } + Vectorized exp_u20() const { + // A faster version of exp with ULP=20 + const __m512 vec_factorial_1 = + _mm512_set1_ps(0.999999701f); // 1/factorial(1) + const __m512 vec_factorial_2 = + _mm512_set1_ps(0.499991506f); // 1/factorial(2) + const __m512 vec_factorial_3 = + _mm512_set1_ps(0.166676521f); // 1/factorial(3) + const __m512 vec_factorial_4 = + _mm512_set1_ps(0.0418978221f); // 1/factorial(4) + const __m512 vec_factorial_5 = + _mm512_set1_ps(0.00828929059f); // 1/factorial(5) + const __m512 vec_exp_log2ef = + _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) + const __m512 vec_half = _mm512_set1_ps(0.5f); + const __m512 vec_one = _mm512_set1_ps(1.f); + const __m512 vec_zero = _mm512_set1_ps(0.f); + const __m512 vec_two = _mm512_set1_ps(2.f); + const __m512 vec_ln2f = + _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) + const __m512 vec_ln_flt_min = + _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); + const __m512 vec_ln_flt_max = + _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); + const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); + const int n_mantissa_bits = 23; + + // exp(x) = + // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem + // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression + + auto less_ln_flt_min_mask = + _mm512_cmp_ps_mask(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/); + auto vec_src = _mm512_min_ps(values, vec_ln_flt_max); + vec_src = _mm512_max_ps(vec_src, vec_ln_flt_min); + + // fx = floorf(x * log2ef + 0.5) + auto vec_fx = _mm512_fmadd_ps(vec_src, vec_exp_log2ef, vec_half); + auto vec_fx_i = _mm512_cvt_roundps_epi32( + vec_fx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + vec_fx = 
_mm512_cvtepi32_ps(vec_fx_i); + + // x = x - fx * ln2 + auto vec_exp_poly = _mm512_fnmadd_ps(vec_fx, vec_ln2f, vec_src); + + // compute polynomial + auto vec_res = + _mm512_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4); + vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3); + vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2); + vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1); + vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_one); + + // compute 2^(n-1) + auto vec_exp_number = _mm512_sub_ps(vec_fx, vec_one); + auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); + auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); + vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); + auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); + vec_two_pow_n = + _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); + + // y = y * 2^n + vec_res = _mm512_mul_ps(vec_res, vec_two_pow_n); + vec_res = _mm512_mul_ps(vec_res, vec_two); + return vec_res; + } + Vectorized fmod(const Vectorized& q) const { + return Vectorized(Sleef_fmodf16(values, q)); + } + Vectorized log() const { + return Vectorized(Sleef_logf16_u10(values)); + } + Vectorized log2() const { + return Vectorized(Sleef_log2f16_u10(values)); + } + Vectorized log10() const { + return Vectorized(Sleef_log10f16_u10(values)); + } + Vectorized log1p() const { + return Vectorized(Sleef_log1pf16_u10(values)); + } + Vectorized frac() const; + Vectorized sin() const { + return Vectorized(Sleef_sinf16_u35(values)); + } + Vectorized sinh() const { + return Vectorized(Sleef_sinhf16_u10(values)); + } + Vectorized cos() const { + return Vectorized(Sleef_cosf16_u35(values)); + } + Vectorized cosh() const { + return Vectorized(Sleef_coshf16_u10(values)); + } + Vectorized ceil() const { + return _mm512_ceil_ps(values); + } + Vectorized floor() const { + return _mm512_floor_ps(values); + } + Vectorized hypot(const 
Vectorized& b) const { + return Vectorized(Sleef_hypotf16_u05(values, b)); + } + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igamma(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized igammac(const Vectorized& x) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_x[size()]; + store(tmp); + x.store(tmp_x); + for (const auto i : c10::irange(size())) { + tmp[i] = calc_igammac(tmp[i], tmp_x[i]); + } + return loadu(tmp); + } + Vectorized neg() const { + return _mm512_xor_ps(_mm512_set1_ps(-0.f), values); + } + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterf16(values, b)); + } + Vectorized round() const { + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vectorized tan() const { + return Vectorized(Sleef_tanf16_u10(values)); + } + Vectorized tanh() const { + return Vectorized(Sleef_tanhf16_u10(values)); + } + Vectorized trunc() const { + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vectorized lgamma() const { + return Vectorized(Sleef_lgammaf16_u10(values)); + } + Vectorized sqrt() const { + return _mm512_sqrt_ps(values); + } + Vectorized reciprocal() const { + return _mm512_div_ps(_mm512_set1_ps(1), values); + } + Vectorized rsqrt() const { + return _mm512_div_ps(_mm512_set1_ps(1), _mm512_sqrt_ps(values)); + } + Vectorized pow(const Vectorized& b) const { + return Vectorized(Sleef_powf16_u10(values, b)); + } + float reduce_add() const { + return _mm512_reduce_add_ps(values); + } + float reduce_max() const { + return _mm512_reduce_max_ps(values); + } + // Comparison using the _CMP_**_OQ 
predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + + Vectorized operator!=(const Vectorized& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + + Vectorized operator<(const Vectorized& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + + Vectorized operator<=(const Vectorized& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + + Vectorized operator>(const Vectorized& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + + Vectorized operator>=(const Vectorized& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_ps(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_ps(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& 
a, + const Vectorized& b) { + return _mm512_mul_ps(a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return _mm512_div_ps(a, b); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi32(0); + auto max = _mm512_max_ps(a, b); + auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); + // Exploit the fact that all-ones is a NaN. + return _mm512_or_ps(max, isnan); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi32(0); + auto min = _mm512_min_ps(a, b); + auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); + // Exploit the fact that all-ones is a NaN. 
+ return _mm512_or_ps(min, isnan); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return _mm512_min_ps(max, _mm512_max_ps(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return _mm512_min_ps(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return _mm512_max_ps(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_ps(a, b); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_ps(a, b); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_ps(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const float* src, float* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + _mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i)); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + 
const Vectorized& b, + const Vectorized& c) { + return _mm512_fmadd_ps(a, b, c); +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fnmadd_ps(a, b, c); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fmsub_ps(a, b, c); +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return _mm512_fnmsub_ps(a, b, c); +} + +// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) +// Used by Inductor CPP codegen for micro gemm +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304 +// kernel for transposing mxn where m, n <= 16 +// (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + N instructions +inline void transpose_block( + at::vec::VectorizedN& input, + int M = 16, + int N = 16) { + TORCH_CHECK(M <= 16 && N <= 16, "transpose_block expects M, N <= 16."); + // unpacking and interleaving 32-bit elements + __m512 temp[16]; + int i; + for (i = 0; i < (M + 1) / 2; ++i) { + temp[2 * i] = _mm512_unpacklo_ps(input[2 * i], input[2 * i + 1]); + temp[2 * i + 1] = _mm512_unpackhi_ps(input[2 * i], input[2 * i + 1]); + } + for (i = i * 2; i < 16; ++i) { + temp[i] = _mm512_setzero_ps(); + } + + // unpacking and interleaving 64-bit elements + for (i = 0; i < (M + 3) / 4; ++i) { + input[4 * i] = _mm512_castpd_ps(_mm512_unpacklo_pd( + _mm512_castps_pd(temp[4 * i]), _mm512_castps_pd(temp[4 * i + 2]))); + input[4 * i + 1] = _mm512_castpd_ps(_mm512_unpackhi_pd( + _mm512_castps_pd(temp[4 * i]), _mm512_castps_pd(temp[4 * i + 2]))); + input[4 * i + 2] = _mm512_castpd_ps(_mm512_unpacklo_pd( + _mm512_castps_pd(temp[4 * i + 1]), _mm512_castps_pd(temp[4 * i + 3]))); + input[4 * i + 3] = _mm512_castpd_ps(_mm512_unpackhi_pd( + _mm512_castps_pd(temp[4 * i + 1]), 
_mm512_castps_pd(temp[4 * i + 3]))); + } + + // shuffle 128-bits (composed of 4 32-bit elements) + for (i = 0; i < (M + 7) / 8; ++i) { + temp[8 * i] = _mm512_shuffle_f32x4(input[8 * i], input[8 * i + 4], 0x88); + temp[8 * i + 1] = + _mm512_shuffle_f32x4(input[8 * i + 1], input[8 * i + 5], 0x88); + temp[8 * i + 2] = + _mm512_shuffle_f32x4(input[8 * i + 2], input[8 * i + 6], 0x88); + temp[8 * i + 3] = + _mm512_shuffle_f32x4(input[8 * i + 3], input[8 * i + 7], 0x88); + temp[8 * i + 4] = + _mm512_shuffle_f32x4(input[8 * i], input[8 * i + 4], 0xdd); + temp[8 * i + 5] = + _mm512_shuffle_f32x4(input[8 * i + 1], input[8 * i + 5], 0xdd); + temp[8 * i + 6] = + _mm512_shuffle_f32x4(input[8 * i + 2], input[8 * i + 6], 0xdd); + temp[8 * i + 7] = + _mm512_shuffle_f32x4(input[8 * i + 3], input[8 * i + 7], 0xdd); + } + + for (i = 0; i < N; ++i) { + if (i < 8) { + input[i] = _mm512_shuffle_f32x4(temp[i], temp[8 + i], 0x88); + } else { + input[i] = _mm512_shuffle_f32x4(temp[i - 8], temp[i], 0xdd); + } + } +} + +// TODO(jgong5): rewrite with ATEN vectorized (need to add unpack and shuffle) +// Used by Inductor CPP codegen +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304 +// kernel for transposing mxn where m, n <= 16 +// M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions +inline void transpose_mxn_16x16( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst, + int M, + int N) { + TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn expects M, N <= 16."); + // load from src to registers + at::vec::VectorizedN input; + int i; + if (N == 16) { + for (i = 0; i < M; ++i) { + input[i] = _mm512_loadu_ps(&src[i * ld_src]); + } + } else { + __mmask16 src_mask = (1 << N) - 1; + for (i = 0; i < M; ++i) { + input[i] = _mm512_maskz_loadu_ps(src_mask, &src[i * ld_src]); + } + } + for (; i < 16; ++i) { + // Not really needed but to avoid uninitialized variable warning. 
+ // Shouldn't be much overhead because xor can be executed in parallel with + // other instructions. + input[i] = _mm512_setzero_ps(); + } + + transpose_block(input, M, N); + + // store from registers to dst + if (M == 16) { + for (i = 0; i < N; ++i) { + _mm512_storeu_ps(&dst[i * ld_dst], input[i]); + } + } else { + __mmask16 dst_mask = (1 << M) - 1; + for (i = 0; i < N; ++i) { + _mm512_mask_storeu_ps(&dst[i * ld_dst], dst_mask, input[i]); + } + } +} + +template <> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst, + int M, + int N) { + int64_t i = 0; + for (; i < M / 16 * 16; i += 16) { + int64_t j = 0; + for (; j < N / 16 * 16; j += 16) { + transpose_mxn_16x16( + src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, 16); + } + // handle remainder j + int nrem = N - j; + if (nrem > 0) { + transpose_mxn_16x16( + src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, nrem); + } + } + // handle remainder i + int mrem = M - i; + if (mrem > 0) { + int j = 0; + for (; j < N / 16 * 16; j += 16) { + transpose_mxn_16x16( + src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, 16); + } + // handle remainder j + int nrem = N - j; + transpose_mxn_16x16( + src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, nrem); + } +} + +template < + typename T, + int M, + int N, + typename std::enable_if_t, int> = 0> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float8.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float8.h new file mode 100644 index 0000000000000000000000000000000000000000..b0aa8e3a05cd29529145415da9ba08f356e24d7e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float8.h @@ -0,0 +1,666 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#if (defined(CPU_CAPABILITY_AVX512)) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +static inline void cvtfp8e4m3_fp32(const __m128i& a, __m512& o) { + // Zero Extend + __m512i x = _mm512_cvtepu8_epi32(a); + __m512i val = _mm512_and_epi32( + _mm512_slli_epi32(x, 24), _mm512_set1_epi32(0x7FFFFFFF)); // nonsign_val + __m512i mant = + _mm512_and_si512(x, _mm512_set1_epi32(0x07)); // mantissa = x & 0x07 + __m512i exp = _mm512_and_si512( + _mm512_srli_epi32(x, 3), + _mm512_set1_epi32(0x0F)); // exp = (x >> 3) & 0x0F + __m512i sign = + _mm512_and_si512(x, _mm512_set1_epi32(0x80)); // sign = x & 0x80 + __m512i _zeros = _mm512_setzero_si512(); + + // --- Step 1: Calculate the renorm_shift + __m512i renorm_shift = _zeros; + // Denorm case (exp == 0 && mant != 0) --- + __mmask16 denormal_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) & + _mm512_cmpneq_epi32_mask(mant, _zeros); + if (denormal_mask) { + // An alternative solution is as what scalar did in + // pytorch/c10/util/Float8_e4m3fn.h To count the num of leading zeros, since + // here we know the unsigned denorm 
value has zero sign and exp which is 5 + // leading zeros, we need to count the leading zero of mant (3bit) which may + // done through table lookup for example: const uint8_t lz_table[8] = {3, 2, + // 1, 1, 0, 0, 0, 0}; num_leading_zero = lz_table[mant] + 5; + + __m512i _ones = _mm512_set1_epi32(1); + __m512i _twos = _mm512_set1_epi32(2); + __m512i _threes = _mm512_set1_epi32(3); + + // Default leading zero number for denorm value is 1 = 5 - 4 + __m512i denorm_renorm_shift = _ones; + // For mant 001, leading zero number is 3 = 7 -4 + __mmask16 leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _ones); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _threes); + // For mant 010 and 011, leading zero number is 2 = 6 -4 + leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _twos); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos); + leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _threes); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos); + + renorm_shift = + _mm512_mask_mov_epi32(renorm_shift, denormal_mask, denorm_renorm_shift); + } + + // --- Step 2: calculate norm and denorm --- + __m512i norm_shifted = + _mm512_srli_epi32(_mm512_sllv_epi32(val, renorm_shift), 4); + // exponent bias adjustment: (0x78 - renorm_shift) << 23 + __m512i exp_bias = _mm512_slli_epi32( + _mm512_sub_epi32(_mm512_set1_epi32(0x78), renorm_shift), 23); + val = _mm512_add_epi32(norm_shifted, exp_bias); + + // --- Step 3: Nan case (exp == 0xF && mant == 0x07) --- + __mmask16 nan_mask = _mm512_cmpeq_epi32_mask(exp, _mm512_set1_epi32(0xF)) & + _mm512_cmpeq_epi32_mask(mant, _mm512_set1_epi32(0x07)); + if (nan_mask) { + const __m512i nan_values = _mm512_set1_epi32(0x7FC00000); + val = _mm512_mask_mov_epi32(val, nan_mask, nan_values); + } + + // --- Step 4: Zero case (exp == 0x00 && mant == 0x00) --- + __mmask16 zero_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) & + 
_mm512_cmpeq_epi32_mask(mant, _zeros); + if (zero_mask) { + val = _mm512_mask_mov_epi32(val, zero_mask, _zeros); + } + + // --- Step 5: OR with sign (sign bit << 24 to get to bit 31) --- + val = _mm512_or_si512(val, _mm512_slli_epi32(sign, 24)); + + o = _mm512_castsi512_ps(val); +} + +static inline __m128i cvtfp32_fp8e4m3(const __m512& src) { + // cvt 16x32 from fp32 to fp8 e4m3 + const __m512i sign_mask = _mm512_set1_epi32(0x80000000); + const __m512i fp8_max = _mm512_set1_epi32(UINT32_C(1087) << 20); + const __m512i denorm_thresh = _mm512_set1_epi32(UINT32_C(121) << 23); + const __m512i denorm_mask = _mm512_set1_epi32(UINT32_C(141) << 23); + const __m512i bias_part1 = _mm512_set1_epi32((uint32_t)(7 - 127) << 23); + const __m512i rounding_bias = _mm512_set1_epi32(0x7FFFF); + __m512i f_bits = _mm512_castps_si512(src); + // Extract and save sign + __m512i sign = _mm512_and_epi32(f_bits, sign_mask); + f_bits = _mm512_xor_epi32(f_bits, sign); + + // Prepare result containers + __m512i result = _mm512_setzero_si512(); + + // Step 1: Handle case of overflow + // (f_bits >= fp8_max): set result = 0x7f + __mmask16 overflow_mask = _mm512_cmpge_epu32_mask(f_bits, fp8_max); + if (overflow_mask) { + result = _mm512_mask_set1_epi32(result, overflow_mask, 0x7f); + } + + // Step 2: Handle small numbers (denormals) + // Small numbers (f_bits < denorm_thresh) + __mmask16 denorm_thresh_mask = _mm512_cmplt_epu32_mask(f_bits, denorm_thresh); + + if (denorm_thresh_mask) { + __m512 small_input = _mm512_castsi512_ps(f_bits); + __m512 small_denorm = + _mm512_add_ps(small_input, _mm512_castsi512_ps(denorm_mask)); + __m512i small_denorm_bits = _mm512_castps_si512(small_denorm); + __m512i small_result = _mm512_sub_epi32(small_denorm_bits, denorm_mask); + result = _mm512_mask_mov_epi32(result, denorm_thresh_mask, small_result); + } + + // Step 3: Handle normal numbers + __mmask16 normal_mask = ~(overflow_mask | denorm_thresh_mask); + + if (normal_mask) { + // mant_odd = (f_bits >> 20) & 1 + 
__m512i mant_odd = + _mm512_and_epi32(_mm512_srli_epi32(f_bits, 20), _mm512_set1_epi32(1)); + // f_bits += bias_part1 + rounding_bias + __m512i rounded = _mm512_add_epi32(f_bits, bias_part1); + rounded = _mm512_add_epi32(rounded, rounding_bias); + // Add mant_odd + rounded = _mm512_add_epi32(rounded, mant_odd); + // Shift right by 20 bits + __m512i normal_result = _mm512_srli_epi32(rounded, 20); + result = _mm512_mask_mov_epi32(result, normal_mask, normal_result); + } + + // Merge back the sign + __m512i sign_shifted = _mm512_srli_epi32(sign, 24); + result = _mm512_or_epi32(result, sign_shifted); + + // Now result is 16 x 32-bit integers, but we only need 8-bit for each + __m512i packed = _mm512_and_si512(result, _mm512_set1_epi32(0xFF)); + + // Narrow 32-bit integers to 8-bit + return _mm512_cvtepi32_epi8(packed); +} + +static inline float fp8e4m3_to_fp32_scalar(uint8_t val) { + __m512i v = _mm512_set1_epi8(val); + __m128i v_128 = _mm512_castsi512_si128(v); + __m512 o; + cvtfp8e4m3_fp32(v_128, o); + return _mm512_cvtss_f32(o); +} + +static inline uint8_t fp32_to_fp8e4m3_scalar(float val) { + __m512 v = _mm512_set1_ps(val); + __m128i o = cvtfp32_fp8e4m3(v); + return static_cast(_mm_cvtsi128_si32(o)); +} + +static inline void cvtfp8e5m2_fp32(const __m128i& a, __m512& o) { + __m256i a_256 = _mm256_castsi128_si256(a); + __m512i a_512 = _mm512_cvtepu8_epi16(a_256); + a_512 = _mm512_slli_epi16(a_512, 8); + a_256 = _mm512_castsi512_si256(a_512); + cvtfp16_fp32(a_256, o); +} + +static inline __m128i cvtfp32_fp8e5m2(const __m512& src) { + constexpr uint32_t fp32_inf = UINT32_C(255) << 23; + constexpr uint32_t fp8_max = UINT32_C(143) << 23; + constexpr uint32_t denorm_mask = UINT32_C(134) << 23; + + // Cvt to bits + __m512i input_bits = _mm512_castps_si512(src); + __m512i result = _mm512_setzero_si512(); + + // Get the sign + __m512i sign = _mm512_and_si512(input_bits, _mm512_set1_epi32(0x80000000)); + + // Get the unsigned input + input_bits = _mm512_xor_si512(input_bits, 
sign); + + // Calculate the mask for inf, nan and denorm + __mmask16 greater_than_fp8_max = + _mm512_cmpge_epi32_mask(input_bits, _mm512_set1_epi32(fp8_max)); + __mmask16 greater_than_fp32_inf = + _mm512_cmpgt_epi32_mask(input_bits, _mm512_set1_epi32(fp32_inf)); + __mmask16 less_than_normal = _mm512_cmpgt_epi32_mask( + _mm512_set1_epi32((UINT32_C(113) << 23)), input_bits); + __m512i temp_bits_for_denorm = _mm512_setzero_si512(); + if (less_than_normal) { + __m512i denorm_mask_512i = _mm512_set1_epi32(denorm_mask); + temp_bits_for_denorm = _mm512_castps_si512(_mm512_add_ps( + _mm512_castsi512_ps(input_bits), + _mm512_castsi512_ps(denorm_mask_512i))); + temp_bits_for_denorm = + _mm512_sub_epi32(temp_bits_for_denorm, denorm_mask_512i); + } + + // Step 1: Norm Val + __m512i mant_odd_mask = + _mm512_and_epi32(_mm512_srli_epi32(input_bits, 21), _mm512_set1_epi32(1)); + input_bits = _mm512_add_epi32( + input_bits, _mm512_set1_epi32(((uint32_t)(15 - 127) << 23) + 0xFFFFF)); + input_bits = _mm512_add_epi32(input_bits, mant_odd_mask); + result = _mm512_srli_epi32(input_bits, 21); + + // Step 2: INF and NAN + if (greater_than_fp8_max) { + result = _mm512_mask_mov_epi32( + result, greater_than_fp8_max, _mm512_set1_epi8(0x7C)); + if (greater_than_fp32_inf) { + result = _mm512_mask_mov_epi32( + result, greater_than_fp32_inf, _mm512_set1_epi8(0x7F)); + } + } + + // Step 3: Denorm val + if (less_than_normal) { + result = + _mm512_mask_mov_epi32(result, less_than_normal, temp_bits_for_denorm); + } + + // Step 4: restore sign + result = _mm512_or_si512(result, _mm512_srli_epi32(sign, 24)); + + return _mm512_cvtepi32_epi8(result); +} + +static inline float fp8e5m2_to_fp32_scalar(uint8_t val) { + __m512i v = _mm512_set1_epi8(val); + __m128i v_128 = _mm512_castsi512_si128(v); + __m512 o; + cvtfp8e5m2_fp32(v_128, o); + return _mm512_cvtss_f32(o); +} + +static inline uint8_t fp32_to_fp8e5m2_scalar(float val) { + __m512 v = _mm512_set1_ps(val); + __m128i o = cvtfp32_fp8e5m2(v); + return 
static_cast(_mm_cvtsi128_si32(o)); +} + +template +class Vectorizedf8 { + static_assert( + std::integral_constant < bool, + std::is_same_v || std::is_same_v < T, + at::Float8_e5m2 >> ::value, + "Support only float8 e4m3."); + + private: + __m512i values; + template + Vectorized inline binary_compare(const VectorizedType& b, Op op) const { + __m512 a0, a1, a2, a3; + __m512 b0, b1, b2, b3; + __m512 o0, o1, o2, o3; + if constexpr (std::is_same_v) { + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 0), a0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 1), a1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 2), a2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 3), a3); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3); + } else { + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 0), a0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 1), a1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 2), a2); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 3), a3); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3); + } + + o0 = op(a0, b0); + o1 = op(a1, b1); + o2 = op(a2, b2); + o3 = op(a3, b3); + __m128i o128_0, o128_1, o128_2, o128_3; + if constexpr (std::is_same_v) { + o128_0 = cvtfp32_fp8e4m3(o0); + o128_1 = cvtfp32_fp8e4m3(o1); + o128_2 = cvtfp32_fp8e4m3(o2); + o128_3 = cvtfp32_fp8e4m3(o3); + } else { + o128_0 = cvtfp32_fp8e5m2(o0); + o128_1 = cvtfp32_fp8e5m2(o1); + o128_2 = cvtfp32_fp8e5m2(o2); + o128_3 = cvtfp32_fp8e5m2(o3); + } + + __m512i result = _mm512_setzero_si512(); + result = _mm512_inserti32x4(result, o128_0, 0); + result = 
_mm512_inserti32x4(result, o128_1, 1); + result = _mm512_inserti32x4(result, o128_2, 2); + result = _mm512_inserti32x4(result, o128_3, 3); + + return result; + } + + public: + using value_type = uint8_t; + using size_type = int; + static constexpr size_type size() { + return 64; + } + Vectorizedf8() {} + Vectorizedf8(__m512i v) : values(v) {} + Vectorizedf8(T val) { + value_type uw = val.x; + values = _mm512_set1_epi8(uw); + } + operator __m512i() const { + return values; + } + T& operator[](int idx) = delete; + const T& operator[](int idx) const = delete; + static Vectorized loadu(const void* ptr, int16_t count = size()) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else if (count == 16) { + // Fast path if only load element number of 16 + __m128i input_128 = + _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm512_castsi128_si512(input_128); + } else { + __mmask64 mask = (1ULL << count) - 1; + return _mm512_maskz_loadu_epi8(mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + if (count == 16) { + // Fast path if only store element number of 16 + _mm_storeu_si128( + reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); + } else { + __mmask64 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi8(ptr, mask, values); + } + } + } + + Vectorized abs() const { + return _mm512_andnot_si512(_mm512_set1_epi8(0x80), values); + } + + Vectorized inline operator==(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator!=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = 
_mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator>(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator>=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator<(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator<=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } +}; + +template <> +class Vectorized : public Vectorizedf8 { + public: + using Vectorizedf8::Vectorizedf8; + + using value_type = Float8_e4m3fn; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template < + typename T, + typename Op, + std::enable_if_t< + std::is_same_v || + std::is_same_v, + int> = 0> +static inline Vectorized 
binary_fp8_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { + __m512 a0, a1, a2, a3; + __m512 b0, b1, b2, b3; + __m512 o0, o1, o2, o3; + if constexpr (std::is_same_v) { + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 0), a0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 0), b0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 1), a1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 1), b1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 2), a2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 2), b2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 3), a3); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 3), b3); + } else { + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 0), a0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 0), b0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 1), a1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 1), b1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 2), a2); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 2), b2); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 3), a3); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 3), b3); + } + o0 = op(a0, b0); + o1 = op(a1, b1); + o2 = op(a2, b2); + o3 = op(a3, b3); + + __m128i o128_0, o128_1, o128_2, o128_3; + if constexpr (std::is_same_v) { + o128_0 = cvtfp32_fp8e4m3(o0); + o128_1 = cvtfp32_fp8e4m3(o1); + o128_2 = cvtfp32_fp8e4m3(o2); + o128_3 = cvtfp32_fp8e4m3(o3); + } else { + o128_0 = cvtfp32_fp8e5m2(o0); + o128_1 = cvtfp32_fp8e5m2(o1); + o128_2 = cvtfp32_fp8e5m2(o2); + o128_3 = cvtfp32_fp8e5m2(o3); + } + + __m512i result = _mm512_setzero_si512(); + result = _mm512_inserti32x4(result, o128_0, 0); + result = _mm512_inserti32x4(result, o128_1, 1); + result = _mm512_inserti32x4(result, o128_2, 2); + result = _mm512_inserti32x4(result, o128_3, 3); + + return result; +} + +// Refer to +// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +, +// -, *, /, planned to be deleted in the future and here is just to make +// compiler happy +Vectorized inline 
operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} + +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} + +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} + +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} + +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +class Vectorized : public Vectorizedf8 { + public: + using Vectorizedf8::Vectorizedf8; + + using value_type = Float8_e5m2; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized 
le(const Vectorized& other) const; +}; + +// Refer to +// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +, +// -, *, /, planned to be deleted in the future and here is just to make +// compiler happy +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} + +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} + +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} + +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} + +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h new file mode 100644 index 0000000000000000000000000000000000000000..2044a199105a3dfe76e9fda09acc68251510651b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h @@ -0,0 +1,2126 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#ifdef CPU_CAPABILITY_AVX512 + +struct Vectorizedi { + protected: + __m512i values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static inline __m512i invert(const __m512i& v) { + const auto ones = _mm512_set1_epi64(-1); + return _mm512_xor_si512(ones, v); + } + + public: + Vectorizedi() {} + Vectorizedi(__m512i v) : values(v) {} + operator __m512i() const { + return values; + } +}; + +#else + +struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined + +#endif // CPU_CAPABILITY_AVX512 + +#ifdef CPU_CAPABILITY_AVX512 + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: + using value_type = int64_t; + using size_type = int; + static constexpr size_type size() { + return 8; + } + using Vectorizedi::Vectorizedi; + Vectorized() { + values = _mm512_setzero_si512(); + } + Vectorized(int64_t v) { + values = _mm512_set1_epi64(v); + } + Vectorized( + int64_t val1, + int64_t val2, + int64_t val3, + int64_t val4, + int64_t val5, + int64_t val6, + 
int64_t val7, + int64_t val8) { + values = _mm512_setr_epi64(val1, val2, val3, val4, val5, val6, val7, val8); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi64(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto msb_one = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mask_ = _mm512_cmp_epi64_mask(mask, msb_one, _MM_CMPINT_EQ); + return _mm512_mask_blend_epi64(mask_, a.values, b.values); + } + template + static Vectorized arange( + int64_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } + static Vectorized loadu(const void* ptr, int64_t count) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else { + __mmask8 mask = (1ULL << count) - 1; + auto ones = _mm512_set1_epi64(1); + return _mm512_mask_loadu_epi64(ones, mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. 
See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + __mmask8 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi64(ptr, mask, values); + } + } + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + Vectorized abs() const { + auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values); + auto is_larger = + _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF); + auto inverse = _mm512_xor_si512(values, is_larger); + return _mm512_sub_epi64(inverse, is_larger); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_epi64(0); + } + Vectorized conj() const { + return *this; + } + Vectorized neg() const; + Vectorized operator==(const Vectorized& other) const { + auto mask = _mm512_cmpeq_epi64_mask(values, other.values); + return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); + } + Vectorized operator!=(const Vectorized& other) const { + auto mask = _mm512_cmpneq_epi64_mask(values, other.values); + return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); + } + Vectorized operator<(const Vectorized& other) const { + auto mask = _mm512_cmplt_epi64_mask(values, other.values); + return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); + } + Vectorized operator<=(const Vectorized& other) const { + auto mask = _mm512_cmple_epi64_mask(values, other.values); + return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); + } + Vectorized operator>(const Vectorized& other) const { + auto mask = _mm512_cmpgt_epi64_mask(values, other.values); + return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); + } + 
Vectorized operator>=(const Vectorized& other) const { + auto mask = _mm512_cmpge_epi64_mask(values, other.values); + return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; +template <> +class Vectorized : public Vectorizedi { + private: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static const Vectorized ones; + + public: + using value_type = int32_t; + static constexpr int size() { + return 16; + } + using Vectorizedi::Vectorizedi; + Vectorized() {} + Vectorized(int32_t v) { + values = _mm512_set1_epi32(v); + } + Vectorized( + int32_t val1, + int32_t val2, + int32_t val3, + int32_t val4, + int32_t val5, + int32_t val6, + int32_t val7, + int32_t val8, + int32_t val9, + int32_t val10, + int32_t val11, + int32_t val12, + int32_t val13, + int32_t val14, + int32_t val15, + int32_t val16) { + values = _mm512_setr_epi32( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi32(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto msb_one = _mm512_set1_epi32(0xFFFFFFFF); + auto mask_ = _mm512_cmp_epi32_mask(mask, msb_one, _MM_CMPINT_EQ); + return _mm512_mask_blend_epi32(mask_, a.values, b.values); + } + template + static Vectorized arange( + int32_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * 
step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int32_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } + static Vectorized loadu(const void* ptr, int32_t count) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else { + __mmask16 mask = (1ULL << count) - 1; + auto ones = _mm512_set1_epi32(1); + return _mm512_mask_loadu_epi32(ones, mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. 
See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + __mmask16 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi32(ptr, mask, values); + } + } + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; + Vectorized abs() const { + return _mm512_abs_epi32(values); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_epi32(0); + } + Vectorized conj() const { + return *this; + } + Vectorized neg() const; + int32_t reduce_add() const { + return _mm512_reduce_add_epi32(values); + } + int32_t reduce_max() const { + return _mm512_reduce_max_epi32(values); + } + Vectorized operator==(const Vectorized& other) const { + auto mask = _mm512_cmpeq_epi32_mask(values, other.values); + return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); + } + Vectorized operator!=(const Vectorized& other) const { + auto mask = _mm512_cmpneq_epi32_mask(values, other.values); + return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); + } + Vectorized operator<(const Vectorized& other) const { + auto mask = _mm512_cmplt_epi32_mask(values, other.values); + return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); + } + Vectorized operator<=(const Vectorized& other) const { + auto mask = _mm512_cmple_epi32_mask(values, other.values); + return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); + } + Vectorized operator>(const Vectorized& other) const { + auto mask = _mm512_cmpgt_epi32_mask(values, other.values); + return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); + } + Vectorized operator>=(const Vectorized& other) const { + auto mask = _mm512_cmpge_epi32_mask(values, other.values); 
+ return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); + } + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; + // int32_t and float have same size +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_vec = + _mm512_loadu_si512(reinterpret_cast(src + i)); + auto output_vec = _mm512_cvtepi32_ps(input_vec); + _mm512_storeu_ps(reinterpret_cast(dst + i), output_vec); + } +#ifndef _MSC_VER +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int32_t* src, double* dst, int64_t n) { + int64_t i; + // int32_t has half the size of double +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_256_vec = + _mm256_loadu_si256(reinterpret_cast(src + i)); + auto output_vec = _mm512_cvtepi32_pd(input_256_vec); + _mm512_storeu_pd(reinterpret_cast(dst + i), output_vec); + } +#ifndef _MSC_VER +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: + using value_type = int16_t; + static constexpr int size() { + return 32; + } + using Vectorizedi::Vectorizedi; + Vectorized() {} + Vectorized(int16_t v) { + values = _mm512_set1_epi16(v); + } + Vectorized( + int16_t val1, + int16_t val2, + int16_t val3, + int16_t val4, + int16_t val5, + int16_t val6, + int16_t 
val7, + int16_t val8, + int16_t val9, + int16_t val10, + int16_t val11, + int16_t val12, + int16_t val13, + int16_t val14, + int16_t val15, + int16_t val16, + int16_t val17, + int16_t val18, + int16_t val19, + int16_t val20, + int16_t val21, + int16_t val22, + int16_t val23, + int16_t val24, + int16_t val25, + int16_t val26, + int16_t val27, + int16_t val28, + int16_t val29, + int16_t val30, + int16_t val31, + int16_t val32) { + values = _mm512_set_epi16( + val32, + val31, + val30, + val29, + val28, + val27, + val26, + val25, + val24, + val23, + val22, + val21, + val20, + val19, + val18, + val17, + val16, + val15, + val14, + val13, + val12, + val11, + val10, + val9, + val8, + val7, + val6, + val5, + val4, + val3, + val2, + val1); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi16(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto msb_one = _mm512_set1_epi16(0xFFFF); + auto mask_ = _mm512_cmp_epi16_mask(mask, msb_one, _MM_CMPINT_EQ); + return _mm512_mask_blend_epi16(mask_, a.values, b.values); + } + template + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int16_t count = size()) { + switch (count) { + 
case 0: + return a; + case 1: + return blend<0x1>(a, b); + case 2: + return blend<0x3>(a, b); + case 3: + return blend<0x7>(a, b); + case 4: + return blend<0xF>(a, b); + case 5: + return blend<0x1F>(a, b); + case 6: + return blend<0x3F>(a, b); + case 7: + return blend<0x7F>(a, b); + case 8: + return blend<0xFF>(a, b); + case 9: + return blend<0x1FF>(a, b); + case 10: + return blend<0x3FF>(a, b); + case 11: + return blend<0x7FF>(a, b); + case 12: + return blend<0xFFF>(a, b); + case 13: + return blend<0x1FFF>(a, b); + case 14: + return blend<0x3FFF>(a, b); + case 15: + return blend<0x7FFF>(a, b); + case 16: + return blend<0xFFFF>(a, b); + case 17: + return blend<0x1FFFF>(a, b); + case 18: + return blend<0x3FFFF>(a, b); + case 19: + return blend<0x7FFFF>(a, b); + case 20: + return blend<0xFFFFF>(a, b); + case 21: + return blend<0x1FFFFF>(a, b); + case 22: + return blend<0x3FFFFF>(a, b); + case 23: + return blend<0x7FFFFF>(a, b); + case 24: + return blend<0xFFFFFF>(a, b); + case 25: + return blend<0x1FFFFFF>(a, b); + case 26: + return blend<0x3FFFFFF>(a, b); + case 27: + return blend<0x7FFFFFF>(a, b); + case 28: + return blend<0xFFFFFFF>(a, b); + case 29: + return blend<0x1FFFFFFF>(a, b); + case 30: + return blend<0x3FFFFFFF>(a, b); + case 31: + return blend<0x7FFFFFFF>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } + static Vectorized loadu(const void* ptr, int16_t count) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else { + __mmask32 mask = (1ULL << count) - 1; + auto ones = _mm512_set1_epi16(1); + return _mm512_mask_loadu_epi16(ones, mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. 
See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + __mmask32 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi16(ptr, mask, values); + } + } + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; + Vectorized abs() const { + return _mm512_abs_epi16(values); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_epi16(0); + } + Vectorized conj() const { + return *this; + } + Vectorized neg() const; + Vectorized operator==(const Vectorized& other) const { + auto mask = _mm512_cmpeq_epi16_mask(values, other.values); + return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF); + } + Vectorized operator!=(const Vectorized& other) const { + auto mask = _mm512_cmpneq_epi16_mask(values, other.values); + return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF); + } + Vectorized operator<(const Vectorized& other) const { + auto mask = _mm512_cmplt_epi16_mask(values, other.values); + return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF); + } + Vectorized operator<=(const Vectorized& other) const { + auto mask = _mm512_cmple_epi16_mask(values, other.values); + return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF); + } + Vectorized operator>(const Vectorized& other) const { + auto mask = _mm512_cmpgt_epi16_mask(values, other.values); + return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF); + } + Vectorized operator>=(const Vectorized& other) const { + auto mask = _mm512_cmpge_epi16_mask(values, other.values); + return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized 
gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template +class Vectorized8 : public Vectorizedi { + static_assert( + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + + protected: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static const Vectorized ones; + + public: + using value_type = T; + static constexpr int size() { + return 64; + } + using Vectorizedi::Vectorizedi; + Vectorized8() {} + Vectorized8(T v) { + values = _mm512_set1_epi8(v); + } + Vectorized8( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32, + T val33, + T val34, + T val35, + T val36, + T val37, + T val38, + T val39, + T val40, + T val41, + T val42, + T val43, + T val44, + T val45, + T val46, + T val47, + T val48, + T val49, + T val50, + T val51, + T val52, + T val53, + T val54, + T val55, + T val56, + T val57, + T val58, + T val59, + T val60, + T val61, + T val62, + T val63, + T val64) { + values = _mm512_set_epi8( + val64, + val63, + val62, + val61, + val60, + val59, + val58, + val57, + val56, + val55, + val54, + val53, + val52, + val51, + val50, + val49, + val48, + val47, + val46, + val45, + val44, + val43, + val42, + val41, + val40, + val39, + val38, + val37, + val36, + val35, + val34, + val33, + val32, + val31, + val30, + val29, + val28, + val27, + val26, + val25, + val24, + val23, + val22, + val21, + val20, + val19, + val18, + val17, + val16, + val15, + val14, + val13, + val12, + val11, + val10, + val9, + val8, + val7, + val6, + val5, + val4, + val3, + val2, + val1); + } + template + static Vectorized blend(Vectorized a, 
Vectorized b) { + return _mm512_mask_blend_epi8(mask, a.values, b.values); + } + template + static Vectorized arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step, + base + 32 * step, + base + 33 * step, + base + 34 * step, + base + 35 * step, + base + 36 * step, + base + 37 * step, + base + 38 * step, + base + 39 * step, + base + 40 * step, + base + 41 * step, + base + 42 * step, + base + 43 * step, + base + 44 * step, + base + 45 * step, + base + 46 * step, + base + 47 * step, + base + 48 * step, + base + 49 * step, + base + 50 * step, + base + 51 * step, + base + 52 * step, + base + 53 * step, + base + 54 * step, + base + 55 * step, + base + 56 * step, + base + 57 * step, + base + 58 * step, + base + 59 * step, + base + 60 * step, + base + 61 * step, + base + 62 * step, + base + 63 * step); + } + static Vectorized set(Vectorized a, Vectorized b, T count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<0x1>(a, b); + case 2: + return blend<0x3>(a, b); + case 3: + return blend<0x7>(a, b); + case 4: + return blend<0xF>(a, b); + case 5: + return blend<0x1F>(a, b); + case 6: + return blend<0x3F>(a, b); + case 7: + return blend<0x7F>(a, b); + case 8: + return blend<0xFF>(a, b); + case 9: + return blend<0x1FF>(a, b); + case 10: + return blend<0x3FF>(a, b); + case 11: + return blend<0x7FF>(a, b); + case 12: + return 
blend<0xFFF>(a, b); + case 13: + return blend<0x1FFF>(a, b); + case 14: + return blend<0x3FFF>(a, b); + case 15: + return blend<0x7FFF>(a, b); + case 16: + return blend<0xFFFF>(a, b); + case 17: + return blend<0x1FFFF>(a, b); + case 18: + return blend<0x3FFFF>(a, b); + case 19: + return blend<0x7FFFF>(a, b); + case 20: + return blend<0xFFFFF>(a, b); + case 21: + return blend<0x1FFFFF>(a, b); + case 22: + return blend<0x3FFFFF>(a, b); + case 23: + return blend<0x7FFFFF>(a, b); + case 24: + return blend<0xFFFFFF>(a, b); + case 25: + return blend<0x1FFFFFF>(a, b); + case 26: + return blend<0x3FFFFFF>(a, b); + case 27: + return blend<0x7FFFFFF>(a, b); + case 28: + return blend<0xFFFFFFF>(a, b); + case 29: + return blend<0x1FFFFFFF>(a, b); + case 30: + return blend<0x3FFFFFFF>(a, b); + case 31: + return blend<0x7FFFFFFF>(a, b); + case 32: + return blend<0xFFFFFFFF>(a, b); + case 33: + return blend<0x1FFFFFFFF>(a, b); + case 34: + return blend<0x3FFFFFFFF>(a, b); + case 35: + return blend<0x7FFFFFFFF>(a, b); + case 36: + return blend<0xFFFFFFFFF>(a, b); + case 37: + return blend<0x1FFFFFFFFF>(a, b); + case 38: + return blend<0x3FFFFFFFFF>(a, b); + case 39: + return blend<0x7FFFFFFFFF>(a, b); + case 40: + return blend<0xFFFFFFFFFF>(a, b); + case 41: + return blend<0x1FFFFFFFFFF>(a, b); + case 42: + return blend<0x3FFFFFFFFFF>(a, b); + case 43: + return blend<0x7FFFFFFFFFF>(a, b); + case 44: + return blend<0xFFFFFFFFFFF>(a, b); + case 45: + return blend<0x1FFFFFFFFFFF>(a, b); + case 46: + return blend<0x3FFFFFFFFFFF>(a, b); + case 47: + return blend<0x7FFFFFFFFFFF>(a, b); + case 48: + return blend<0xFFFFFFFFFFFF>(a, b); + case 49: + return blend<0x1FFFFFFFFFFFF>(a, b); + case 50: + return blend<0x3FFFFFFFFFFFF>(a, b); + case 51: + return blend<0x7FFFFFFFFFFFF>(a, b); + case 52: + return blend<0xFFFFFFFFFFFFF>(a, b); + case 53: + return blend<0x1FFFFFFFFFFFFF>(a, b); + case 54: + return blend<0x3FFFFFFFFFFFFF>(a, b); + case 55: + return blend<0x7FFFFFFFFFFFFF>(a, b); + case 
56: + return blend<0xFFFFFFFFFFFFFF>(a, b); + case 57: + return blend<0x1FFFFFFFFFFFFFF>(a, b); + case 58: + return blend<0x3FFFFFFFFFFFFFF>(a, b); + case 59: + return blend<0x7FFFFFFFFFFFFFF>(a, b); + case 60: + return blend<0xFFFFFFFFFFFFFFF>(a, b); + case 61: + return blend<0x1FFFFFFFFFFFFFFF>(a, b); + case 62: + return blend<0x3FFFFFFFFFFFFFFF>(a, b); + case 63: + return blend<0x7FFFFFFFFFFFFFFF>(a, b); + } + return b; + } + static Vectorized loadu(const void* ptr) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } + static Vectorized loadu_one_fourth(const void* ptr) { + // Fast path if only load element number of 16. + // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), + // Because loadu(const void* ptr, T count) requires zero initialization for + // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 + // bits of the result are undefined. + // TODO We can use _mm512_zextsi128_si512 in the future, + // since gcc 9.3 doesn't support it now. + __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm512_castsi128_si512(input_128); + } + static Vectorized loadu(const void* ptr, T count) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else if (count == 16) { + // Fast path if only load element number of 16 + return loadu_one_fourth(ptr); + } else { + __mmask64 mask = (1ULL << count) - 1; + auto ones = _mm512_set1_epi8(1); + return _mm512_mask_loadu_epi8(ones, mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + // ptr need not to be aligned here. 
See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + if (count == 16) { + // Fast path if only store element number of 16 + _mm_storeu_si128( + reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); + } else { + __mmask64 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi8(ptr, mask, values); + } + } + } + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return _mm512_set1_epi8(0); + } + Vectorized conj() const { + return *this; + } +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto msb_one = _mm512_set1_epi8(0xFF); + auto mask_ = _mm512_cmp_epi8_mask(mask, msb_one, _MM_CMPINT_EQ); + return _mm512_mask_blend_epi8(mask_, a.values, b.values); + } + + Vectorized neg() const; + + Vectorized abs() const { + return _mm512_abs_epi8(values); + } + + Vectorized operator==(const Vectorized& other) const { + auto mask = _mm512_cmpeq_epi8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator!=(const Vectorized& other) const { + auto mask = _mm512_cmpneq_epi8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator<(const Vectorized& other) const { + auto mask = _mm512_cmplt_epi8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator<=(const Vectorized& other) 
const { + auto mask = _mm512_cmple_epi8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator>(const Vectorized& other) const { + return other < *this; + } + Vectorized operator>=(const Vectorized& other) const { + return other <= *this; + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto msb_one = _mm512_set1_epi8(0xFF); + auto mask_ = _mm512_cmp_epu8_mask(mask, msb_one, _MM_CMPINT_EQ); + return _mm512_mask_blend_epi8(mask_, a.values, b.values); + } + + Vectorized neg() const; + + Vectorized abs() const { + return *this; + } + + Vectorized operator==(const Vectorized& other) const { + auto mask = _mm512_cmpeq_epu8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator!=(const Vectorized& other) const { + auto mask = _mm512_cmpneq_epu8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator<(const Vectorized& other) const { + auto mask = _mm512_cmplt_epu8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator<=(const Vectorized& other) const { + auto mask = _mm512_cmple_epu8_mask(values, other.values); + return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF); + } + Vectorized operator>(const Vectorized& other) const { + return other < *this; + } + Vectorized operator>=(const Vectorized& other) const { + return other <= *this; + 
} + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_epi64(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_epi32(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_epi16(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_epi8(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_epi8(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_epi64(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_epi32(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_epi16(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_epi8(a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sub_epi8(a, b); +} + +// Negation. 
Defined here so we can utilize operator- +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +inline Vectorized Vectorized::neg() const { + return Vectorized(0) - *this; +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mullo_epi64(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mullo_epi32(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mullo_epi16(a, b); +} + +template +Vectorized inline int_elementwise_binary_512( + const Vectorized& a, + const Vectorized& b, + Op op) { + T values_a[Vectorized::size()]; + T values_b[Vectorized::size()]; + a.store(values_a); + b.store(values_b); + for (int i = 0; i != Vectorized::size(); i++) { + values_a[i] = op(values_a[i], values_b[i]); + } + return Vectorized::loadu(values_a); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + // We don't have an instruction for multiplying int8_t +#ifndef CPU_CAPABILITY_AVX512 + return int_elementwise_binary_512(a, b, std::multiplies()); +#else + __m512i mask00FF = _mm512_set1_epi16(0x00FF); + __m512i a_lo = _mm512_srai_epi16(_mm512_slli_epi16(a, 8), 8); + __m512i b_lo = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8); + __m512i a_hi = _mm512_srai_epi16(a, 8); + __m512i b_hi = _mm512_srai_epi16(b, 8); + __m512i res_lo = _mm512_and_si512(_mm512_mullo_epi16(a_lo, b_lo), mask00FF); + __m512i res_hi = _mm512_slli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8); + __m512i res = _mm512_or_si512(res_hi, res_lo); + return res; +#endif +} + +template <> +Vectorized inline operator*( + const 
Vectorized& a, + const Vectorized& b) { + // We don't have an instruction for multiplying uint8_t +#ifndef CPU_CAPABILITY_AVX512 + return int_elementwise_binary_512(a, b, std::multiplies()); +#else + __m512i mask00FF = _mm512_set1_epi16(0x00FF); + __m512i a_lo = _mm512_and_si512(a, mask00FF); + __m512i b_lo = _mm512_and_si512(b, mask00FF); + __m512i a_hi = _mm512_srli_epi16(a, 8); + __m512i b_hi = _mm512_srli_epi16(b, 8); + __m512i res_lo = _mm512_and_si512(_mm512_mullo_epi16(a_lo, b_lo), mask00FF); + __m512i res_hi = _mm512_slli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8); + __m512i res = _mm512_or_si512(res_hi, res_lo); + return res; +#endif +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_min_epi64(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_min_epi32(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_min_epi16(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_min_epi8(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_min_epu8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_max_epi64(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_max_epi32(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_max_epi16(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_max_epi8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return _mm512_max_epu8(a, b); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const 
Vectorized& min_val, + const Vectorized& max_val) { + return _mm512_min_epi64(max_val, _mm512_max_epi64(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm512_min_epi32(max_val, _mm512_max_epi32(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm512_min_epi16(max_val, _mm512_max_epi16(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm512_min_epi8(max_val, _mm512_max_epi8(a, min_val)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { + return _mm512_min_epu8(max_val, _mm512_max_epu8(a, min_val)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm512_min_epi64(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm512_min_epi32(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm512_min_epi16(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm512_min_epi8(max_val, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { + return _mm512_min_epu8(max_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm512_max_epi64(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm512_max_epi32(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return 
_mm512_max_epi16(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm512_max_epi8(min_val, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm512_max_epu8(min_val, a); +} + +template +std::enable_if_t< + !(std::is_same_v || std::is_same_v), + Vectorized< + int32_t>> inline convert_to_int32(const T* ptr, int count = Vectorized::size()) { + return Vectorized::loadu(ptr, count); +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const int8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm512_cvtepi8_epi32( + _mm_loadu_si128(reinterpret_cast(ptr))); + } else { + auto a = Vectorized::loadu(ptr, count); + return _mm512_cvtepi8_epi32(_mm512_castsi512_si128(a)); + } +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const uint8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast(ptr))); + } else { + auto a = Vectorized::loadu(ptr, count); + return _mm512_cvtepu8_epi32(_mm512_castsi512_si128(a)); + } +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, 
std::divides()); +} + +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return _mm512_and_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return _mm512_or_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return _mm512_xor_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator~(const Vectorized& a) { + return _mm512_xor_si512(a, _mm512_set1_epi32(-1)); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + 
+inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + 
+inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template < + bool left_shift, + typename T, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + int> = 0> +Vectorized inline shift_512_8( + const Vectorized& a, + const Vectorized& b) { + // No vector instruction for shifting int8_t/uint8_t, so emulating + // it instead. + + // Control masks for shuffle operation, treating 512 bits as an + // array of 8-bit elements, and considering pairs of neighboring + // elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and + // M!=N) is set so that shuffle will move element with index M from + // input pair into element with index N in output pair, and element + // with index M in output pair will be set to all 0s. 
+ __m512i ctl_0_1 = _mm512_set_epi8( + 62, + 0x80, + 60, + 0x80, + 58, + 0x80, + 56, + 0x80, + 54, + 0x80, + 52, + 0x80, + 50, + 0x80, + 48, + 0x80, + 46, + 0x80, + 44, + 0x80, + 42, + 0x80, + 40, + 0x80, + 38, + 0x80, + 36, + 0x80, + 34, + 0x80, + 32, + 0x80, + 30, + 0x80, + 28, + 0x80, + 26, + 0x80, + 24, + 0x80, + 22, + 0x80, + 20, + 0x80, + 18, + 0x80, + 16, + 0x80, + 14, + 0x80, + 12, + 0x80, + 10, + 0x80, + 8, + 0x80, + 6, + 0x80, + 4, + 0x80, + 2, + 0x80, + 0, + 0x80); + __m512i ctl_1_0 = _mm512_set_epi8( + 0x80, + 63, + 0x80, + 61, + 0x80, + 59, + 0x80, + 57, + 0x80, + 55, + 0x80, + 53, + 0x80, + 51, + 0x80, + 49, + 0x80, + 47, + 0x80, + 45, + 0x80, + 43, + 0x80, + 41, + 0x80, + 39, + 0x80, + 37, + 0x80, + 35, + 0x80, + 33, + 0x80, + 31, + 0x80, + 29, + 0x80, + 27, + 0x80, + 25, + 0x80, + 23, + 0x80, + 21, + 0x80, + 19, + 0x80, + 17, + 0x80, + 15, + 0x80, + 13, + 0x80, + 11, + 0x80, + 9, + 0x80, + 7, + 0x80, + 5, + 0x80, + 3, + 0x80, + 1); + + // Masks for bitwise and operation, treating 512 bits as an array of + // 8-bit elements, and considering them in pairs of neighboring + // elements. A mask named "keep_M" (M in [0,1]) is set so that + // bitwise and will copy element with index M from input pair into + // element with the same index in output pair, while the other + // element in output pair will be set to all 0s. + __m512i keep_0 = _mm512_set1_epi16(0xFF); + __m512i keep_1 = _mm512_set1_epi16(0xFF00); + + // Take each 8-bit element with idx%2==0 from input array to be + // shifted and extend it to 16 bits so that 0s are added to the + // right. Then, perform shifting on this 16-bit number. Upper 8 + // bits will be proper result of shifting original 8-bit number, so + // write them to result array, into the same position from which + // corresponding input element is taken. Also, make sure that + // result array elements with idx%2!=0 are set to all 0s. 
+ // + // Note that number of bits to shift for is extended to 16 bits by + // adding 0s to the left. That means this number is not properly + // sign-extended for negative values. However, number of bits to + // shift is treated as an unsigned integer by respective shift + // intrinsics anyway so if negative then either with or without + // proper sign extension, it will be interpreted as a number greater + // than 32, and the shifting result will be the same. + __m512i a0 = _mm512_shuffle_epi8(a, ctl_0_1); + __m512i b0 = _mm512_and_si512(b, keep_0); + __m512i c0; + if (left_shift) + c0 = _mm512_sllv_epi16(a0, b0); + else if constexpr (std::is_same_v) + c0 = _mm512_srav_epi16(a0, b0); + else + c0 = _mm512_srlv_epi16(a0, b0); + c0 = _mm512_shuffle_epi8(c0, ctl_1_0); + + // Perform shifting the same way for input array elements with + // idx%2==1. + __m512i a1 = _mm512_and_si512(a, keep_1); + __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); + __m512i c1; + if (left_shift) + c1 = _mm512_sllv_epi16(a1, b1); + else if constexpr (std::is_same_v) + c1 = _mm512_srav_epi16(a1, b1); + else + c1 = _mm512_srlv_epi16(a1, b1); + c1 = _mm512_and_si512(c1, keep_1); + + // Merge partial results into the final result. 
+ __m512i c = _mm512_or_si512(c0, c1); + + return c; +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sllv_epi64(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sllv_epi32(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return _mm512_sllv_epi16(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return shift_512_8(a, b); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + return shift_512_8(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return _mm512_srav_epi64(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return _mm512_srav_epi32(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return _mm512_srav_epi16(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return shift_512_8(a, b); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + return shift_512_8(a, b); +} + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_mask.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_mask.h new file mode 100644 index 0000000000000000000000000000000000000000..5ad0997df7d03d19214f50c9fa81b8d1f03ab02c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_mask.h @@ -0,0 +1,395 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +template +struct VecMaskLoad< + T, + dst_n, + mask_t, + mask_n, + typename std::enable_if_t< + (mask_n == dst_n * 2 && dst_n >= 1) && + (std::is_same_v || std::is_same_v), + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + at::vec::Vectorized zero_vec(0); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + VectorizedN tmp_vec; + VectorizedN result; + for (int i = 0; i < dst_n; i++) { + tmp_vec[0] = vec_mask[2 * i]; + tmp_vec[1] = vec_mask[2 * i + 1]; + auto int64_mask = VecMask(tmp_vec).template cast(); + auto int_mask = int64_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + if constexpr (std::is_same_v) { + result[i] = Vectorized(_mm512_mask_loadu_ps( + zero_vec, mmask, ptr + i * Vectorized::size())); + } else { + result[i] = Vectorized(_mm512_mask_loadu_epi32( + zero_vec, mmask, ptr + i * Vectorized::size())); + } + } + return result; + } +}; + +template +struct VecMaskLoad< + T, + dst_n, + mask_t, + dst_n, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + at::vec::Vectorized zero_vec(0); + 
auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < dst_n; i++) { + auto tmp_mask = VecMask(vec_mask[i]); + auto int_mask = tmp_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + if constexpr (std::is_same_v) { + result[i] = Vectorized(_mm512_mask_loadu_ps( + zero_vec, mmask, ptr + i * Vectorized::size())); + } else { + result[i] = Vectorized(_mm512_mask_loadu_epi32( + zero_vec, mmask, ptr + i * Vectorized::size())); + } + } + return result; + } +}; + +template +struct VecMaskLoad< + data_t, + dst_n, + mask_t, + dst_n, + std::enable_if_t< + std::is_same_v || std::is_same_v>> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < dst_n; i++) { + auto tmp_mask = VecMask(vec_mask[i]); + auto int_mask = tmp_mask.template cast(); + auto mmask0 = _mm512_cmp_epi32_mask(int_mask[0], all_ones, _MM_CMPINT_EQ); + auto mmask1 = _mm512_cmp_epi32_mask(int_mask[1], all_ones, _MM_CMPINT_EQ); + auto zero = _mm256_set1_epi16(0); + auto temp0 = _mm256_mask_loadu_epi16( + zero, mmask0, ptr + (2 * i) * Vectorized::size()); + auto temp1 = _mm256_mask_loadu_epi16( + zero, mmask1, ptr + (2 * i + 1) * Vectorized::size()); + result[i] = Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(temp0), temp1, 1)); + } + return result; + } +}; + +template +struct VecMaskLoad< + data_t, + dst_n, + mask_t, + mask_n, + typename std::enable_if_t< + (mask_n == 2 * dst_n && dst_n >= 1) && + (std::is_same_v || std::is_same_v)>> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + VectorizedN result; + VectorizedN tmp_vec; + for (int i = 0; i < dst_n; i++) { + tmp_vec[0] = vec_mask[2 * i]; + tmp_vec[1] = vec_mask[2 * i + 1]; + 
auto int_mask = VecMask(tmp_vec).template cast(); + auto mmask0 = _mm512_cmp_epi32_mask(int_mask[0], all_ones, _MM_CMPINT_EQ); + auto mmask1 = _mm512_cmp_epi32_mask(int_mask[1], all_ones, _MM_CMPINT_EQ); + auto zero = _mm256_set1_epi16(0); + auto temp0 = _mm256_mask_loadu_epi16( + zero, mmask0, ptr + (2 * i) * Vectorized::size()); + auto temp1 = _mm256_mask_loadu_epi16( + zero, mmask1, ptr + (2 * i + 1) * Vectorized::size()); + result[i] = Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(temp0), temp1, 1)); + } + return result; + } +}; + +template +struct VecMaskLoad< + data_t, + 1, + mask_t, + 1, + std::enable_if_t< + std::is_same_v || std::is_same_v>> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + auto zero = _mm_set1_epi8(0); + auto temp = _mm_mask_loadu_epi8(zero, mmask, ptr); + return Vectorized( + _mm512_inserti64x2(_mm512_set1_epi32(0), temp, 0)); + } +}; + +template +struct VecMaskLoad< + data_t, + 2, + mask_t, + 1, + std::enable_if_t< + std::is_same_v || std::is_same_v>> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + at::vec::Vectorized zero_vec(0); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + at::vec::VectorizedN result; + if constexpr (std::is_same_v) { + result[0] = _mm512_mask_loadu_pd(zero_vec, (__mmask8)mmask, ptr); + result[1] = + _mm512_mask_loadu_pd(zero_vec, (__mmask8)(mmask >> 8), ptr + 8); + } else { + result[0] = _mm512_mask_loadu_epi64(zero_vec, (__mmask8)mmask, ptr); + result[1] = + _mm512_mask_loadu_epi64(zero_vec, (__mmask8)(mmask >> 8), ptr + 8); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& 
vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm512_castsi512_ps(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm512_castps_si512(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm512_castpd_si512(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result[i] = _mm512_castsi512_pd(vec_mask[i]); + } + return result; + } +}; + +template +struct VecMaskCast< + int64_t, + dst_n, + mask_t, + mask_n, + typename std::enable_if_t< + (dst_n == 2 * mask_n) && + (std::is_same_v || std::is_same_v), + void>> { + static inline VecMask apply( + const VecMask& vec_mask) { + VectorizedN result; + auto int_mask = vec_mask.template cast(); +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < mask_n; ++i) { + auto int64_vec = + convert(VectorizedN(int_mask[i])); + result[2 * i] = int64_vec[0]; + result[2 * i + 1] = int64_vec[1]; + } + return VecMask(result); + } +}; + +template +struct VecMaskCast< + dst_t, + dst_n, + int64_t, + mask_n, + typename std::enable_if_t< + (mask_n == 2 * dst_n) && + (std::is_same_v || std::is_same_v), + void>> { + static inline VecMask apply( + const VecMask& vec_mask) { + VectorizedN result; + VectorizedN int64_vec; + for (int i = 0; i < dst_n; ++i) { + int64_vec[0] = vec_mask[2 * i]; + int64_vec[1] = vec_mask[2 * i + 1]; + result[i] = convert(int64_vec); + } + return VecMask(result).template cast(); + } 
+}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int64_mask = VecMaskCast::apply(vec_mask); + return VecMaskCast::apply(int64_mask); + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int64_mask = VecMaskCast::apply(vec_mask); + return VecMaskCast::apply(int64_mask); + } +}; + +template <> +inline bool VecMask::all_zero() const { + __mmask16 mask = _mm512_test_epi32_mask(mask_[0], mask_[0]); + return mask == 0; +} + +template <> +inline bool VecMask::is_masked(int i) const { + return _mm512_movepi32_mask(mask_[0]) & (1 << i); +} + +template <> +inline bool VecMask::all_masked() const { + __mmask16 mask = _mm512_movepi32_mask(mask_[0]); + return mask == 0xffff; +} + +template +struct VecMaskCheck { + static inline bool all_zero(const VectorizedN& vec_mask) { + bool all_zero = true; + for (int i = 0; i < N; ++i) { + all_zero = + all_zero && (_mm512_test_epi64_mask(vec_mask[i], vec_mask[i]) == 0); + if (!all_zero) { + return all_zero; + } + } + return all_zero; + } + + static inline bool is_masked(const VectorizedN& vec_mask, int i) { + for (int j = 0; j < N; ++j) { + if (i < (j + 1) * 8) { + return _mm512_movepi64_mask(vec_mask[j]) & (1 << (i - j * 8)); + } + } + return false; + } + + static inline bool all_masked(const VectorizedN& vec_mask) { + bool all_masked = true; + for (int i = 0; i < N; ++i) { + all_masked = all_masked && (_mm512_movepi64_mask(vec_mask[i]) == 0xff); + if (!all_masked) { + return all_masked; + } + } + return all_masked; + } +}; + +#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \ + T, N, return_type, method, args_def, args) \ + template <> \ + inline return_type VecMask::method args_def const { \ + return cast().method args; \ + } + +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), 
(i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ()) + +#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h new file mode 100644 index 0000000000000000000000000000000000000000..270b96bac433b52d68329bf0a452381d0c8170a3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h @@ -0,0 +1,1552 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// This file defines Vectorized<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vectorized, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vectorized -> 4x Vectorized +// Vectorized -> 4x Vectorized +// Vectorized -> 1x Vectorized +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. 
+// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. + +namespace at { +namespace vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) + +#ifdef _MSC_VER +__declspec(align(64)) struct Vectorizedqi { + protected: + __m512i vals; +#else +struct Vectorizedqi { + protected: + __m512i vals __attribute__((aligned(64))); +#endif + + public: + Vectorizedqi() { + vals = _mm512_setzero_si512(); + } + Vectorizedqi(__m512i v) : vals(v) {} + operator __m512i() const { + return vals; + } +}; + +template +__m512i pack_saturate_and_clamp( + __m512i first, + __m512i second, + T min_val, + T max_val); + +template <> +inline __m512i pack_saturate_and_clamp( + __m512i first [[maybe_unused]], + __m512i second [[maybe_unused]], + int32_t min_val [[maybe_unused]], + int32_t max_val [[maybe_unused]]) { + // This function is for linkage only, will not be used + TORCH_CHECK(false, "pack_saturate_and_clamp is not supported"); + return __m512i{}; +} + +template <> +inline __m512i pack_saturate_and_clamp( + __m512i first, + __m512i second, + int8_t min_val, + int8_t max_val) { + __m512i packed_and_sat = _mm512_packs_epi16(first, second); + return _mm512_max_epi8( + _mm512_set1_epi8(min_val), + _mm512_min_epi8(packed_and_sat, _mm512_set1_epi8(max_val))); +} + +template <> +inline __m512i pack_saturate_and_clamp( + __m512i first, + __m512i second, + uint8_t min_val, + uint8_t max_val) { + __m512i packed_and_sat = _mm512_packus_epi16(first, second); + return _mm512_max_epu8( + _mm512_set1_epi8(min_val), + _mm512_min_epu8(packed_and_sat, _mm512_set1_epi8(max_val))); +} + +template +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(at::vec::Vectorized src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle 
first 16*8 bits + __m128i input_128 = _mm512_castsi512_si128(src); + // Convert from 16*uint8/int8 to 16*int32 + __m512i input_512_extended; + if constexpr (std::is_same_v) + input_512_extended = _mm512_cvtepu8_epi32(input_128); + else + input_512_extended = _mm512_cvtepi8_epi32(input_128); + // Convert from 16*int32 to 16*float32 + return _mm512_cvtepi32_ps(input_512_extended); +} + +template +at::vec::Vectorized inline convert_float_to_int8( + at::vec::Vectorized src); + +template <> +at::vec::Vectorized inline convert_float_to_int8( + at::vec::Vectorized src) { + // Convert from float32 to int32 with truncation + __m512i x_values_int32 = _mm512_cvttps_epi32(src); + + // Convert from int32 to int16 using signed saturation + __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32); + + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + // Convert from int16 to int8 using unsigned saturation + __m512i xyzw_clamped_v = pack_saturate_and_clamp( + xy_packed_v, xy_packed_v, min_val, max_val); + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); + return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); +} + +template <> +at::vec::Vectorized inline convert_float_to_int8( + at::vec::Vectorized src) { + // The type of *_val should be int32_t to ensure correct clamping behavior. 
+ constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + __m512 float32_min_val = _mm512_set1_ps(float(min_val)); + __m512 float32_max_val = _mm512_set1_ps(float(max_val)); + __m512 float32_src = _mm512_max_ps(src, float32_min_val); + float32_src = _mm512_min_ps(float32_src, float32_max_val); + __m512i int32_src_clamped = _mm512_cvttps_epi32(float32_src); + __m128i int8_src = _mm512_cvtepi32_epi8(int32_src_clamped); + return _mm512_castsi128_si512(int8_src); +} + +template +__FORCE_INLINE void QuantizeAvx512( + const float* src, + T* dst, + int len, + float inverse_scale, + int64_t zero_point) { + constexpr int VLEN = 16; + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + const __m512i min_v = _mm512_set1_epi32(min_val); + const __m512i max_v = _mm512_set1_epi32(max_val); + // This is the largest int32 value < int32_max exactly representable in float + constexpr int32_t int32_float_max_val = + std::numeric_limits::max() - 127; + int i = 0; + __m512 inverse_scale_v = _mm512_set1_ps(inverse_scale); + // clang-format off + static const __m512i shuffle_mask_v = _mm512_set_epi8( + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x08, 0x04, 0x00, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x08, 0x04, 0x00, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x08, 0x04, 0x00, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x08, 0x04, 0x00); + // clang-format on + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); + __m512i permute_mask_l8_v = _mm512_set_epi32( + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x0c, + 0x08, + 0x04, + 0x00); + 
int len_aligned = len / (VLEN * 4) * (VLEN * 4); + for (; i < len_aligned; i += 4 * VLEN) { + // x + __m512 x_vals = _mm512_load_ps(src + i); + __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v); + // If the floating point value is greater than int32_max, + // _mm512_cvtps_epi32 converts them to -ve. Clip at int32_float_max_val to + // Clip at int32_float_max_val to avoid this. + x_transformed_v = + _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val)); + // y + __m512 y_vals = _mm512_load_ps(src + i + VLEN); + __m512 y_transformed_v = _mm512_mul_ps(y_vals, inverse_scale_v); + y_transformed_v = + _mm512_min_ps(y_transformed_v, _mm512_set1_ps(int32_float_max_val)); + // z + __m512 z_vals = _mm512_load_ps(src + i + 2 * VLEN); + __m512 z_transformed_v = _mm512_mul_ps(z_vals, inverse_scale_v); + z_transformed_v = + _mm512_min_ps(z_transformed_v, _mm512_set1_ps(int32_float_max_val)); + // w + __m512 w_vals = _mm512_load_ps(src + i + 3 * VLEN); + __m512 w_transformed_v = _mm512_mul_ps(w_vals, inverse_scale_v); + w_transformed_v = + _mm512_min_ps(w_transformed_v, _mm512_set1_ps(int32_float_max_val)); + + __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v); + __m512i y_rounded_v = _mm512_cvtps_epi32(y_transformed_v); + __m512i z_rounded_v = _mm512_cvtps_epi32(z_transformed_v); + __m512i w_rounded_v = _mm512_cvtps_epi32(w_transformed_v); + + // add zero point + x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point)); + y_rounded_v = _mm512_add_epi32(y_rounded_v, _mm512_set1_epi32(zero_point)); + z_rounded_v = _mm512_add_epi32(z_rounded_v, _mm512_set1_epi32(zero_point)); + w_rounded_v = _mm512_add_epi32(w_rounded_v, _mm512_set1_epi32(zero_point)); + + __m512i xy_packed_v = _mm512_packs_epi32(x_rounded_v, y_rounded_v); + __m512i zw_packed_v = _mm512_packs_epi32(z_rounded_v, w_rounded_v); + __m512i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val); + + xyzw_clamped_v = 
_mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v); + } + + // Additional 8-lane AVX512 version to take advantage when len is smaller + // based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM) + for (; i < len / VLEN * VLEN; i += VLEN) { + __m512 x_vals = _mm512_load_ps(src + i); + __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v); + x_transformed_v = + _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val)); + __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v); + x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point)); + __m512i x_clipped_v = + _mm512_max_epi32(min_v, _mm512_min_epi32(max_v, x_rounded_v)); + + x_clipped_v = _mm512_shuffle_epi8(x_clipped_v, shuffle_mask_v); + x_clipped_v = _mm512_permutexvar_epi32(permute_mask_l8_v, x_clipped_v); + _mm_storeu_si128( + reinterpret_cast<__m128i*>(dst + i), + _mm512_castsi512_si128(x_clipped_v)); + } + + for (; i < len; ++i) { + float transformed = src[i] * inverse_scale; + + // Not exactly the same behavior as the vectorized code. + // The vectorized code above always rounds to even in halfway cases + // (https://software.intel.com/en-us/node/523819), but std::nearbyint + // does the same only when the current rounding mode is FE_TONEAREST. + // However, in practice, this should not be a problem because most cases + // use the default rounding mode FE_TONEAREST. + // Note that we cannot implement the same behavior as the vectorized code + // using std::round because it does rounding away from zero in halfway + // cases. 
+ transformed = zero_point + std::nearbyint(transformed); + float clipped = + std::min(std::max(transformed, float(min_val)), float(max_val)); + dst[i] = clipped; + } +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + using size_type = int; + static constexpr size_type size() { + return 16; + } + + static constexpr int float_num_vecs() { + return 1; + } + + static constexpr int int_num_vecs() { + return 1; + } + + using float_vec_return_type = std::array, 1>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint32& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi32(uw); + } + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + __m512 float_vals = _mm512_cvtepi32_ps(vals); + return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m512 float_vals = _mm512_cvtepi32_ps(vals); + return {(Vectorized(float_vals) - zero_point) * scale}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale [[maybe_unused]]) { + Vectorized retval; + auto rhs_data = (__m512)rhs[0]; + at::native::quantize_vec( + scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16); + return retval; + } + + Vectorized maximum(Vectorized b) const { + return _mm512_max_epi32(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epi32(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epi32( + _mm512_max_epi32(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {_mm512_sub_epi32(vals, b)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + + __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v); + __m512i rounded = _mm512_cvtps_epi32(scaled); + return _mm512_add_epi32(rounded, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = 
_mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mullo_epi32(a, b); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return _mm512_add_epi32(a, b); +} + +/* + * Convert values from int32 back to int8/uint8 + */ +template +__m512i RequantizeAvx512( + const std::array, 4>& inp, + __m512 multiplier, + __m512i zp) { + static_assert( + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); + __m512 x_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier); + __m512 y_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[1]), multiplier); + __m512 z_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[2]), multiplier); + __m512 w_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[3]), multiplier); + + __m512i x_rounded_v = _mm512_cvtps_epi32(x_scaled_v); + __m512i y_rounded_v = _mm512_cvtps_epi32(y_scaled_v); + __m512i z_rounded_v = _mm512_cvtps_epi32(z_scaled_v); + __m512i w_rounded_v = _mm512_cvtps_epi32(w_scaled_v); + + /* Add zero point */ + __m512i x_v = _mm512_add_epi32(x_rounded_v, zp); + __m512i y_v = _mm512_add_epi32(y_rounded_v, zp); + __m512i z_v = _mm512_add_epi32(z_rounded_v, zp); + __m512i w_v = _mm512_add_epi32(w_rounded_v, zp); + + /* Pack to int16_t and saturate */ + __m512i xy_packed_v = _mm512_packs_epi32(x_v, y_v); + __m512i zw_packed_v = _mm512_packs_epi32(z_v, w_v); + + __m512i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val); + + /* + * 
xyzw_clamped_v has results in the following layout so we need to + * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 + * x12-15 y12-15 z12-15 w12-15 + */ + xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); + return xyzw_clamped_v; +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int size() { + return 64; + } + + static constexpr int float_num_vecs() { + return 4; + } + + static constexpr int int_num_vecs() { + return 4; + } + + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = c10::qint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + + Vectorized() {} + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint8& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi8(uw); + } + + // This is needed because the compiler emits awful code for the default + // constructor for moving the enum + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + + // This is added to avoid error: definition of implicit copy assignment + // operator for 'Vectorized' is deprecated because it has a + // user-declared copy constructor [-Werror,-Wdeprecated-copy] + Vectorized& operator=(const Vectorized&) = default; + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. 
We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + private: + __m512i cvtepi8_epi32(__m128i epi8_vals) const { + return _mm512_cvtepi8_epi32(epi8_vals); + } + + public: + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_neg_zp_premul) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); + __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2)); + __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3)); + + auto val0 = + vec::fmadd(scale, Vectorized(float_val0), scale_neg_zp_premul); + auto val1 = + vec::fmadd(scale, Vectorized(float_val1), scale_neg_zp_premul); + auto val2 = + vec::fmadd(scale, Vectorized(float_val2), scale_neg_zp_premul); + auto val3 = + vec::fmadd(scale, Vectorized(float_val3), scale_neg_zp_premul); + return {val0, val1, val2, val3}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], 
vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); + __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2)); + __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3)); + + auto val0 = (Vectorized(float_val0) - zero_point) * scale; + auto val1 = (Vectorized(float_val1) - zero_point) * scale; + auto val2 = (Vectorized(float_val2) - zero_point) * scale; + auto val3 = (Vectorized(float_val3) - zero_point) * scale; + return {val0, val1, val2, val3}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + auto* rhs_data = (float*)rhs.data(); + int8_t quantized_values[64]; + QuantizeAvx512( + rhs_data, quantized_values, 64, inverse_scale, zero_point); + return Vectorized::loadu(quantized_values); + } + + Vectorized maximum(Vectorized b) const { + return _mm512_max_epi8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epi8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epi8(_mm512_max_epi8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = 
_mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512i int32_val0 = cvtepi8_epi32(int_val0); + __m512i int32_val1 = cvtepi8_epi32(int_val1); + __m512i int32_val2 = cvtepi8_epi32(int_val2); + __m512i int32_val3 = cvtepi8_epi32(int_val3); + +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +#else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +#endif + + __m512i int32_b0 = cvtepi8_epi32(int_b0); + __m512i int32_b1 = cvtepi8_epi32(int_b1); + __m512i int32_b2 = cvtepi8_epi32(int_b2); + __m512i int32_b3 = cvtepi8_epi32(int_b3); + + __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0); + __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1); + __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2); + __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3); + + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + return 
RequantizeAvx512(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int size() { + return 64; + } + + static constexpr int float_num_vecs() { + return 4; + } + + static constexpr int int_num_vecs() { + return 4; + } + + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = c10::quint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::quint8& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi8(uw); + } + + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + + // This is added to avoid error: definition of implicit copy assignment + // operator for 'Vectorized' is deprecated because it has a + // user-declared copy constructor [-Werror,-Wdeprecated-copy] + Vectorized& operator=(const Vectorized&) = default; + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + private: + __m512i cvtepu8_epi32(__m128i epu8_vals) const { + return _mm512_cvtepu8_epi32(epu8_vals); + } + + public: + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); + __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2)); + __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3)); + + auto val0 = + vec::fmadd(scale, Vectorized(float_val0), scale_zp_premul); + auto val1 = + vec::fmadd(scale, Vectorized(float_val1), scale_zp_premul); + auto val2 = + vec::fmadd(scale, Vectorized(float_val2), scale_zp_premul); + auto val3 = + vec::fmadd(scale, Vectorized(float_val3), scale_zp_premul); + + return {val0, val1, val2, val3}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i 
int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); + __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2)); + __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3)); + + auto val0 = (Vectorized(float_val0) - zero_point) * scale; + auto val1 = (Vectorized(float_val1) - zero_point) * scale; + auto val2 = (Vectorized(float_val2) - zero_point) * scale; + auto val3 = (Vectorized(float_val3) - zero_point) * scale; + + return {val0, val1, val2, val3}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + auto* rhs_data = (float*)rhs.data(); + uint8_t quantized_values[64]; + QuantizeAvx512( + rhs_data, quantized_values, 64, inverse_scale, zero_point); + return Vectorized::loadu(quantized_values); + } + + Vectorized maximum(Vectorized b) const { + return _mm512_max_epu8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epu8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epu8(_mm512_max_epu8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], 
vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512i int32_val0 = cvtepu8_epi32(int_val0); + __m512i int32_val1 = cvtepu8_epi32(int_val1); + __m512i int32_val2 = cvtepu8_epi32(int_val2); + __m512i int32_val3 = cvtepu8_epi32(int_val3); + +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +#else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +#endif + + __m512i int32_b0 = cvtepu8_epi32(int_b0); + __m512i int32_b1 = cvtepu8_epi32(int_b1); + __m512i int32_b2 = cvtepu8_epi32(int_b2); + __m512i int32_b3 = cvtepu8_epi32(int_b3); + + __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0); + __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1); + __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2); + __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3); + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + return RequantizeAvx512(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + 
+template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +#else + +// NOTE: These are low-performance implementations that we fall back on. + +template < + typename T, + typename float_vec_return_type_, + typename int_vec_return_type_, + int size_> +struct VectorizedQuantizedConverter { + static constexpr int size() { + return size_; + } + + static constexpr int float_num_vecs() { + return size() / 8; + } + + static constexpr int int_num_vecs() { + return size() / 8; + } + + using float_vec_return_type = float_vec_return_type_; + using int_vec_return_type = int_vec_return_type_; + + using value_type = typename T::underlying; + std::array vals; + + VectorizedQuantizedConverter(T val) { + for (const auto i : c10::irange(size())) { + vals[i] = val.val_; + } + } + + VectorizedQuantizedConverter(const void* ptr) { + memcpy(vals.data(), ptr, sizeof(value_type) * size()); + } + + void store(void* ptr, int count = size()) const { + memcpy(ptr, vals.data(), count * sizeof(value_type)); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul [[maybe_unused]]) const { + float_vec_return_type rv; + for (const auto i : c10::irange(float_num_vecs())) { + float tmp_vals[16]; + for (const auto j : c10::irange(16)) { + tmp_vals[j] = at::native::dequantize_val( + scale[j], zero_point[j], T(vals[16 * i + j])); + } + rv[i] = Vectorized( + tmp_vals[0], + tmp_vals[1], + tmp_vals[2], + tmp_vals[3], + tmp_vals[4], + tmp_vals[5], + tmp_vals[6], + tmp_vals[7], + tmp_vals[8], + tmp_vals[9], + tmp_vals[10], + tmp_vals[11], + tmp_vals[12], + tmp_vals[13], + tmp_vals[14], + tmp_vals[15]); + } + return rv; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + Vectorized scale_zp_premul; + return dequantize(scale, zero_point, scale_zp_premul); + } + + protected: + VectorizedQuantizedConverter() {} +}; + +template <> +struct 
is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + 16> { + Vectorized() + : VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + 16>() {} + Vectorized(c10::qint32 val) + : VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + 16>(val) {} + Vectorized(const void* ptr) + : VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + 16>(ptr) {} + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale [[maybe_unused]]) { + std::array qvals; + std::array float_vals; + + for (const auto i : c10::irange(float_num_vecs())) { + rhs[i].store(&float_vals[i * 16], 16); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::qint32*)qvals.data(), + 16 * float_num_vecs()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + for (const auto i : c10::irange(size())) { + retval[0].vals[i] = vals[i] - b.vals[i]; + } + return retval; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = + std::nearbyint(static_cast(inp[0].vals[i]) * multiplier) + + zero_point; + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + 
+template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + Vectorized retval; + for (const auto i : c10::irange(std::decay_t::size())) { + retval.vals[i] = a.vals[i] * b.vals[i]; + } + return retval; +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + Vectorized retval; + for (const auto i : c10::irange(std::decay_t::size())) { + retval.vals[i] = a.vals[i] + b.vals[i]; + } + return retval; +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 64> { + Vectorized() + : VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 64>() {} + Vectorized(c10::qint8 val) + : VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 64>(val) {} + Vectorized(const void* ptr) + : VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 64>(ptr) {} + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale [[maybe_unused]]) { + std::array qvals; + std::array float_vals; + + for (const auto i : c10::irange(float_num_vecs())) { + rhs[i].store(&float_vals[i * 16], 16); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::qint8*)qvals.data(), + 16 * float_num_vecs()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + constexpr int elem_per_int_vec = size() / int_num_vecs(); + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + retval[i].vals[j] = + static_cast(vals[i * elem_per_int_vec + j]) - + static_cast(b.vals[i * elem_per_int_vec + j]); + } + } + return retval; + } + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + constexpr int elem_per_int_vec = size() / int_num_vecs(); + constexpr auto min_val = 
std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + Vectorized retval; + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + int32_t rounded = + std::nearbyint(static_cast(inp[i].vals[j]) * multiplier) + + zero_point; + retval.vals[i * elem_per_int_vec + j] = + std::min(std::max(rounded, min_val), max_val); + } + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 64> { + Vectorized() + : VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 64>() {} + Vectorized(c10::quint8 val) + : VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 64>(val) {} + Vectorized(const void* ptr) + : VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 64>(ptr) {} + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale [[maybe_unused]]) { + std::array qvals; + std::array float_vals; + + for (const auto i : c10::irange(float_num_vecs())) { + rhs[i].store(&float_vals[i * 16], 16); + } + + at::native::quantize_vec( + scale, + zero_point, + float_vals.data(), + (c10::quint8*)qvals.data(), + 16 * float_num_vecs()); + + return Vectorized::loadu(qvals.data()); + } + + Vectorized maximum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::max(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized minimum(Vectorized b) const { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min(vals[i], b.vals[i]); + } + return retval; + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + Vectorized retval; + for (const auto i : c10::irange(size())) { + retval.vals[i] = std::min( + std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); + } + return retval; + } + + int_vec_return_type widening_subtract(Vectorized b) const { + int_vec_return_type retval; + constexpr int elem_per_int_vec = size() / int_num_vecs(); + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + retval[i].vals[j] = + static_cast(vals[i * elem_per_int_vec + j]) - + static_cast(b.vals[i * elem_per_int_vec + j]); + } + } + return retval; + } + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + constexpr int elem_per_int_vec = size() / int_num_vecs(); + constexpr auto min_val = 
std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + Vectorized retval; + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { + int32_t rounded = + std::nearbyint(static_cast(inp[i].vals[j]) * multiplier) + + zero_point; + retval.vals[i * elem_per_int_vec + j] = + std::min(std::max(rounded, min_val), max_val); + } + } + return retval; + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +#endif // defined(CPU_CAPABILITY_AVX512) && !defined(MSVC) + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..05f78c9cfff2acaf1c35bd66684a429f83f7c6ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h @@ -0,0 +1,76 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include + +// TODO: No need to have this whole header, we can just put it all in +// the cpp file + +namespace at::cuda::detail { + +// Set the callback to initialize Magma, which is set by +// torch_cuda_cu. This indirection is required so magma_init is called +// in the same library where Magma will be used. 
+TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); + + +// The real implementation of CUDAHooksInterface +struct CUDAHooks : public at::CUDAHooksInterface { + CUDAHooks(at::CUDAHooksArgs /*unused*/) {} + void init() const override; + Device getDeviceFromPtr(void* data) const override; + bool isPinnedPtr(const void* data) const override; + const Generator& getDefaultGenerator( + DeviceIndex device_index = -1) const override; + Generator getNewGenerator( + DeviceIndex device_index = -1) const override; + bool hasCUDA() const override; + bool hasMAGMA() const override; + bool hasCuDNN() const override; + bool hasCuSOLVER() const override; + bool hasCuBLASLt() const override; + bool hasROCM() const override; + bool hasCKSDPA() const override; + bool hasCKGEMM() const override; + const at::cuda::NVRTC& nvrtc() const override; + DeviceIndex current_device() const override; + bool isBuilt() const override {return true;} + bool isAvailable() const override {return hasCUDA();} + bool hasPrimaryContext(DeviceIndex device_index) const override; + Allocator* getCUDADeviceAllocator() const override; + Allocator* getPinnedMemoryAllocator() const override; + bool compiledWithCuDNN() const override; + bool compiledWithMIOpen() const override; + bool supportsDilatedConvolutionWithCuDNN() const override; + bool supportsDepthwiseConvolutionWithCuDNN() const override; + bool supportsBFloat16ConvolutionWithCuDNNv8() const override; + bool supportsBFloat16RNNWithCuDNN() const override; + bool hasCUDART() const override; + long versionCUDART() const override; + long versionCuDNN() const override; + long versionRuntimeCuDNN() const override; + long versionCuDNNFrontend() const override; + long versionMIOpen() const override; + std::string showConfig() const override; + double batchnormMinEpsilonCuDNN() const override; + int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override; + void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) 
const override; + int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override; + void cuFFTClearPlanCache(DeviceIndex device_index) const override; + int getNumGPUs() const override; + DeviceIndex deviceCount() const override; + DeviceIndex getCurrentDevice() const override; + +#ifdef USE_ROCM + bool isGPUArch(const std::vector& archs, DeviceIndex device_index = -1) const override; +#endif + void deviceSynchronize(DeviceIndex device_index) const override; +}; + +} // at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h new file mode 100644 index 0000000000000000000000000000000000000000..0c5e22a6f2642c79fbbbd37495cd2195fe262738 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h @@ -0,0 +1,156 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states. +// These handles are tied to device, and these libraries requires/recommends not to +// share handles across host threads. +// +// These libraries recommend using one handle per host thread. We may not want to do +// this because threads are relatively light-weight, but creating and destroying +// handles is expensive (destroying the handle causes synchronizations). DataParallel, +// for example, creates new threads for each forward pass. +// +// This file implements a handle pool mechanism. The handle pool returns handles on +// demand as threads request them. If all existing handles in the pool are in use, +// it creates a new one. 
As threads terminate, they release handles back into the pool. +// In this way, the handle pool never creates more handles than the high-water mark of +// active threads, so it's efficient with DataParallel. + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at::cuda { namespace { + +template +struct DeviceThreadHandlePool : public std::enable_shared_from_this> { + + struct Handle { + Handle_t handle; + Handle(bool create = false) : handle(nullptr) + { + if(create) Create(&handle); + } + // std::vector.emplace() and push_back() may route through temporaries and call + // copy/move constructors along the way. If this is the case, we don't want + // the destructors of temporaries to call cudnnDestroy on the handle. + // We can achieve safety (for the narrow case of stashing within std::vectors) + // by making Handle moveable but not copyable, and transferring handle ownership + // to the latest constructed object. This is not a substitute for full-blown + // reference counting, but reference counting may be overkill here. + // Another alternative is to wrap the saved Handles in unique_ptrs, i.e., + // unordered_map>> created_handles; + Handle(const Handle& rhs) = delete; + // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom + Handle(Handle&& rhs) noexcept : Handle() { std::swap(handle, rhs.handle); } + // operator= takes argument by value + Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; } + ~Handle() { + if(handle) Destroy(handle); + } + }; + + std::mutex mutex; + + // Handles are lazily created as different threads request them, + // but are never destroyed until the end of the process. + // The maximum number of handles this process will create for each device is equal + // to the high-water mark of the number of concurrently active threads that request + // handles for that device. 
+ // When threads terminate, they release their handles back into the pool for reuse. + // Otherwise, new handles would be created every time new threads were spawned, + // resulting in poor performance for Python modules that repeatedly or frequently + // spawned new sets of threads (like DataParallel, which creates a new set of threads + // for each forward pass). + // + // To prevent potential deadlocks, we explicitly choose not to cap the number + // of handles that are created per device. + // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device, + // only 4 can make forward progress at any time. The other 4 will not release their + // handles until they exit, so the fifth cannot make progress until then. This is + // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an + // intermediate point (ie, before any of them have exited). We have no way to anticipate + // or enforce that user threads will not attempt such intermediate synchronization. + // The only way to ensure safety is to avoid imposing a cap on the number of handles. + std::unordered_map> created_handles; + std::unordered_map> available_handles; + + // PoolWindow lazily creates and caches the handles that a particular thread is using, + // so in the common case handle access doesn't incur either handle creation or a mutex lock. + class PoolWindow + { + public: + PoolWindow(std::shared_ptr parent): weak_parent(std::move(parent)) {} + ~PoolWindow(){ release(); } + + Handle_t reserve(int device) + { + // If this thread already has a handle for this device, return it + if(my_handles.find(device) != my_handles.end()) + return my_handles[device]; + + // otherwise, either grab a handle from the pool if one is available, + // or if not, create a new one. 
+ auto parent = weak_parent.lock(); + TORCH_CHECK(parent, "Cannot create handle during program termination"); + std::lock_guard guard(parent->mutex); + + if(parent->available_handles[device].size() > 0) + { + my_handles[device] = parent->available_handles[device].back(); + parent->available_handles[device].pop_back(); + } + else + { + // In local testing, I do observe that emplace_back sometimes routes through temporaries + // that incur move-constructor and destructor calls. See comments in Handle above. + parent->created_handles[device].emplace_back(true /*create*/); + my_handles[device] = parent->created_handles[device].back().handle; + } + + return my_handles[device]; + } + + private: + // Stores the per-device handles currently owned by this thread + std::unordered_map my_handles; + + std::weak_ptr weak_parent; + + // Called by the destructor. Releases this thread's handles back into the pool. + void release() { + if(!my_handles.empty()) { + auto parent = weak_parent.lock(); + if (!parent) { + // If this thread exits after atexit handlers have completed, the + // cuda context itself may be invalid, so we must leak the handles. + return; + } + + std::lock_guard guard(parent->mutex); + for(auto d_h : my_handles) + parent->available_handles[d_h.first].push_back(d_h.second); + } + } + }; + + // Warning: + // If you want to change this function, be aware that this function will be called + // by multiple threads and there is no mutex guarding the call of this function, so + // make sure your implementation is thread-safe. + PoolWindow *newPoolWindow() { + // The returned pointer will be owned by a thread local variable + // so that different threads does not share the same PoolWindow. + return new PoolWindow(this->shared_from_this()); + } +}; + +}} // namespace at::cuda::detail:: + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6a09141b9b31c739fdbb50834397f9a92f1ca7f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::cuda::detail { + +TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::TensorBase &t); +using at::native::canUse32BitIndexMath; + +template +TensorInfo +getTensorInfo(const at::TensorBase &t) { + IndexType sz[MAX_TENSORINFO_DIMS]; + IndexType st[MAX_TENSORINFO_DIMS]; + + int dims = t.dim(); + for (int i = 0; i < dims; ++i) { + sz[i] = t.size(i); + st[i] = t.stride(i); + } + + scalar* data_ptr = nullptr; + + if constexpr (std::is_const_v) { + data_ptr = t.const_data_ptr(); + } else { + data_ptr = t.mutable_data_ptr(); + } + + return TensorInfo( + data_ptr, dims, sz, st); +} + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh new file mode 100644 index 0000000000000000000000000000000000000000..432117f154c419067542cf2e3f5f51059b2068ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh @@ -0,0 +1,129 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +namespace at::cuda::detail { + +// A utility class to implement integer division by multiplication, given a fixed +// divisor. +// +// WARNING: The fast divider algorithm is only implemented for unsigned int; +// otherwise we default to plain integer division. For unsigned int, +// we further assume that the dividend is at most INT32_MAX. Thus, +// IntDivider must NOT be used for general integer division. +// +// This reduced range is enough for our purpose, and it allows us to +// slightly simplify the computation. +// +// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N +// <= m < 2^(N+1)) and shift s such that: +// +// \floor(n / d) = \floor((m * n) / 2^(N+s)). +// +// Given such m and s, the integer division can be then implemented as: +// +// let m' = m - 2^N // 0 <= m' < 2^N +// +// fast_integer_division(n): +// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned +// // integer. Then take the higher N bits. +// t = (m' * n) >> N +// +// // Here we use the fact that n is less than 2^(N-1): otherwise the value +// // of (t + n) may not fit in an N-bit integer. 
+// return (t + n) >> s +// +// Finding such a magic number is surprisingly easy: +// +// s = \ceil(\log_2 d) +// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. +// +// See also: +// - Division by Invariant Integers Using Multiplication, +// Torbjörn Granlund and Peter L. Montgomery, 1994. +// +// - http://www.hackersdelight.org/magic.htm +// +// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html + +// Result of div/mod operation stored together. +template +struct DivMod { + Value div, mod; + + C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { } +}; + +// Base case: we only have an implementation for uint32_t for now. For +// everything else, we use plain division. +template +struct IntDivider { + IntDivider() = default; + IntDivider(Value d) : divisor(d) { } + + C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; } + C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; } + C10_HOST_DEVICE inline DivMod divmod(Value n) const { + return DivMod(n / divisor, n % divisor); + } + + Value divisor; +}; + +// Implement fast integer division. +template <> +struct IntDivider { + static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); + + IntDivider() = default; + + IntDivider(unsigned int d) : divisor(d) { + assert(divisor >= 1 && divisor <= INT32_MAX); + + // TODO: gcc/clang has __builtin_clz() but it's not portable. + for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; + + uint64_t one = 1; + uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + m1 = magic; + assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. + } + + C10_HOST_DEVICE inline unsigned int div(unsigned int n) const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and + // 'm1'. 
+ unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; +#else + // Using uint64_t so that the addition does not overflow. + uint64_t t = ((uint64_t) n * m1) >> 32; + return (t + n) >> shift; +#endif + } + + C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + C10_HOST_DEVICE inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. +}; + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..99562629fe531d9468fd8ec51bd98b2a492d4c35 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::cuda::detail { + +// CUDA: grid stride looping +// +// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment. +// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final +// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be +// greater than INT_MAX. But in that case _i_n_d_e_x >= n, so there are no +// further iterations and the overflowed value in i=_i_n_d_e_x is not used. 
+#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type) \ + int64_t _i_n_d_e_x = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x; \ + for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x) + +#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int) + + +// Use 1024 threads per block, which requires cuda sm_2x or above +constexpr int CUDA_NUM_THREADS = 1024; + +// CUDA: number of blocks for threads. +inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) { + TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N); + constexpr int64_t max_int = std::numeric_limits::max(); + + // Round up division for positive number that cannot cause integer overflow + auto block_num = (N - 1) / max_threads_per_block + 1; + TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on CUDA device"); + + return static_cast(block_num); +} + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h new file mode 100644 index 0000000000000000000000000000000000000000..bab1495dda3989f4a491d3545ee23f8eec4c3773 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +namespace at::cuda { +// Forward-declares at::cuda::NVRTC +struct NVRTC; + +namespace detail { +extern NVRTC lazyNVRTC; +} // namespace detail + +} // namespace at::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5bd215318125ff9f0d9846b2adc2e3c9cb1c2e48 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh @@ -0,0 +1,141 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +// If element_sizes is nullptr, then the strides will be in bytes, otherwise +// the strides will be in # of elements. +// Operands that share the same shape, but may have different strides. 
+// OffsetCalculator iterates the tensor in a column-major order + +#if defined(USE_ROCM) +constexpr int MAX_DIMS = 16; +#else +constexpr int MAX_DIMS = 25; +#endif + +template +struct OffsetCalculator { + // We allow having negative strides to implement some operations like torch.flip + using stride_t = std::conditional_t, + index_t>; + // The offset for each argument. Wrapper around fixed-size array. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = std::array(NARGS, 1)>; + + // if element_sizes is nullptr, then the strides will be in bytes, otherwise + // the strides will be in # of elements. + OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) { + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims"); + for (int i=0; i < dims; i++){ + sizes_[i] = at::cuda::detail::IntDivider(sizes[i]); + for (int arg = 0; arg < NARGS; arg++) { + int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]); + strides_[i][arg] = strides[arg][i] / element_size; + } + } + } + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + +#if defined(USE_ROCM) + if ((dims > 0) && (dims <= 2)) { + auto divmod = sizes_[0].divmod(linear_idx); +#pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] = divmod.mod * strides_[0][arg]; + if (dims >= 2) { + divmod = sizes_[1].divmod(divmod.div); +#pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] += divmod.mod * strides_[1][arg]; + } + // [...] 
+ return offsets; + } +#endif + + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = 0; + } + + #pragma unroll + for (int dim = 0; dim < MAX_DIMS; ++dim) { + if (dim == dims) { + break; + } + auto divmod = sizes_[dim].divmod(linear_idx); + linear_idx = divmod.div; + + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] += divmod.mod * strides_[dim][arg]; + } + + } + return offsets; + } + + int dims; + at::cuda::detail::IntDivider sizes_[MAX_DIMS]; + stride_t strides_[MAX_DIMS][std::max(NARGS, 1)]; +}; + +template +struct TrivialOffsetCalculator { + // The offset for each argument. Wrapper around fixed-size array. + // The offsets are in # of elements, not in bytes. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = std::array(NARGS, 1)>; + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = linear_idx; + } + return offsets; + } +}; + +// Make an OffsetCalculator with byte offsets +template +static OffsetCalculator make_offset_calculator(const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data()); +} + +// Make an OffsetCalculator with element offsets +template +static OffsetCalculator make_element_offset_calculator( + const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + std::array element_sizes; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator( + iter.ndim(), iter.shape().data(), 
strides.data(), element_sizes.data()); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e50519eb6a4fc842293e766f162ed26c7a028bd5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -0,0 +1,48 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// No "#pragma once" because this is a raw definition that can be copied by jit codegen. +// Eager mode clients should not include this file directly, instead, +// they should #include , which has a #pragma once. + +// Stores RNG state values. Passed as a kernel argument. +// See Note [CUDA Graph-safe RNG states]. +// +// The raw definition lives in its own file so jit codegen can easily copy it. +namespace at { + +struct PhiloxCudaState { + PhiloxCudaState() = default; + // Called if graph capture is not underway + PhiloxCudaState(uint64_t seed, + uint64_t offset) { + seed_.val = seed; + offset_.val = offset; + } + // Called if graph capture is underway + PhiloxCudaState(int64_t* seed, + int64_t* offset_extragraph, + uint64_t offset_intragraph) { + seed_.ptr = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + // Public members, directly accessible by at::cuda::philox::unpack. + // If we made them private with getters/setters, the getters/setters + // would have to be __device__, and we can't declare __device__ in ATen. 
+ union Payload { + uint64_t val; + int64_t* ptr; + }; + + Payload seed_{}; + Payload offset_{}; + uint64_t offset_intragraph_ = 0; + bool captured_ = false; +}; + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2d372718a4e786d676fff76c50da662e370be6ee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh @@ -0,0 +1,121 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::cuda::detail { + +#define MAX_TENSORINFO_DIMS 25 + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(); + TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + // See note on [collapse dims]. 
+ int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_TENSORINFO_DIMS]; + IndexType strides[MAX_TENSORINFO_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo() { + data = nullptr; + dims = 0; +} + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]) { + data = p; + dims = dim; + TORCH_CHECK(dims < MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions"); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + auto result = at::collapse_dims(sizes, strides, dims, excludeDim); + dims = std::get<1>(result); + return std::get<0>(result); +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// Uses dynamic (runtime) instead of static (compile time) dims +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + 
IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..797a857504ddcf336f0119f265c7a6d7e2e802a5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h @@ -0,0 +1,705 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif +#include +#include + +namespace at::cuda::tunable { + +using at::blas::ScalingType; + +enum class BlasOp { + N = 0, + T = 1 +}; + +inline char BlasOpToString(BlasOp op) { + switch (op) { + case BlasOp::N: + return 'N'; + case BlasOp::T: + return 'T'; + } + TORCH_CHECK(false, "unrecognized BlasOp"); + return 'N'; +} + +template +inline const char* BLASTypeName(T v) { + return "unknown"; +} + +template <> +inline const char* BLASTypeName(float v) { + return "f32_r"; +} + +template <> +inline const char* BLASTypeName(double v) { + return "f64_r"; +} + +template <> +inline const char* BLASTypeName(BFloat16 v) { + return "bf16_r"; +} + +template <> +inline const char* BLASTypeName(Half v) { + return "f16_r"; +} + +//https://github.com/ROCm/hipBLASLt/blob/develop/library/src/include/auxiliary.hpp#L175 +template <> +inline const char* BLASTypeName(Float8_e4m3fn v) { + return "f8_r"; +} + +template <> +inline const char* BLASTypeName(Float8_e5m2 v) { + return "bf8_r"; +} + +template <> +inline const char* BLASTypeName(Float8_e4m3fnuz v) { + return "f8_fnuz_r"; +} + +template <> +inline const char* BLASTypeName(Float8_e5m2fnuz v) { + return "bf8_fnuz_r"; +} + +template <> +inline const char* BLASTypeName(c10::complex v) { + return "f64_r"; +} + +template <> +inline const char* BLASTypeName(c10::complex v) { + return "f32_r"; +} + +inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) { + std::string BLASType; + switch (scalar_type) { + case c10::ScalarType::Float:{ + BLASType = "f32_r"; + break; + } + case c10::ScalarType::Double:{ + BLASType = "f64_r"; + break; + } + case c10::ScalarType::BFloat16:{ + BLASType = "bf16_r"; + break; + } + case c10::ScalarType::Half: { + BLASType = "f16_r"; + break; + } + case c10::ScalarType::Float8_e4m3fn: { + BLASType = "f8_r"; + break; + } + 
case c10::ScalarType::Float8_e5m2: { + BLASType = "bf8_r"; + break; + } + case c10::ScalarType::Float8_e4m3fnuz: { + BLASType = "f8_fnuz_r"; + break; + } + case c10::ScalarType::Float8_e5m2fnuz: { + BLASType = "bf8_fnuz_r"; + break; + } + case c10::ScalarType::ComplexFloat:{ + BLASType = "f32_c"; + break; + } + case c10::ScalarType::ComplexDouble:{ + BLASType = "f64_c"; + break; + } + default: + BLASType = "unknown"; + } + return BLASType; + +} + +// Similar to Compute Type in GemmRocblas.h +template +inline std::string ComputeTypeFor() { + return "Unknown ComputeType"; +} + +// This is a union of the compute types for +// ROCBLAS and hipBLASLt. +template <> +inline std::string ComputeTypeFor() { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { + return "f32_r"; + } else { + return "xf32_r"; + } +} + +template <> +inline std::string ComputeTypeFor() { + return "f64_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor>() { + return "f32_c"; +} + +template <> +inline std::string ComputeTypeFor>() { + return "f64_c"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +// Convert opmath_type to string +template +inline std::string to_string_opmath(const at::opmath_type& value) { + if constexpr (std::is_same_v, c10::complex> || + std::is_same_v, c10::complex>) { + return fmt::format("({:.4f}, {:.4f})", value.real(), value.imag()); + } else { + return fmt::format("{:.4f}", value); + } +} + +// convert activation epilogue to string +inline std::string to_string_epilogue(const 
at::cuda::blas::GEMMAndBiasActivationEpilogue& value) { + switch (value) { + case at::cuda::blas::GEMMAndBiasActivationEpilogue::None: + return std::string("None"); + break; + case at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU: + return std::string("RELU"); + break; + case cuda::blas::GEMMAndBiasActivationEpilogue::GELU: + return std::string("GELU"); + break; + default: + return std::string("unknown"); + } +} + +namespace detail { + +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) { + + if (!config.enabled) { + return true; // skip when disabled + } + + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); + at::Tensor ref = at::from_blob(c, {size}, options); + at::Tensor oth = at::from_blob(other_c, {size}, options); + at::Tensor ref_float = ref.to(at::kFloat); + at::Tensor oth_float = oth.to(at::kFloat); + + const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol); + if (ok) { + TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol); + } else { + TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol); + } + return ok; +} + +} + +// Note on GetSizeA et al. +// Tensors can be dense or arbitrarily strided. We only need our copies to be large enough. +// Our copies must be at least as large as the m n k shapes dictate, but could be larger +// depending on the lda ldb ldc values. Similarly for the batched case. 
+ +template +struct GemmParams : OpParams { + GemmParams() = default; + + std::string BLASSignature() const override { + std::string alpha_str = to_string_opmath(alpha); + std::string beta_str = to_string_opmath(beta); + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, bias_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, alpha_str, beta_str, transa, transb, + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor(), ComputeTypeFor()); + } + + std::string Signature() const override { + return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc); + } + + size_t GetSizeA() const { + size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m); + size_t size_dense = m * k; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k); + size_t size_dense = k * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = ldc * n; + size_t size_dense = m * n; + return sizeof(T) * (size_stride > size_dense ? 
size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + GemmParams* DeepCopy(bool duplicate_inputs) const { + GemmParams* copy = new GemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + copy->a = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(a_size)); + copy->b = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(b_size)); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(GemmParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? 
OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + at::opmath_type alpha; + const T* a{}; + int64_t lda{}; + const T* b{}; + int64_t ldb{}; + at::opmath_type beta; + T* c{}; + int64_t ldc{}; +private: + bool duplicate_inputs_{false}; +}; + +template +struct GemmAndBiasParams : OpParams { + std::string BLASSignature() const override { + std::string alpha_str = to_string_opmath(alpha); + std::string activation_str = to_string_epilogue(activation); + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "alpha: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, activation: %s, bias_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, alpha_str, transa, transb, + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), activation_str, BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor(), ComputeTypeFor()); + } + + std::string Signature() const override { + return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc); + } + + size_t GetSizeA() const { + size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m); + size_t size_dense = m * k; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k); + size_t size_dense = k * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = ldc * n; + size_t size_dense = m * n; + return sizeof(T) * (size_stride > size_dense ? 
size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const { + GemmAndBiasParams* copy = new GemmAndBiasParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + copy->a = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(a_size)); + copy->b = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(b_size)); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(GemmAndBiasParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? 
OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + at::opmath_type alpha{}; + const T* a{}; + int64_t lda{}; + const T* b{}; + int64_t ldb{}; + T* c{}; + int64_t ldc{}; + const T* bias{}; + at::cuda::blas::GEMMAndBiasActivationEpilogue activation{}; +private: + bool duplicate_inputs_{false}; +}; + +template +struct GemmStridedBatchedParams : OpParams { + std::string BLASSignature() const override { + std::string alpha_str = to_string_opmath(alpha); + std::string beta_str = to_string_opmath(beta); + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, " + "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch, + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(C_Dtype{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor()); + } + + std::string Signature() const override { + return fmt::sprintf("%c%c_%ld_%ld_%ld_B_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, batch, lda, ldb, ldc); + } + + size_t GetSizeA() const { + size_t size_stride = stride_a * batch; + size_t size_dense = m * k * batch; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = stride_b * batch; + size_t size_dense = k * n * batch; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = stride_c * batch; + size_t size_dense = m * n * batch; + return sizeof(T) * (size_stride > size_dense ? 
size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + GemmStridedBatchedParams* DeepCopy(bool duplicate_inputs) const { + GemmStridedBatchedParams* copy = new GemmStridedBatchedParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + // NOLINTNEXTLINE(*const-cast*) + copy->a = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(a_size)); + // NOLINTNEXTLINE(*const-cast*) + copy->b = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(b_size)); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? 
OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + at::opmath_type alpha{}; + const T* a{}; + int64_t lda{}; + int64_t stride_a{}; + const T* b{}; + int64_t ldb{}; + int64_t stride_b{}; + at::opmath_type beta; + C_Dtype* c{}; + int64_t ldc{}; + int64_t stride_c{}; + int64_t batch{}; +private: + bool duplicate_inputs_{false}; +}; + +template +struct ScaledGemmParams : OpParams { + ScaledGemmParams() = default; + + std::string BLASSignature() const override { + // Excluding use_fast_accum and use_rowise booleans for now + if (bias_ptr == nullptr) { + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, transa, transb, + ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), + ComputeTypeFor(), ComputeTypeFor()); + } + else { + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, transa, transb, + ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype), + ComputeTypeFor(), ComputeTypeFor()); + } + } + + std::string Signature() const override { + // In Blas.cpp, code defaults to a bias_dtype of Half even when there is no bias vector. + // Search for this line:: + // params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_; + // + // In TunableOp, we must distinguish in param signature these two cases: with and without a bias vector. + return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s", + transa, transb, m, n, k, lda, ldb, ldc, + a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise, + bias_ptr == nullptr ? "None" : at::toString(bias_dtype)); + } + + size_t GetSizeA() const { + size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m); + size_t size_dense = m * k; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k); + size_t size_dense = k * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = ldc * n; + size_t size_dense = m * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + ScaledGemmParams* DeepCopy(bool duplicate_inputs) const { + ScaledGemmParams* copy = new ScaledGemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = c10::cuda::CUDACachingAllocator::raw_alloc(c_size); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + copy->a = c10::cuda::CUDACachingAllocator::raw_alloc(a_size); + copy->b = c10::cuda::CUDACachingAllocator::raw_alloc(b_size); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); 
+ if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(ScaledGemmParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + const void* a{}; + const void* a_scale_ptr{}; + int64_t lda{}; + ScalarType a_dtype{}; + ScalarType a_scale_dtype{}; + ScalingType a_scaling_type{}; + const void* b{}; + const void* b_scale_ptr{}; + int64_t ldb{}; + ScalarType b_dtype{}; + ScalarType b_scale_dtype{}; + ScalingType b_scaling_type{}; + const void* bias_ptr{}; + ScalarType bias_dtype{}; + void* c{}; + const void* c_scale_ptr{}; + int64_t ldc{}; + ScalarType c_dtype{}; + void* amax_ptr{}; + bool use_fast_accum{}; +private: + bool duplicate_inputs_{false}; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h new file mode 100644 index 0000000000000000000000000000000000000000..13d0bf23bff74af65cd296a413da661df8f9e183 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h @@ -0,0 +1,692 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TORCH_HIPBLASLT_CHECK(EXPR) \ + do { \ + hipblasStatus_t __err = EXPR; \ + TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \ + "hipblaslt error: ", \ + hipblasStatusToString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +template +constexpr hipDataType HipDataTypeFor(); + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_32F; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_16F; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_16BF; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_64F; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_8F_E4M3_FNUZ; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_8F_E5M2_FNUZ; +} + +// This code is instantiated regardless of ROCm version. +// Prior to ROCm 6.3, we hard-code the known enum values. +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 60300 + return HIP_R_8F_E4M3; +#else + return static_cast(28); +#endif +} + +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 60300 + return HIP_R_8F_E5M2; +#else + return static_cast(29); +#endif +} + +// This type is not intended for matrix types but rather a scale factor. +// Return a dummy value to satisfy linker. 
+template <> +constexpr hipDataType HipDataTypeFor() { + return static_cast(500); +} + +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 70000 + return HIP_R_4F_E2M1; +#else + return static_cast(33); +#endif +} + +template +int GetBatchFromParams(const GemmParams* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmStridedBatchedParams* params) { + return params->batch; +} + +template +int GetBatchFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmStridedBatchedParams* params) { + return params->stride_a; +} + +template +int GetStrideAFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmStridedBatchedParams* params) { + return params->stride_b; +} + +template +int GetStrideBFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmStridedBatchedParams* params) { + return params->stride_c; +} + +template +int GetStrideCFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +float GetAlphaFromParams(const GemmParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const GemmAndBiasParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const GemmStridedBatchedParams* params) { + return params->alpha; +} + 
+template +float GetAlphaFromParams(const ScaledGemmParams* params) { + return 1.0; +} + +template +float GetBetaFromParams(const GemmParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const GemmAndBiasParams* params) { + return 0.0; +} + +template +float GetBetaFromParams(const GemmStridedBatchedParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const ScaledGemmParams* params) { + return 0.0; +} + +template +ScalingType GetAScalingTypeFromParams(const GemmParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetBScalingTypeFromParams(const GemmParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetAScalingTypeFromParams(const GemmAndBiasParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetBScalingTypeFromParams(const GemmAndBiasParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetAScalingTypeFromParams(const GemmStridedBatchedParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetBScalingTypeFromParams(const GemmStridedBatchedParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetAScalingTypeFromParams(const ScaledGemmParams* params) { + return params->a_scaling_type; +} + +template +ScalingType GetBScalingTypeFromParams(const ScaledGemmParams* params) { + return params->b_scaling_type; +} + +template +const void* GetAScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const GemmAndBiasParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const ScaledGemmParams* params) { + return params->a_scale_ptr; +} + +template +const void* GetBScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template 
+const void* GetBScalePointerFromParams(const GemmAndBiasParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const ScaledGemmParams* params) { + return params->b_scale_ptr; +} + +template +const void* GetDScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const GemmAndBiasParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const ScaledGemmParams* params) { + return params->c_scale_ptr; +} + +template +const void* GetBiasPointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const GemmAndBiasParams* params) { + return params->bias; +} + +template +const void* GetBiasPointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const ScaledGemmParams* params) { + return params->bias_ptr; +} + +template +hipDataType GetBiasTypeFromParams(const GemmParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const GemmAndBiasParams* params) { + return HipDataTypeFor(); +} + +template +hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const ScaledGemmParams* params) { + return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype); +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmParams* params) { + return at::cuda::blas::GEMMAndBiasActivationEpilogue::None; +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmAndBiasParams* params) { + return 
params->activation; +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmStridedBatchedParams* params) { + return at::cuda::blas::GEMMAndBiasActivationEpilogue::None; +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const ScaledGemmParams* params) { + return at::cuda::blas::GEMMAndBiasActivationEpilogue::None; +} + +static hipblasOperation_t _hipblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return HIPBLAS_OP_N; + case 't': + case 'T': + return HIPBLAS_OP_T; + case 'c': + case 'C': + return HIPBLAS_OP_C; + } + TORCH_CHECK(false, + "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +static char _charFromhipblasOp(hipblasOperation_t op) { + switch (op) { + case HIPBLAS_OP_N: + return 'N'; + case HIPBLAS_OP_T: + return 'T'; + case HIPBLAS_OP_C: + return 'C'; + } + TORCH_CHECK(false, + "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`"); +} + +static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { + if (layout == BlasOp::N) { + return HIPBLAS_OP_N; + } + return HIPBLAS_OP_T; +} + +template +struct HipBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class HipBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor< + hipblasLtMatmulDescOpaque_t, + &hipblasLtMatmulDescDestroy> { + public: + HipBlasLtMatmulDescriptor( + hipblasComputeType_t compute_type, + hipDataType scale_type) { + hipblasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_HIPBLASLT_CHECK( + hipblasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, 
const T value) { + TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +template +class HipblasltGemmOp : public Callable { + public: + HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {} + + TuningStatus Call(const ParamsT* params) override { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipDataTypeFor(); + auto b_datatype = HipDataTypeFor(); + auto in_out_datatype = HipDataTypeFor(); + auto opa = _hipblasOpFromChar(params->transa); + auto opb = _hipblasOpFromChar(params->transb); + + TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen"); + + float alpha = GetAlphaFromParams(params); + float beta = GetBetaFromParams(params); + + hipblasLtMatrixLayout_t mat_a, mat_b, mat_c; + if (opa == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda)); + } + if (opb == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb)); + } + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc)); + + // specific to batched gemmm + int batch = GetBatchFromParams(params); + if (batch > 1) { + int64_t stride_a = GetStrideAFromParams(params); + int64_t stride_b = GetStrideBFromParams(params); + int64_t stride_c = GetStrideCFromParams(params); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, 
HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c))); + } + + hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { + computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + } + HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb); + + // specific to scaled gemm + const void* mat1_scale_ptr = GetAScalePointerFromParams(params); + const void* mat2_scale_ptr = GetBScalePointerFromParams(params); + const void* result_scale_ptr = GetDScalePointerFromParams(params); + if (mat1_scale_ptr && mat2_scale_ptr) { + hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER; + hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER; + if (GetAScalingTypeFromParams(params) == ScalingType::RowWise) { +#if defined(HIPBLASLT_OUTER_VEC) + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); +#elif defined(HIPBLASLT_VEC_EXT) + a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; +#endif + } + if (GetBScalingTypeFromParams(params) == ScalingType::RowWise) { +#if defined(HIPBLASLT_OUTER_VEC) + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, 
HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); +#elif defined(HIPBLASLT_VEC_EXT) + b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; +#endif + } + matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr); + matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr); + } + if (result_scale_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); + } + + const void* bias_ptr = GetBiasPointerFromParams(params); + auto bias_datatype = GetBiasTypeFromParams(params); + if (bias_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype); + auto activation = GetActivationFromParams(params); + if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_RELU_BIAS); + } + else if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::GELU) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_GELU_BIAS); + } + else { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS); + } + } + + size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize(); + + auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); + + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle, + matmul.descriptor(), + &alpha, + mat_a, + mat_b, + &beta, + mat_c, + mat_c, + algo_, + ret_workspace_size); + + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size >= workspace_size) { + return FAIL; + } + } + else { + return FAIL; + } + + void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace(); + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, + matmul.descriptor(), + &alpha, + params->a, + mat_a, + params->b, + mat_b, + &beta, + params->c, + mat_c, + params->c, + mat_c, + &algo_, + workspace_buffer, + workspace_size, + at::cuda::getCurrentCUDAStream())); + + 
//TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); + return OK; + } + + private: + hipblasLtMatmulAlgo_t algo_; +}; + +template +auto GetHipBlasLtTypeStringAndOps() { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipDataTypeFor(); + auto b_datatype = HipDataTypeFor(); + auto in_out_datatype = HipDataTypeFor(); + std::vector heuristic_result; +#if ROCM_VERSION == 60400 + // hipblaslt TT fp32 regression on ROCm 6.4, cannot use + if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F) + && (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) { + std::vector>>> ignore; + return ignore; + } +#endif + + hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; + if (at::globalContext().allowTF32CuBLAS()) { + computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + } + + hipblasLtHandle_t handle; + TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle)); + TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + transa_outer, + transb_outer, + a_datatype, + b_datatype, + in_out_datatype, + in_out_datatype, + computeType, + heuristic_result)); + TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle)); + + int returned_algo_count = heuristic_result.size(); + std::vector>>> ret; + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristic_result[i].algo; + int algo_index = hipblaslt_ext::getIndexFromAlgo(algo); + auto callable = std::make_unique>(algo); + std::string type_string = fmt::sprintf("Gemm_Hipblaslt_%d", algo_index); + ret.emplace_back(type_string, std::move(callable)); + } + + return ret; +} + +template +auto GetHipBlasLtGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + 
+template +auto GetHipBlasLtGemmAndBiasTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtScaledGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +#undef TORCH_HIPBLASLT_CHECK + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h new file mode 100644 index 0000000000000000000000000000000000000000..8734d42b01a9a8603532f3284b14904471543a2e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h @@ -0,0 +1,282 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#define ROCBLAS_BETA_FEATURES_API +#include + +#define TORCH_ROCBLAS_CHECK(EXPR) \ + do { \ + rocblas_status __err = EXPR; \ + TORCH_CHECK(__err == rocblas_status_success, \ + "rocblas error: ", \ + rocblas_status_to_string(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +template +constexpr rocblas_datatype RocBlasDataTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_bf16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +constexpr rocblas_datatype RocBlasComputeTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // FP16 datatypes. This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. 
+ // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // BF16 datatypes. This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +auto DoCastForHalfOrBfloat16(const T fp) { + return fp; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const Half fp) { + // alpha and beta should be the same as compute_type, in Half case it is float. + float h = fp; + return h; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const BFloat16 fp) { + // alpha and beta should be the same as compute_type, in bfloat16 case it is float. + float h = fp; + return h; +} + +static rocblas_operation _rocblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return rocblas_operation_none; + case 't': + case 'T': + return rocblas_operation_transpose; + case 'c': + case 'C': + return rocblas_operation_conjugate_transpose; + } + TORCH_CHECK(false, + "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +template +class RocblasGemmOp : public Callable> { + public: + RocblasGemmOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) + return FAIL; // no support for TF32 in rocBLAS + auto compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_ex( + 
(rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, + params->b, input_output_type, params->ldb, + &h_b, + params->c, input_output_type, params->ldc, + params->c, input_output_type, params->ldc, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(fmt::sprintf("Gemm_Rocblas_%d", solutions[i]), std::move(callable))); + } + return ret; +} + +template +class RocblasGemmStridedBatchedOp : public Callable> { + public: + RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmStridedBatchedParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == 
rocblas_datatype_f32_r) + return FAIL; // no support for TF32 in rocBLAS + auto compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_strided_batched_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, params->stride_a, + params->b, input_output_type, params->ldb, params->stride_b, + &h_b, + params->c, input_output_type, params->ldc, params->stride_c, + params->c, input_output_type, params->ldc, params->stride_c, + params->batch, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmStridedBatchedTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + // Sort the solutions in ascending order to make the solution vector deterministic across runs + std::sort(solutions.begin(), solutions.end()); + + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + 
ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable))); + } + return ret; +} + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h new file mode 100644 index 0000000000000000000000000000000000000000..14f1f089ad4fc04b28c6c1c1d36fe64056725fd9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#pragma once + +#include + +#include + +namespace at::cuda::tunable { + +class StreamTimer : public ITimer { + public: + StreamTimer(); + ~StreamTimer() override; + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_{}; + cudaEvent_t end_{}; +}; + +class StreamTimerNoSync : public ITimer { + public: + StreamTimerNoSync(); + ~StreamTimerNoSync() override; + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_{}; + cudaEvent_t end_{}; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h new file mode 100644 index 0000000000000000000000000000000000000000..c055f6e72989c3c6e66a35671d10839f3bb354c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h @@ -0,0 +1,270 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TUNABLE_LOGV(LEVEL, ...) getTuningContext()->Log(LEVEL, __VA_ARGS__) +#define TUNABLE_LOG1(...) 
TUNABLE_LOGV(1, __VA_ARGS__) +#define TUNABLE_LOG2(...) TUNABLE_LOGV(2, __VA_ARGS__) +#define TUNABLE_LOG3(...) TUNABLE_LOGV(3, __VA_ARGS__) + +namespace at::cuda::tunable { + +enum TORCH_CUDA_CPP_API TuningStatus { + OK = 0, + FAIL = 1, + UNSUPPORTED = 2, +}; + +// Mapping from params signature to kernel id +class TORCH_CUDA_CPP_API ResultEntry { + public: + explicit ResultEntry(std::string key, double time) : key_(std::move(key)), time_(time) {} + explicit ResultEntry(std::string key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {} + bool operator==(const ResultEntry& other) const { return key_ == other.key_; } + bool operator!=(const ResultEntry& other) const { return key_ != other.key_; } + operator std::string () { return key_; } + std::string GetKey() const { return key_; } + double GetTime() const { return time_; } + friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry); + static ResultEntry Null() { return ResultEntry("Null", 0.0); } + static ResultEntry Default() { return ResultEntry("Default", 0.0); } + + private: + std::string key_; + double time_; + std::string blas_sig_; +}; + +typedef std::unordered_map KernelMap; +typedef std::unordered_map ResultsMap; +typedef std::unordered_map> UntunedMap; + +struct TORCH_CUDA_CPP_API TuningResults { + // Validates if these results are compatible with the libraries + std::unordered_map validators; + + // Mapping from Callable signature to Callable's tuning result + ResultsMap results; +}; + +class TORCH_CUDA_CPP_API TuningResultsManager { + public: + TuningResultsManager() = default; + ~TuningResultsManager() = default; + + KernelMap Lookup(const std::string& op_signature); + + ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature); + + void AddImpl(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best, + KernelMap& kernel_map); + + void Add(const std::string& 
op_signature, + const std::string& params_signature, + ResultEntry best); + + void Delete(const std::string& op_signature, const std::string& params_signature); + + void DisjointMergeImpl( + const std::string& op_signature, + const KernelMap& kernel_map, + /*out*/ ResultsMap& results); + + void Load(const ResultsMap& results_to_load); + + ResultsMap Dump(); + + void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map); + + size_t GetSize(); + + void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, + const std::string& params_signature, const std::string& blas_signature); + + void InitRealtimeAppend( + const std::string& filename, + const std::unordered_map& validators); + + void AppendResultLine(const std::string& op_sig, + const std::string& param_sig, + const ResultEntry& result); + + void CloseRealtimeAppend(); // For clean shutdown + private: + std::mutex lock_; + std::mutex realtime_file_mutex_; + std::unique_ptr realtime_out_; + std::string realtime_filename_; + ResultsMap results_; + UntunedMap untuned_results_; + bool validators_written_ = false; + +}; + +class TORCH_CUDA_CPP_API TuningResultsValidator { + public: + using GetFunc = std::function; + using ValidateFunc = std::function; + using GetValidateFuncs = std::unordered_map>; + + TuningResultsValidator(); + ~TuningResultsValidator() = default; + + std::unordered_map GetAllValidators() const; + TuningStatus ValidateAll(const std::unordered_map& to_validate) const; + void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf); + + protected: + static std::string GetPyTorchVersion() ; + TuningStatus ValidatePyTorchVersion(const std::string& value) const; + + public: + static constexpr const std::array mandatory_keys{"PT_VERSION"}; + + private: + GetValidateFuncs validators_; +}; + +struct NumericalCheckConfig { + bool enabled{false}; + double atol{1e-5}; + double rtol{1e-5}; + + NumericalCheckConfig() = default; + 
NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {} +}; + + +class TORCH_CUDA_CPP_API TuningContext { + public: + TuningContext(); + ~TuningContext(); + TuningContext(TuningContext &) = delete; + TuningContext(TuningContext &&) = delete; + TuningContext &operator=(TuningContext &) = delete; + TuningContext &operator=(TuningContext &&) = delete; + + void EnableTunableOp(bool value); + bool IsTunableOpEnabled() const; + + void EnableTuning(bool value); + bool IsTuningEnabled() const; + + void EnableRecordUntuned(bool value); + bool IsRecordUntunedEnabled() const; + std::ofstream& GetUntunedFile(); + + void EnableNumericsCheck(bool value); + bool IsNumericsCheckEnabled() const; + void SetNumericalCheckConfig(bool enabled, double atol, double rtol); + NumericalCheckConfig GetNumericalCheckConfig() const; + + void SetMaxTuningDurationMs(int max_duration_ms); + int GetMaxTuningDurationMs() const; + + void SetMaxTuningIterations(int max_iter); + int GetMaxTuningIterations() const; + + void SetMaxWarmupDurationMs(int max_duration_ms); + int GetMaxWarmupDurationMs() const; + + void SetMaxWarmupIterations(int max_iter); + int GetMaxWarmupIterations() const; + + void EnableICacheFlush(bool value); + bool IsICacheFlushEnabled() const; + + void SetRotatingBufferSize(int size); + int GetRotatingBufferSize() const; + + TuningResultsManager& GetTuningResultsManager(); + + TuningResultsValidator& GetTuningResultsValidator(); + + TuningResults GetTuningResults(); + + TuningStatus LoadTuningResults(const TuningResults& tr); + + void SetFilename(const std::string& filename, bool insert_device_ordinal=false); + std::string GetFilename() const; + + bool ReadFile(const std::string& filename={}); + + template + void Log(int level, Types... args) { + if (GetLogOkay() && GetLogLevel() >= level) { + GetLog() << c10::str(args...) 
<< std::endl; + } + } + + private: + std::string GetLogFilename() const; + int GetLogLevel() const; + bool GetLogOkay() const; + std::ostream& GetLog() const; + + bool enable_; + bool tuning_enable_; + bool record_untuned_enable_; + bool manager_initialized_; + bool numerics_check_enable_; + int max_tuning_duration_ms_; + int max_tuning_iterations_; + int max_warmup_duration_ms_; + int max_warmup_iterations_; + bool icache_flush_; + int rotating_buffer_size_; + mutable TuningResultsManager manager_; + mutable c10::once_flag manager_init_once_; + TuningResultsValidator validator_; + std::string filename_; + std::ofstream untuned_file_; + size_t results_count_from_input_file_; + bool is_shutting_down_; + + NumericalCheckConfig numerics_cfg_{}; +}; + +TORCH_CUDA_CPP_API TuningContext* getTuningContext(); + +class ITimer { + public: + ITimer() = default; + virtual ~ITimer() = default; + + virtual void Start() = 0; + virtual void End() = 0; + + /// Computes the elapsed time in milliseconds between Start() and End() + virtual float Duration() = 0; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h new file mode 100644 index 0000000000000000000000000000000000000000..b377374967ee2f224983c993145eb427d2cc57bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h @@ -0,0 +1,334 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. 
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#ifdef USE_ROCM +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::cuda::tunable { + +template +class DefaultGemmOp : public Callable> { + public: + TuningStatus Call(const GemmParams* params) override { + at::cuda::blas::gemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, + params->b, params->ldb, + params->beta, + params->c, params->ldc); + return OK; + } +}; + +static bool _transposeBoolFromChar(char op) { + return op == 't' || op == 'T'; +} + +template +class DefaultGemmAndBiasOp : public Callable> { + public: + TuningStatus Call(const GemmAndBiasParams* params) override { + at::cuda::blas::gemm_and_bias( + _transposeBoolFromChar(params->transa), + _transposeBoolFromChar(params->transb), + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, + params->b, params->ldb, + params->bias, + params->c, params->ldc, + params->activation); + return OK; + } +}; + +template +class DefaultGemmStridedBatchedOp : public Callable> { + public: + TuningStatus Call(const GemmStridedBatchedParams* params) override { + at::cuda::blas::bgemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, params->stride_a, + params->b, params->ldb, params->stride_b, + params->beta, + params->c, params->ldc, params->stride_c, + params->batch); + return OK; + } +}; + +template +class DefaultScaledGemmOp : public Callable> { + public: + TuningStatus Call(const 
ScaledGemmParams* params) override { + at::cuda::blas::scaled_gemm( + params->transa, + params->transb, + params->m, + params->n, + params->k, + params->a, + params->a_scale_ptr, + params->lda, + params->a_dtype, + params->a_scale_dtype, + params->a_scaling_type, + params->b, + params->b_scale_ptr, + params->ldb, + params->b_dtype, + params->b_scale_dtype, + params->b_scaling_type, + params->bias_ptr, + params->bias_dtype, + params->c, + params->c_scale_ptr, + params->ldc, + params->c_dtype, + params->use_fast_accum, + std::nullopt /* alpha */); + return OK; + } +}; + +template +inline bool IsZero(T v) { + return v == 0.0f; +} + +template <> +inline bool IsZero(BFloat16 v) { + return v.x == 0; +} + +template <> +inline bool IsZero(Half v) { + return float(v) == 0.0f; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0f; +} + +template +inline const char* TypeName(T v) { + return "unknown"; +} + +template <> +inline const char* TypeName(float v) { + if (at::globalContext().allowTF32CuBLAS()) { + return "tf32"; + } else { + return "float"; + } +} + +template <> +inline const char* TypeName(double v) { + return "double"; +} + +template <> +inline const char* TypeName(BFloat16 v) { + return "BFloat16"; +} + +template <> +inline const char* TypeName(Half v) { + return "Half"; +} + +template <> +inline const char* TypeName(Float8_e4m3fn v) { + return "Float8_e4m3fn"; +} + +template <> +inline const char* TypeName(Float8_e5m2 v) { + return "Float8_e5m2"; +} + +template <> +inline const char* TypeName(Float8_e4m3fnuz v) { + return "Float8_e4m3fnuz"; +} + +template <> +inline const char* TypeName(Float8_e5m2fnuz v) { + return "Float8_e5m2fnuz"; +} + +template <> +inline const char* TypeName(Float8_e8m0fnu v) { + return "Float8_e8m0fnu"; +} + +template <> +inline const char* TypeName(c10::complex v) { + return "c10::complex"; +} + +template <> +inline const char* 
TypeName(c10::complex v) { + return "c10::complex"; +} + +template +class GemmTunableOp : public TunableOp> { + public: + GemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED"); + if (!env_rocblas.has_value() || env_rocblas.value()) { + for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (!env_hipblaslt.has_value() || env_hipblaslt.value()) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("GemmTunableOp_%s_%c%c", TypeName(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class GemmAndBiasTunableOp : public TunableOp> { + public: + GemmAndBiasTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (!env_hipblaslt.has_value() || env_hipblaslt.value()) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmAndBiasTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("GemmAndBiasTunableOp_%s_%c%c", TypeName(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class 
GemmStridedBatchedTunableOp : public TunableOp> { + public: + GemmStridedBatchedTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED"); + if (!env_rocblas.has_value() || env_rocblas.value()) { + for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (!env_hipblaslt.has_value() || env_hipblaslt.value()) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("GemmStridedBatchedTunableOp_%s_%c%c", TypeName(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class ScaledGemmTunableOp : public TunableOp> { + public: + ScaledGemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("ScaledGemmTunableOp_%s_%s_%s_%c%c", + TypeName(AT{}), + TypeName(BT{}), + TypeName(CT{}), + BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h new file mode 100644 index 0000000000000000000000000000000000000000..1a59c1aebc7f01340384c0bbc3cbdc1c3299a6dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h @@ -0,0 +1,434 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include + +namespace at::cuda::tunable { + +template +class Callable { + public: + virtual ~Callable() = default; + virtual TuningStatus Call(const ParamsT* /*unused*/) { + return FAIL; + } + virtual TuningStatus IsSupported(const ParamsT* params) { + return Call(params); + } +}; + +namespace { + +/** http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance */ + +class Stats { + public: + Stats() { + _n = 0UL; + _mean = 0.0; + _M2 = 0.0; + _sum = 0.0; + _min = 0.0; + _max = 0.0; + } + + void sample_value(const double x) { + double delta = 0; + _sum = _sum + x; + if (0UL == _n) { + _min = x; + _max = x; + } + else { + _min = _min < x ? _min : x; + _max = _max > x ? 
_max : x; + } + _n = _n + 1UL; + delta = x - _mean; + _mean = _mean + delta/_n; + _M2 = _M2 + delta * (x - _mean); + } + + double variance() const { + return _M2/(_n-1); + } + + double stddev() const { + return std::sqrt(variance()); + } + + unsigned long _n; + double _mean; + double _M2; + double _sum; + double _min; + double _max; +}; + +class FixedSizeStack { + private: + std::deque stack; + const size_t max_size; + + public: + FixedSizeStack(size_t size) : max_size(size) {} + + void push(const std::string& value) { + if (stack.size() >= max_size) { + stack.pop_front(); // Remove the oldest entry + } + stack.push_back(value); // Add new entry + } + + auto rbegin() { return stack.rbegin(); } + auto rend() { return stack.rend(); } +}; + +} // anonymous namespace + +template +class TunableOp { + public: + virtual ~TunableOp() = default; + + TuningStatus operator()(const ParamsT* params) { + ResultEntry result = ResultEntry::Null(); + TuningContext* ctx = getTuningContext(); + if (ctx->IsTunableOpEnabled()) { + auto& mgr = ctx->GetTuningResultsManager(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + auto blas_sig = params->BLASSignature(); + result = mgr.Lookup(op_sig, params_sig); + // If there is not previous tuning result been found, we do the tuning iff tuning is enabled + if (result == ResultEntry::Null()) { + if (ctx->IsTuningEnabled()) { + result = FindFastest(params); + mgr.Add(op_sig, params_sig, result); + } + else if (ctx->IsRecordUntunedEnabled()) { + // or record the gemm into file + mgr.RecordUntuned(ctx->GetUntunedFile(), op_sig, params_sig, blas_sig); + } + } + } + else { + result = ResultEntry::Default(); + } + if (result == ResultEntry::Null()) { + TUNABLE_LOG2("no result, using default"); + result = ResultEntry::Default(); + } + auto iter = ops_.find(result); + TORCH_CHECK(iter != ops_.end()); + return iter->second->Call(params); + } + + virtual std::string Signature() { + // According to C++17 standard 
https://wg21.link/n4659 section 15.7.4 + // > if the operand of typeid refers to the + // > object under construction or destruction, typeid yields the std::type_info object representing the constructor + // > or destructor’s class. + // So delay the op signature generation. + c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); }); + return signature_; + } + + protected: + void RegisterOp(const std::string& name, std::unique_ptr> op) { + this->op_names_.emplace_back(name); + this->ops_.emplace(name, std::move(op)); + } + + private: + static void WarmUp(Callable *op, const std::vector ¶m, size_t num_iter, size_t &offset) { + TuningContext* ctx = getTuningContext(); + bool do_flush = ctx->IsICacheFlushEnabled(); + for (size_t i = 0; i < num_iter; i++) { + if (do_flush) { + at::cuda::flush_icache(); + } + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + } + + static double ProfileSimple(Callable *op, const std::vector ¶m, size_t num_iter, size_t &offset) { + TuningContext* ctx = getTuningContext(); + bool do_flush = ctx->IsICacheFlushEnabled(); + StreamTimerNoSync timer{}; + + // Small Mandatory Warmup + // Reduces outliers + for (size_t i = 0; i < 2; i++) { + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + + timer.Start(); + for (size_t i = 0; i < num_iter; i++) { + if (do_flush) { + at::cuda::flush_icache(); + } + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + timer.End(); + return timer.Duration() / num_iter; + } + + static Stats ProfileStats(Callable *op, const std::vector ¶m, size_t num_iter, size_t &offset) { + TuningContext* ctx = getTuningContext(); + bool do_flush = ctx->IsICacheFlushEnabled(); + std::vector timer(num_iter); + + // Small Mandatory Warmup + // Reduces outliers + for (size_t i = 0; i < 2; i++) { + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + + for (size_t i = 0; i < num_iter; i++) { + timer[i].Start(); + 
TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + timer[i].End(); + if (do_flush) { + at::cuda::flush_icache(); + } + } + Stats s; + for (size_t i = 0; i < num_iter; i++) { + s.sample_value(timer[i].Duration()); + } + return s; + } + + protected: + virtual ResultEntry FindFastest(const ParamsT* params) { + TuningContext* ctx = getTuningContext(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + auto blas_sig = params->BLASSignature(); + TUNABLE_LOG2("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates"); + auto min_duration_ms = std::numeric_limits::infinity(); + std::string id_name = "Default"; + ParamsT* reference_params = nullptr; + auto top_solns = FixedSizeStack(5); + + // numeric check option is controlled by non-static env var, so check it once per tuned operator + bool do_numerics_check = ctx->IsNumericsCheckEnabled(); + + // calculate a reference answer for numerical check + if (do_numerics_check) { + reference_params = params->DeepCopy(false); + TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK); + } + + // need copies of params to reuse + // make as many copies as will fill the requested rotating buffer size, if requested + // rotating_size guaranteed to be >= 0 even though GetRotatingBufferSize() returns int + size_t rotating_size = ctx->GetRotatingBufferSize(); + bool use_buffer_rotation = (rotating_size > 0); + size_t param_size = params->GetSize(use_buffer_rotation); + size_t param_count = (rotating_size / param_size) + 1; + constexpr size_t MB = 1024ull*1024; + if (use_buffer_rotation) { + TUNABLE_LOG2("Rotating buffer ", rotating_size/MB, " MiB. ", + "Needed Size: ", param_size/MB, " MiB. 
", + "Needed number of param copies: ", param_count); + } + TORCH_CHECK(param_count > 0); + + std::vector reusable_params(param_count); + for (size_t i = 0; i < param_count; i++) { + reusable_params[i] = params->DeepCopy(use_buffer_rotation); + } + + // for rotating buffer + size_t offset = 0; + + for (size_t i = 0; i < op_names_.size(); i++) { + auto* candidate = ops_[op_names_[i]].get(); // borrow pointer + + auto status = candidate->Call(reusable_params[0]); + if (status != OK) { + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + // collect a small profile + int approx_num_iter = 3; + auto s = ProfileStats(candidate, reusable_params, approx_num_iter, offset); + double approx_duration = s._mean; + // bail if too slow + if (approx_duration > 1.5 * min_duration_ms) { + TUNABLE_LOG3("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + // 2nd phase skip, more aggressive + approx_num_iter = 10; + s = ProfileStats(candidate, reusable_params, approx_num_iter, offset); + approx_duration = s._mean; + // bail if too slow + if (approx_duration > 1.15 * min_duration_ms) { + TUNABLE_LOG3("├──2nd skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + if (do_numerics_check) { + ParamsT* numerical_params = params->DeepCopy(false); + auto status = candidate->Call(numerical_params); + if (status != OK) { + numerical_params->Delete(); + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + + // for warmup does user set max duration, max iters, or both? + // warmup is skipped by default, i.e. 
warmup_iter = 0 + // warmup will be set to the non-zero value of max_warmup_duration + // or max_warmup_iter + // if both are non-zero, we take the smaller of the two. + double max_warmup_duration = ctx->GetMaxWarmupDurationMs(); + int max_warmup_iter = ctx->GetMaxWarmupIterations(); + int warmup_iter = 0; // default + if (max_warmup_duration > 0) { + int duration_iters = max_warmup_duration / approx_duration; + if (max_warmup_iter > 0) { + warmup_iter = std::min(max_warmup_iter, duration_iters); + } + else { + warmup_iter = duration_iters; + } + } + else if (max_warmup_iter > 0) { + warmup_iter = max_warmup_iter; + } + + // for tuning does user set max duration, max iters, or both? + double max_tuning_duration = ctx->GetMaxTuningDurationMs(); + int max_tuning_iter = ctx->GetMaxTuningIterations(); + int tuning_iter = 100; // default + if (max_tuning_duration > 0) { + int duration_iters = max_tuning_duration / approx_duration; + if (max_tuning_iter > 0) { + tuning_iter = std::min(max_tuning_iter, duration_iters); + } + else { + tuning_iter = duration_iters; + } + } + else if (max_tuning_iter > 0) { + tuning_iter = max_tuning_iter; + } + // tuning must run at least 1 iteration + tuning_iter = std::max(1, tuning_iter); + + // do the full warmup followed by tuning + double warmup_ms = warmup_iter * approx_duration; + double tuning_ms = tuning_iter * approx_duration; + TUNABLE_LOG3("├──tuning using " + "warmup iters ", warmup_iter, " [", warmup_ms, " ms] " + "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ", + "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]); + TUNABLE_LOG3("├──offset at ", offset); + WarmUp(candidate, reusable_params, warmup_iter, offset); + s = ProfileStats(candidate, reusable_params, tuning_iter, offset); + auto s_stddev = s.stddev(); + // Assume normal distribution. + // Solution with smallest mean + 2*sigma will be a better solution? 
+ // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) { + if (s._mean < min_duration_ms) { + TUNABLE_LOG3("├──found better instance id=", i, ". " , s._mean, "ms. ", op_names_[i], + " min ", s._min, + " max ", s._max, + " mean ", s._mean, + " std ", s_stddev); + min_duration_ms = s._mean; + id_name = op_names_[i]; + std::string current_soln = std::to_string(s._mean) + " " + op_names_[i]; + top_solns.push(current_soln); + } + else { + TUNABLE_LOG3("├──found slower instance id=", i, ". " , s._mean, "ms. ", op_names_[i], + " min ", s._min, + " max ", s._max, + " mean ", s._mean, + " std ", s_stddev); + } + } + + for (size_t i = 0; i < reusable_params.size(); i++) { + reusable_params[i]->Delete(); + } + if (reference_params) { + reference_params->Delete(); + } + + TUNABLE_LOG2("└──found fastest for ", op_sig, '(', params_sig, ") ", id_name); + TUNABLE_LOG2("└──top five solutions for ", op_sig, '(', params_sig, ") "); + for (auto it = top_solns.rbegin(); it != top_solns.rend(); ++it) { + TUNABLE_LOG2(" ", *it); + } + return ResultEntry(id_name, min_duration_ms, blas_sig); + } + + private: + std::string CreateSignature() { +#ifndef _WIN32 + const auto* name = typeid(*this).name(); + // NOLINTNEXTLINE(*array*) + char buf[256]; + size_t buf_len = 256; + abi::__cxa_demangle(name, buf, &buf_len, nullptr); + buf[255] = '\0'; + return buf; +#else + return typeid(*this).name(); +#endif + } + + mutable c10::once_flag signature_init_once_; + std::string signature_; + + std::unordered_map>> ops_; + std::vector op_names_; +}; + +struct OpParams { + virtual ~OpParams() = default; + virtual std::string Signature() const = 0; + virtual std::string BLASSignature() const = 0; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..005f4b0a55c787c61c76fbe4acbdc870e8dd9fb5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -0,0 +1,248 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// Use of c10::hip namespace here makes hipification easier, because +// I don't have to also fix namespaces. Sorry! +namespace c10::hip { + +// Takes a valid HIPAllocator (of any sort) and turns it into +// an allocator pretending to be a CUDA allocator. See +// Note [Masquerading as CUDA] +class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator { + HIPCachingAllocator::HIPAllocator* allocator_; +public: + explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator) + : allocator_(allocator) {} + + virtual ~HIPAllocatorMasqueradingAsCUDA() = default; + + // From c10::Allocator + + DataPtr allocate(size_t size) override { + DataPtr r = allocator_->allocate(size); + r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); + return r; + } + + bool is_simple_data_ptr(const DataPtr& data_ptr) const override { + return allocator_->is_simple_data_ptr(data_ptr); + } + + DeleterFnPtr raw_deleter() const override { + return allocator_->raw_deleter(); + } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + allocator_->copy_data(dest, src, count); + } + + // From DeviceAllocator + + bool initialized() override { + return allocator_->initialized(); + } + + void emptyCache(MempoolId_t mempool_id = 
{0, 0}) override { + allocator_->emptyCache(mempool_id); + } + + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + HIPStream hip_stream = HIPStream(stream); + recordStream(ptr, hip_stream); + } + + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + + // From CUDAAllocator + + void* raw_alloc(size_t nbytes) override { + return allocator_->raw_alloc(nbytes); + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + return allocator_->raw_alloc_with_stream(nbytes, stream); + } + + void raw_delete(void* ptr) override { + allocator_->raw_delete(ptr); + } + + void init(int device_count) override { + allocator_->init(device_count); + } + + double getMemoryFraction(c10::DeviceIndex device) override { + return allocator_->getMemoryFraction(device); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + allocator_->setMemoryFraction(fraction, device); + } + + std::vector getExpandableSegmentSizes(c10::DeviceIndex device) override { + return allocator_->getExpandableSegmentSizes(device); + } + + void enable(bool value) override { + allocator_->enable(value); + } + + bool isEnabled() const override { + return allocator_->isEnabled(); + } + + void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override { + allocator_->cacheInfo(device, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + return allocator_->getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, HIPStream stream) override { + allocator_->recordStream(ptr, stream); + } + + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return 
allocator_->snapshot(mempool_id); + } + + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override { + allocator_->beginAllocateToPool(device, mempool_id, filter); + } + + void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) override { + allocator_->endAllocateToPool(device, mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->releasePool(device, mempool_id); + } + + int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override { + return allocator_->getPoolUseCount(device, mempool_id); + } + + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); + } + + void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setNoSplit(device, mempool_id); + } + + bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) override { + return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations); + } + + HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override { + return allocator_->shareIpcHandle(ptr); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + return allocator_->getIpcDevPtr(handle); + } + + bool isHistoryEnabled() override { + return allocator_->isHistoryEnabled(); + } + + void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); + } + + 
void recordAnnotation( + const std::vector>& md) override { + allocator_->recordAnnotation(md); + } + + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void popCompileContext() override { + allocator_->popCompileContext(); + } + + void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { + allocator_->attachOutOfMemoryObserver(observer); + } + + void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override { + allocator_->attachAllocatorTraceTracker(tracker); + } + + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override { + allocator_->enablePeerAccess(dev, dev_to_access); + } + + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + return allocator_->getCheckpointState(device, id); + } + + HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) override { + auto cpd = allocator_->setCheckpointPoolState(device, pps); + for (auto& ptr : cpd.dataptrs_allocd) { + ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index())); + } + return cpd; + } + + std::string name() override { + return allocator_->name(); + } + +}; + +} // namespace c10::hip + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..5f0214ee3c8c9d23a07aa2070f92e78bdbc326a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -0,0 +1,203 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { +// forward declaration +class DataPtr; +namespace hip { +namespace HIPCachingAllocatorMasqueradingAsCUDA { + +C10_HIP_API HIPCachingAllocator::HIPAllocator* get(); +C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); + +inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline double getMemoryFraction(c10::DeviceIndex device) { + return get()->getMemoryFraction(device); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +} + +inline void enable(bool value) { + return get()->enable(value); +} + +inline bool isEnabled() { + return get()->isEnabled(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline 
void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +} + +inline void recordAnnotation( + const std::vector>& md) { + return get()->recordAnnotation(md); +} + +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + 
const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} + +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +} + +inline void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setNoSplit(device, mempool_id); +} + +inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->getPoolUseCount(device, mempool_id); +} + +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) { + return get()->shareIpcHandle(ptr); +} + +inline std::string name() { + return get()->name(); +} + +inline hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, dev_to_access); +} + +} // namespace HIPCachingAllocatorMasqueradingAsCUDA +} // namespace hip +} // namespace c10 + 
+#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..be82a7c22e3f62e77ef9b9f232fe1f1ce864efa8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -0,0 +1,388 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// The includes of HIPGuard.h +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +// Use of c10::hip namespace here makes hipification easier, because +// I don't have to also fix namespaces. Sorry! +namespace c10 { namespace hip { + +// Note [Masquerading as CUDA] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// c10_hip is very easy to understand: it is HIPified from c10_cuda, +// and anywhere you said CUDA, the source code now says HIP. HIPified +// PyTorch is much harder to understand: it is HIPified from regular +// PyTorch, yes, but NO source-to-source translation from CUDA to +// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP". +// For example, when you use HIPified PyTorch, you say x.cuda() to +// move a tensor onto ROCm device. We call this situation "HIP +// masquerading as CUDA". +// +// This leads to a very awkward situation when we want to call c10_hip +// code from PyTorch, since c10_hip is expecting things to be called +// HIP, but PyTorch is calling them CUDA (masquerading as HIP). To +// fix this impedance mismatch, we have MasqueradingAsCUDA variants +// for all c10_hip classes. 
These translate between the "HIP" and "CUDA +// masquerading as HIP" worlds. For example, +// HIPGuardImplMasqueradingAsCUDA (this file) provides something like a +// HIPGuardImpl, but it reports its DeviceType as CUDA (e.g., type() +// returns CUDA, getDevice() reports the current HIP device as a CUDA +// device.) +// +// We should be able to delete all of these classes entirely once +// we switch PyTorch to calling a HIP a HIP. +// +// When you add a new MasqueradingAsCUDA class/function, you need to +// also update the rewrite rules in torch/utils/hipify/cuda_to_hip_mappings.py +// +// +// +// By the way, note that the cpp file associated with this also +// *overwrites* the entry in the DeviceGuardImpl registry for CUDA with +// this HIP implementation. + +struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplInterface { + static constexpr c10::DeviceType static_type = c10::DeviceType::CUDA; + HIPGuardImplMasqueradingAsCUDA() {} + HIPGuardImplMasqueradingAsCUDA(c10::DeviceType t) { + TORCH_INTERNAL_ASSERT(t == c10::DeviceType::CUDA); + } + c10::DeviceType type() const override { + return c10::DeviceType::CUDA; + } + Device exchangeDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_cuda()); + Device old_device = getDevice(); + if (old_device.index() != d.index()) { + C10_HIP_CHECK(hipSetDevice(d.index())); + } + return old_device; + } + Device getDevice() const override { + int device; + C10_HIP_CHECK(hipGetDevice(&device)); + return Device(c10::DeviceType::CUDA, device); + } + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_cuda()); + C10_HIP_CHECK(hipSetDevice(d.index())); + } + void uncheckedSetDevice(Device d) const noexcept override { + C10_HIP_CHECK_WARN(hipSetDevice(d.index())); + } + Stream getStream(Device d) const override { + return getCurrentHIPStreamMasqueradingAsCUDA(d.index()).unwrap(); + } + Stream getDefaultStream(Device d) const override { + return 
getDefaultHIPStreamMasqueradingAsCUDA(d.index()); + } + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPoolMasqueradingAsCUDA(priority, d.index()); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override { + return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index()); + } + Stream exchangeStream(Stream s) const override { + HIPStreamMasqueradingAsCUDA cs(s); + auto old_stream = getCurrentHIPStreamMasqueradingAsCUDA(s.device().index()); + setCurrentHIPStreamMasqueradingAsCUDA(cs); + return old_stream.unwrap(); + } + DeviceIndex deviceCount() const noexcept override { + int deviceCnt; + hipError_t _err; + _err = hipGetDeviceCount(&deviceCnt); + if(_err != hipErrorNoDevice && _err != hipSuccess) + C10_HIP_CHECK(_err); + return deviceCnt; + } + + // Event-related functions + // Note: hipEventCreateWithFlags should be called on the same device as + // the recording stream's device. + void createEvent( + hipEvent_t* hip_event, + const EventFlag flag) const { + // Maps PyTorch's Event::Flag to HIP flag + auto hip_flag = hipEventDefault; + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: + hip_flag = hipEventDisableTiming; + break; + case EventFlag::BACKEND_DEFAULT: + hip_flag = hipEventDefault; + break; + default: + TORCH_CHECK(false, "HIP event received unknown flag"); + } + + C10_HIP_CHECK(hipEventCreateWithFlags(hip_event, hip_flag)); + } + + void destroyEvent( + void* event, + const DeviceIndex device_index) const noexcept override { + if (!event) return; + auto hip_event = static_cast(event); + int orig_device; + C10_HIP_CHECK_WARN(hipGetDevice(&orig_device)); + C10_HIP_CHECK_WARN(hipSetDevice(device_index)); + C10_HIP_CHECK_WARN(hipEventDestroy(hip_event)); + C10_HIP_CHECK_WARN(hipSetDevice(orig_device)); + } + + void record(void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + TORCH_CHECK(device_index == -1 || 
device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + hipEvent_t hip_event = static_cast(*event); + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + + // Moves to stream's device to record + const auto orig_device = getDevice(); + setDevice(stream.device()); + + // Creates the event (lazily) + if (!hip_event) createEvent(&hip_event, flag); + C10_HIP_CHECK(hipEventRecord(hip_event, hip_stream)); + // Makes the void* point to the (possibly just allocated) HIP event + *event = hip_event; + + // Resets device + setDevice(orig_device); + } + + void block( + void* event, + const Stream& stream) const override { + if (!event) return; + hipEvent_t hip_event = static_cast(event); + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + const auto orig_device = getDevice(); + setDevice(stream.device()); + C10_HIP_CHECK(hipStreamWaitEvent( + hip_stream, + hip_event, + /*flags (must be zero)=*/ 0)); + setDevice(orig_device); + } + + bool queryEvent(void* event) const override { + if (!event) return true; + hipEvent_t hip_event = static_cast(event); + const hipError_t err = hipEventQuery(hip_event); + if (err != hipErrorNotReady) C10_HIP_CHECK(err); + else { + // ignore and clear the error if not ready + (void)hipGetLastError(); + } + return (err == hipSuccess); + } + + // Stream-related functions + bool queryStream(const Stream& stream) const override { + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + return hip_stream.query(); + } + + void synchronizeStream(const Stream& stream) const override { + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + hip_stream.synchronize(); + } + + void synchronizeEvent(void* event) const override { + if (!event) + return; + hipEvent_t hip_event = static_cast(event); + C10_HIP_CHECK(hipEventSynchronize(hip_event)); + } + + // Note: synchronizeDevice can be safely called from any device + void synchronizeDevice(const 
c10::DeviceIndex device_index) const override { + int orig_device{-1}; + C10_HIP_CHECK(hipGetDevice(&orig_device)); + C10_HIP_CHECK(hipSetDevice(device_index)); + C10_HIP_CHECK(hipDeviceSynchronize()); + C10_HIP_CHECK(hipSetDevice(orig_device)); + } + + void recordDataPtrOnStream( + const c10::DataPtr& data_ptr, + const Stream& stream) const override { + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream); + } + + double elapsedTime(void* event1, void* event2, const DeviceIndex device_index) + const override { + TORCH_CHECK( + event1 && event2, + "Both events must be recorded before calculating elapsed time."); + int orig_device; + C10_HIP_CHECK(hipGetDevice(&orig_device)); + C10_HIP_CHECK(hipSetDevice(device_index)); + hipEvent_t hip_event1 = static_cast(event1); + hipEvent_t hip_event2 = static_cast(event2); + float time_ms = 0; + // raise hipErrorNotReady if either event is recorded but not yet completed + C10_HIP_CHECK(hipEventElapsedTime(&time_ms, hip_event1, hip_event2)); + C10_HIP_CHECK(hipSetDevice(orig_device)); + return static_cast(time_ms); + } +}; + +// All of the guards which have HIPGuardImpl burned in need to also have +// variants using HIPGuardImplMasqueradingAsCUDA. + +/// This code is all a direct copy from c10/cuda/HIPGuardMasqueradingAsCUDA.h, but with +/// the correct InlineDeviceGuard burned in. Sorry about the +/// copy-pasting. 
+ +struct HIPGuardMasqueradingAsCUDA { + explicit HIPGuardMasqueradingAsCUDA() = delete; + explicit HIPGuardMasqueradingAsCUDA(DeviceIndex device_index) : guard_(device_index) {} + explicit HIPGuardMasqueradingAsCUDA(Device device) : guard_(device) {} + + HIPGuardMasqueradingAsCUDA(const HIPGuardMasqueradingAsCUDA&) = delete; + HIPGuardMasqueradingAsCUDA& operator=(const HIPGuardMasqueradingAsCUDA&) = delete; + HIPGuardMasqueradingAsCUDA(HIPGuardMasqueradingAsCUDA&& other) = delete; + HIPGuardMasqueradingAsCUDA& operator=(HIPGuardMasqueradingAsCUDA&& other) = delete; + + void set_device(Device device) { guard_.set_device(device); } + void reset_device(Device device) { guard_.reset_device(device); } + void set_index(DeviceIndex device_index) { guard_.set_index(device_index); } + Device original_device() const { return guard_.original_device(); } + Device current_device() const { return guard_.current_device(); } + + private: + c10::impl::InlineDeviceGuard guard_; +}; + +struct OptionalHIPGuardMasqueradingAsCUDA { + explicit OptionalHIPGuardMasqueradingAsCUDA() : guard_() {} + explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional device_opt) : guard_(device_opt) {} + explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional device_index_opt) : guard_(device_index_opt) {} + + OptionalHIPGuardMasqueradingAsCUDA(const OptionalHIPGuardMasqueradingAsCUDA&) = delete; + OptionalHIPGuardMasqueradingAsCUDA& operator=(const OptionalHIPGuardMasqueradingAsCUDA&) = delete; + OptionalHIPGuardMasqueradingAsCUDA(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete; + OptionalHIPGuardMasqueradingAsCUDA& operator=(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete; + + void set_device(Device device) { guard_.set_device(device); } + void reset_device(Device device) { guard_.reset_device(device); } + void set_index(DeviceIndex device_index) { guard_.set_index(device_index); } + std::optional original_device() const { return guard_.original_device(); } + std::optional 
current_device() const { return guard_.current_device(); } + void reset() { guard_.reset(); } + +private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +struct HIPStreamGuardMasqueradingAsCUDA { + explicit HIPStreamGuardMasqueradingAsCUDA() = delete; + explicit HIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {} + HIPStreamGuardMasqueradingAsCUDA(const HIPStreamGuardMasqueradingAsCUDA&) = delete; + HIPStreamGuardMasqueradingAsCUDA& operator=(const HIPStreamGuardMasqueradingAsCUDA&) = delete; + HIPStreamGuardMasqueradingAsCUDA(HIPStreamGuardMasqueradingAsCUDA&& other) = delete; + HIPStreamGuardMasqueradingAsCUDA& operator=(HIPStreamGuardMasqueradingAsCUDA&& other) = delete; + + void reset_stream(Stream stream) { guard_.reset_stream(stream); } + + HIPStreamMasqueradingAsCUDA original_stream() const { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.original_stream()); + } + HIPStreamMasqueradingAsCUDA current_stream() const { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.current_stream()); + } + + Device current_device() const { return guard_.current_device(); } + Device original_device() const { return guard_.original_device(); } + +private: + c10::impl::InlineStreamGuard guard_; +}; + +struct OptionalHIPStreamGuardMasqueradingAsCUDA { + explicit OptionalHIPStreamGuardMasqueradingAsCUDA() : guard_() {} + explicit OptionalHIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {} + explicit OptionalHIPStreamGuardMasqueradingAsCUDA(std::optional stream_opt) : guard_(stream_opt) {} + + OptionalHIPStreamGuardMasqueradingAsCUDA(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete; + OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete; + OptionalHIPStreamGuardMasqueradingAsCUDA(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete; + OptionalHIPStreamGuardMasqueradingAsCUDA& 
operator=(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete; + + void reset_stream(Stream stream) { guard_.reset_stream(stream); } + + std::optional original_stream() const { + auto r = guard_.original_stream(); + if (r.has_value()) { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value()); + } else { + return std::nullopt; + } + } + + std::optional current_stream() const { + auto r = guard_.current_stream(); + if (r.has_value()) { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value()); + } else { + return std::nullopt; + } + } + + void reset() { guard_.reset(); } + +private: + c10::impl::InlineOptionalStreamGuard guard_; +}; + +struct HIPMultiStreamGuardMasqueradingAsCUDA { + explicit HIPMultiStreamGuardMasqueradingAsCUDA(ArrayRef streams) + : guard_(unwrapStreams(streams)) {} + + HIPMultiStreamGuardMasqueradingAsCUDA(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete; + HIPMultiStreamGuardMasqueradingAsCUDA& operator=(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete; + HIPMultiStreamGuardMasqueradingAsCUDA(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete; + HIPMultiStreamGuardMasqueradingAsCUDA& operator=(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete; + +private: + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(ArrayRef hipStreams) { + std::vector streams; + streams.reserve(hipStreams.size()); + for (const HIPStreamMasqueradingAsCUDA& hipStream : hipStreams) { + streams.push_back(hipStream); + } + return streams; + } +}; + +}} // namespace c10::hip + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..48f1459396b82283290e457a16f9ec66ee500601 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -0,0 +1,140 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// Use of c10::hip namespace here makes hipification easier, because +// I don't have to also fix namespaces. Sorry! +namespace c10 { namespace hip { + +// See Note [Masquerading as CUDA] for motivation + +class HIPStreamMasqueradingAsCUDA { +public: + + enum Unchecked { UNCHECKED }; + + explicit HIPStreamMasqueradingAsCUDA(Stream stream) + : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) { + // We did the coercion unchecked; check that it was right. + TORCH_CHECK(stream.device().is_cuda() /* !!! */); + } + + explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream) + // Unsafely coerce the "CUDA" stream into a HIP stream + : stream_( + HIPStream( + Stream( + Stream::UNSAFE, + Device(c10::DeviceType::HIP, stream.device_index()), + stream.id()) + ) + ) {} + + // New constructor, just for this. Does NOT coerce. 
+ explicit HIPStreamMasqueradingAsCUDA(HIPStream stream) : stream_(stream) {} + + bool operator==(const HIPStreamMasqueradingAsCUDA& other) const noexcept { + return stream_ == other.stream_; + } + + bool operator!=(const HIPStreamMasqueradingAsCUDA& other) const noexcept { + return stream_ != other.stream_; + } + + operator hipStream_t() const { return stream_.stream(); } + + operator Stream() const { + // Unsafely coerce HIP stream into a "CUDA" stream + return Stream(Stream::UNSAFE, device(), id()); + } + + DeviceIndex device_index() const { return stream_.device_index(); } + + // Unsafely coerce HIP device into CUDA device + c10::DeviceType device_type() const { return c10::DeviceType::CUDA; } + + Device device() const { + // Unsafely coerce HIP device into CUDA device + return Device(c10::DeviceType::CUDA, stream_.device_index()); + } + + StreamId id() const { return stream_.id(); } + bool query() const { return stream_.query(); } + void synchronize() const { stream_.synchronize(); } + int priority() const { return stream_.priority(); } + hipStream_t stream() const { return stream_.stream(); } + + Stream unwrap() const { + // Unsafely coerce HIP stream into "CUDA" stream + return Stream(Stream::UNSAFE, device(), id()); + } + + c10::StreamData3 pack3() const noexcept { + // Unsafely coerce HIP stream into "CUDA" stream before packing + return unwrap().pack3(); + } + + static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id, + DeviceIndex device_index, + c10::DeviceType device_type) { + // NB: constructor manages CUDA->HIP translation for us + return HIPStreamMasqueradingAsCUDA(Stream::unpack3( + stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { return HIPStream::priority_range(); } + + // New method, gets the underlying HIPStream + HIPStream hip_stream() const { return stream_; } + +private: + HIPStream stream_; +}; + +HIPStreamMasqueradingAsCUDA +inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = 
false, DeviceIndex device = -1) { + return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device)); +} + +HIPStreamMasqueradingAsCUDA +inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) { + return HIPStreamMasqueradingAsCUDA(getStreamFromPool(priority, device)); +} + +HIPStreamMasqueradingAsCUDA +inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) { + return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device)); +} + +inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) { + return HIPStreamMasqueradingAsCUDA(getDefaultHIPStream(device_index)); +} + +inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) { + return HIPStreamMasqueradingAsCUDA(getCurrentHIPStream(device_index)); +} + +inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) { + setCurrentHIPStream(stream.hip_stream()); +} + +inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) { + stream << s.hip_stream() << " (masquerading as CUDA)"; + return stream; +} + +}} // namespace c10::hip + +namespace std { + template <> + struct hash { + size_t operator()(c10::hip::HIPStreamMasqueradingAsCUDA s) const noexcept { + return std::hash{}(s.unwrap()); + } + }; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c1d088a57f4a74af873ff240661852c76af3e144 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t); +DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..314bc2c06d7acdd436d11bc9d20eb4553efee103 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at { +struct TensorIteratorBase; + +namespace native { +inline namespace CPU_CAPABILITY { + +void direct_copy_kernel(TensorIteratorBase &iter); +void copy_kernel(TensorIterator& iter, bool /*non_blocking*/); + +}}} // namespace at::native::CPU_CAPABILITY + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..86cf48ff2a6823ccb27987121fd797d75e2b70e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h @@ -0,0 +1,430 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CPU_CAPABILITY_AVX2 +#include +#include +#endif + + + + +namespace at::native::templates::cpu { +namespace { + +// ==================================================== Random ======================================================== + +template +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG generator) { + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cpu", AT_WRAP([&] { + std::lock_guard lock(generator->mutex_); + cpu_serial_kernel(iter, [range, base, generator]() -> scalar_t { + uniform_int_from_to_distribution random(range, base); + return random(generator); + }); + }), kBool, kHalf, kBFloat16, AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +} + +// This is the special kernel to handle single specific case: +// from(inclusive) = std::numeric_limits::lowest() +// to(exclusive) = None (= std::numeric_limits::max() + 1) +template +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cpu", [&] { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + std::lock_guard lock(generator->mutex_); + 
cpu_serial_kernel(iter, [generator]() -> scalar_t { + uniform_int_full_range_distribution random; + return random(generator); + }); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_cpu handles only int64, double, float and bfloat16"); + } + }); +} + +template +struct RandomFromToKernel { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { + random_from_to_kernel(iter, range, base, check_generator(gen)); + } + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_full_64_bits_range_kernel(iter, check_generator(gen)); + } +}; + +template +void random_kernel(TensorIteratorBase& iter, RNG generator) { + std::lock_guard lock(generator->mutex_); + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cpu", [&] { + cpu_serial_kernel(iter, [generator]() -> scalar_t { + uniform_int_distribution random; + return random(generator); + }); + }); +} + +template +struct RandomKernel { + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_kernel(iter, check_generator(gen)); + } +}; + +// ==================================================== Normal ======================================================== + +#ifdef CPU_CAPABILITY_AVX2 +void normal_fill_16_AVX2(float *data, + const __m256* two_pi, + const __m256* one, + const __m256* minus_two, + const __m256* mean, + const __m256* std_v) { + const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data)); + const __m256 u2 = _mm256_loadu_ps(data + 8); + // sincos256_ps and log256_ps are from avx_mathfun.h + const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1))); + const __m256 theta = _mm256_mul_ps(*two_pi, u2); + __m256 sintheta, costheta; + sincos256_ps(theta, &sintheta, &costheta); + const __m256 n1 = _mm256_mul_ps(radius, costheta); + const __m256 n2 = _mm256_mul_ps(radius, sintheta); + _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *std_v, 
*mean)); + _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *std_v, *mean)); +} + +template +void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) { + float *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + const __m256 two_pi = _mm256_set1_ps(2.0f * c10::pi); + const __m256 one = _mm256_set1_ps(1.0f); + const __m256 minus_two = _mm256_set1_ps(-2.0f); + const __m256 mean_v = _mm256_set1_ps(mean); + const __m256 std_v = _mm256_set1_ps(std); + + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &std_v); + } + + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &std_v); + } +} +#endif + +template +void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { + for (const auto j : c10::irange(8)) { + const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. 
+ const scalar_t u2 = data[j + 8]; + const scalar_t radius = std::sqrt(-2 * std::log(u1)); + const scalar_t theta = 2.0f * c10::pi * u2; + data[j] = radius * std::cos(theta) * std + mean; + data[j + 8] = radius * std::sin(theta) * std + mean; + } +} + +#if defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +static void normal_fill_16_VSX(float *data,const Vectorized &two_pi,const Vectorized &one,const Vectorized &minus_two,const Vectorized &mean,const Vectorized &std) { + using Vec = Vectorized; + Vec u1=one-Vec::loadu(data); + Vec u2=Vec::loadu(data+8); + Vec radius=(minus_two * u1.log()); + radius=radius.sqrt(); + Vec theta=two_pi * u2; + Vec output_vec=radius * theta.cos() * std + mean; + Vec output_vec2=radius * theta.sin() * std + mean; + output_vec.store(data); + output_vec2.store(data+8); +} + +template +void normal_fill_VSX(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { + float *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + + using Vec = Vectorized; + const Vec two_pi = Vec(2.0f * c10::pi); + const Vec one = Vec(1.0f); + const Vec minus_two = Vec(-2.0f); + const Vec var_vec = Vec(std); + const Vec mean_vec = Vec(mean); + + for (int64_t i = 0; i < size - 15; i += 16) { + if(Vec::size()==8) { + normal_fill_16_VSX(data + i, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data + i, mean, std); + } + } + if (size % 16 != 0) { + // Recompute the last 16 values. 
+ data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + if(Vec::size()==8){ + normal_fill_16_VSX(data, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data, mean, std); + } + } +} +#endif //VSX + +template +void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { + scalar_t *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16(data + i, mean, std); + } + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + normal_fill_16(data, mean, std); + } +} + +template +void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) { + auto size = self.numel(); + if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { +#ifdef CPU_CAPABILITY_AVX2 + normal_fill_AVX2(self, static_cast(mean), static_cast(std), generator); +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) + normal_fill_VSX(self, static_cast(mean), static_cast(std), generator); +#else + normal_fill(self, static_cast(mean), static_cast(std), generator); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "normal_kernel_cpu", [&] { + if (size >= 16 && self.is_contiguous()) { + normal_fill(self, static_cast(mean), static_cast(std), generator); + } else { + auto iter = TensorIterator::borrowing_nullary_op(self); + std::lock_guard lock(generator->mutex_); + cpu_serial_kernel(iter, [mean, std, generator]() -> scalar_t { + at::normal_distribution normal(mean, std); + return 
static_cast(normal(generator)); + }); + } + }); + } +} + +template +struct NormalKernel { + void operator()(Tensor& self, double mean, double std, std::optional gen) { + normal_kernel(self, mean, std, check_generator(gen)); + } +}; + +// ==================================================== Uniform ======================================================= + +template +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG generator) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "uniform_kernel_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + auto from = static_cast(from_); + auto to = static_cast(to_); + at::uniform_real_distribution uniform(from, to); + cpu_serial_kernel(iter, [&uniform, generator]() -> scalar_t { + return static_cast(uniform(generator)); + }); + }); +} + +template +struct UniformKernel { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { + uniform_kernel(iter, from, to, check_generator(gen)); + } +}; + +// ==================================================== Cauchy ======================================================== + +template +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG generator) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "cauchy_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::cauchy_distribution cauchy(median, sigma); + cpu_serial_kernel(iter, [&cauchy, generator]() -> scalar_t { + return static_cast(cauchy(generator)); + }); + }); +} + +template +struct CauchyKernel { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { + cauchy_kernel(iter, median, sigma, check_generator(gen)); + } +}; + +// ================================================== LogNormal ======================================================= + +template +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG generator) { + 
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::lognormal_distribution logNormal(mean, std); + cpu_serial_kernel(iter, [&logNormal, generator]() -> scalar_t { + return static_cast(logNormal(generator)); + }); + }); +} + +template +struct LogNormalKernel { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { + log_normal_kernel(iter, mean, std, check_generator(gen)); + } +}; + +// =================================================== Geometric ====================================================== + +template +void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::geometric_distribution geometric(p); + cpu_serial_kernel(iter, [&geometric, generator]() -> scalar_t { + return static_cast(geometric(generator)); + }); + }); +} + +template +struct GeometricKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + geometric_kernel(iter, p, check_generator(gen)); + } +}; + +// ================================================== Exponential ===================================================== + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) { + TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. 
dtype must be a floating point but you specified ", iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::exponential_distribution exponential(lambda); + cpu_serial_kernel(iter, [&exponential, generator]() -> scalar_t { + return static_cast(exponential(generator)); + }); + }); +} + +template +struct ExponentialKernel { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { + exponential_kernel(iter, lambda, check_generator(gen)); + } +}; + +// ================================================== Bernoulli ======================================================= + +template +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, + self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(generator->mutex_); + using self_t = scalar_t; + auto p_cpu = p_.to(kCPU); + auto p = expand_inplace(self, p_cpu); + auto iter = TensorIteratorConfig() + .add_output(self) + .add_const_input(*p) + .check_all_same_dtype(false) + .build(); + if (p->scalar_type() == kDouble) { + cpu_serial_kernel(iter, [&](const double p_val) -> self_t { + at::bernoulli_distribution bernoulli(p_val); + return static_cast(bernoulli(generator)); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, + p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] { + using p_t = scalar_t; + cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t { + at::bernoulli_distribution bernoulli(p_val); + return static_cast(bernoulli(generator)); + }); + }); + } + }); +} + +template +void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, 
at::ScalarType::BFloat16, at::ScalarType::Half, + self.scalar_type(), "bernoulli_scalar_cpu_", [&] { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(generator->mutex_); + auto iter = TensorIterator::borrowing_nullary_op(self); + cpu_serial_kernel(iter, [p, generator]() -> scalar_t { + at::bernoulli_distribution bernoulli(p); + return static_cast(bernoulli(generator)); + }); + }); +} + +template +struct BernoulliKernel { + void operator()(const TensorBase &self, double p, std::optional gen) { + bernoulli_kernel(self, p, check_generator(gen)); + } + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { + bernoulli_kernel(self, p_, check_generator(gen)); + } +}; + +}} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h new file mode 100644 index 0000000000000000000000000000000000000000..e214126e00d106b18702cb926c9a0c1fe5550c27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h @@ -0,0 +1,88 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to +// access constants such as M_SQRT2 and M_2_SQRTPI. +#ifdef _WIN32 +#define _USE_MATH_DEFINES +#include +#include +#endif // _WIN32 + +#include +#include // For c10::is_reduced_floating_point_v. 
+ +namespace at::native { +inline namespace CPU_CAPABILITY { +constexpr double kGeluBeta = M_SQRT2 * M_2_SQRTPI * 0.5; +constexpr double kGeluKappa = 0.044715; + +template +using reduced_fp_to_float_t = std::conditional_t, float, T>; + +template , bool> = true> +float reduced_fp_to_float(T x) { + return float(x); +} + +template , bool> = true> +T reduced_fp_to_float(T x) { + return x; +} + +template +T scalar_gelu_approximated_with_tanh(T x) { + using opmath_t = reduced_fp_to_float_t; + auto x_float = reduced_fp_to_float(x); + auto x_cube = x_float * x_float * x_float; + auto inner = opmath_t(kGeluBeta) * (x_float + opmath_t(kGeluKappa) * x_cube); + return opmath_t(0.5) * x_float * (opmath_t(1) + std::tanh(inner)); +} + +template , bool> = true> +vec::Vectorized vectorized_gelu_approximated_with_tanh(vec::Vectorized x) { + const vec::Vectorized kPointFiveVec(T(0.5)); + const vec::Vectorized kOneVec(T(1)); + const vec::Vectorized kGeluBetaVec((T(kGeluBeta))); + const vec::Vectorized kGeluKappaVec((T(kGeluKappa))); + auto x_cube = x * x * x; + vec::Vectorized inner_vec = kGeluBetaVec * (x + kGeluKappaVec * x_cube); + return kPointFiveVec * x * (kOneVec + inner_vec.tanh()); +} + +template , bool> = true> +vec::Vectorized vectorized_gelu_approximated_with_tanh(vec::Vectorized x) { + auto [x0, x1] = at::vec::convert_to_float(x); + return at::vec::convert_from_float( + vectorized_gelu_approximated_with_tanh(x0), + vectorized_gelu_approximated_with_tanh(x1)); +} + + +template +T scalar_gelu(T x) { + using opmath_t = reduced_fp_to_float_t; + const auto kAlpha = opmath_t(M_SQRT1_2); + return reduced_fp_to_float(x) * opmath_t(0.5) * (opmath_t(1) + std::erf(reduced_fp_to_float(x) * kAlpha)); +} + +template, bool> = true> +vec::Vectorized vectorized_gelu(vec::Vectorized x) { + const vec::Vectorized kAlphaVec(T(M_SQRT1_2)); + const vec::Vectorized kOneVec(T(1)); + const vec::Vectorized kPointFiveVec(T(0.5)); + return x * kPointFiveVec * (kOneVec + (x * kAlphaVec).erf()); +} + 
+template, bool> = true> +vec::Vectorized vectorized_gelu(vec::Vectorized x) { + auto [x0, x1] = at::vec::convert_to_float(x); + return at::vec::convert_from_float(vectorized_gelu(x0), vectorized_gelu(x1)); +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h new file mode 100644 index 0000000000000000000000000000000000000000..2c3f03718a9d5c08dc5c0fa47bedfe12e2a8fd95 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h @@ -0,0 +1,66 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +// custom min and max to be used in logcumsumexp for complex arguments +template +std::pair, c10::complex> _logcumsumexp_minmax(c10::complex x, c10::complex y) { + if (at::_isnan(y)) { // either real is nan or imag is nan + return std::make_pair(y, y); + } else if (at::_isnan(x)) { // either real is nan or imag is nan + return std::make_pair(x, x); + } else { + return (x.real() < y.real()) ? std::make_pair(x, y) : std::make_pair(y, x); + } +} + +template +scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + scalar_t min = at::_isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan + scalar_t max = at::_isnan(y) ? 
y : std::max(x, y); // std::max returns first arg if one of the args is nan + if (min != max || std::isfinite(min)) { + // nan will be propagated here + return std::log1p(std::exp(min - max)) + max; + } else { + // special case to correctly handle infinite cases + return x; + } +} + +template +c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) { + auto [min, max] = _logcumsumexp_minmax(x, y); + auto min_real = std::real(min); + auto max_real = std::real(max); + + if (at::_isnan(min)) { // either real is nan or imag is nan + // handling the "infectious" NaNs + return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}; + } else if (!std::isfinite(min_real) && (min_real == max_real)) { + if (min_real < 0) { + // handle the -inf case, the imaginary part here does not really matter as the exp(value) + // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined. + // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + return std::log(std::exp(min) + std::exp(max)); + } + } else { + return std::log1p(std::exp(min - max)) + max; + } +} + +} // end namespace +} //end at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..6af3a57749a51e46a6a536d164b2a1218e5bb269 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h @@ -0,0 +1,315 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at::native { inline namespace CPU_CAPABILITY { + +using namespace vec; + +#define VEC_LOOP_HEADER(func_t, data) \ + using scalar_t = typename function_traits::result_type; \ + using Vec = Vectorized; \ + char* out_ptr = data[0]; \ + (void) out_ptr; + +// reduction that is contiguous over the input in dim 0 +template +inline bool is_contiguous_reduction(const int64_t* strides) { + return strides[0] == 0 && + strides[1] == sizeof(typename traits::arg2_t); +} + +// reduction that is contiguous over the input in dim 1 +template +inline bool is_outer_reduction(const int64_t* strides) { + return strides[0] == 0 && + strides[2] == sizeof(typename traits::result_type) && + strides[3] == sizeof(typename traits::arg2_t); +} + +template +inline void vectorized_reduction(char** data, int64_t n, int64_t stride, + func_t op, vec_func_t vop, bool reduce) { + VEC_LOOP_HEADER(func_t, data) + const char* in1_ptr = data[1]; + Vec acc[4]; + for (const auto j : c10::irange(4)) { + acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t)); + } + for (const auto i : c10::irange(1, n)) { + const char* ptr = in1_ptr + stride * i; + acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t)))); + acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t)))); + acc[2] = 
vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t)))); + acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t)))); + } + if (reduce) { + scalar_t buffer[Vec::size()]; + acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3])); + acc[0].store(buffer); + for (const auto j : c10::irange(1, Vec::size())) { + buffer[0] = op(buffer[0], buffer[j]); + } + auto dst = (scalar_t*)out_ptr; + *dst = op(*dst, buffer[0]); + } else { + for (const auto j : c10::irange(4)) { + auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t); + acc[j] = vop(acc[j], Vec::loadu(dst)); + acc[j].store(dst); + } + } +} + +template +inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) { + for ([[maybe_unused]] const auto j : c10::irange(n)) { + f(); + data[0] += strides[0]; + data[1] += strides[1]; + } +} + +// computes the reduction out = op(out, in) +template +inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) { + VEC_LOOP_HEADER(func_t, data) + constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t); + int64_t count = n / (4 * Vec::size()); + if (count > 0) { + vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true); + } + char* ptrs[3] = { data[0], data[0], data[1] }; + int64_t strides[] = { 0, 0, sizeof(scalar_t) }; + basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op); +} + +// computes the reduction out = op(out, in) +template +inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) { + VEC_LOOP_HEADER(func_t, data) + + // reduce down each column of 4 * Vec::size() elements. 
+ constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t); + int64_t outer_stride[2] = { vector_stride, vector_stride }; + UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] { + vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false); + }); + + // reduce down the remaining columns + int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) }; + int64_t remaining = size1 % (4 * Vec::size()); + UNARY_OUTER_LOOP(data, step, remaining, [&] { + char* ptrs[3] = { data[0], data[0], data[1] }; + int64_t strides[] = { 0, 0, inner_stride }; + basic_loop(ptrs, strides, 0, size0, op); + }); +} + +template +static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) { + // static_assert(std::is_same_v, "data types must match"); + if (index < num_outputs) { + char *out = (char *) iter.data_ptr(index); + *(res_t *) out = result; + } +} + +template +static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) { + AT_ASSERT(num_outputs == 1); + set_result(0, result, iter, num_outputs); +} + +template +inline std::enable_if_t +for_each_in_tuple(const std::tuple& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) { + return i; +} + +template +inline std::enable_if_t +for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { + if (i < (size_t)num_outputs) { + set_result(i, std::get(t), iter, num_outputs); + return for_each_in_tuple(t, iter, num_outputs); + } + return i; +} + +template +static void set_results(const std::tuple& result, const TensorIteratorBase &iter, const int num_outputs) { + AT_ASSERT(num_outputs >= 1); + std::size_t result_size = for_each_in_tuple(result, iter, num_outputs); + AT_ASSERT((size_t)num_outputs == result_size); +} + +template +struct all_same : std::conjunction< + std::is_same... +> {}; + +// data_t is the input/output data type. 
+// acc_t is a type that contains all the necessary data +// to continue reducing. +// index_t is a one-dimensional index +// +// ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy +// the following. +// reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value. +// combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one. +// project: acc_t -> out_t finishes the reduction, getting the required output. +// +// Additionally, acc_t must be default-constructible: +// acc_t {} is an identity for combine, +// and project(acc_t {}) is the value of the operation on zero elements. +// +// The point of `combine` is to support parallelization - +// the idea is to one sequence of `reduce` calls per thread of execution, +// and then to combine them at the end with `combine`. +// +// If there is more than one output element, +// our parallelization strategy is to use one thread for each of them, +// which means that `combine` will never be called. +// +// If, on the other hand, there is only one, then we split the input into +// into several pieces, reduce each separately, and then combine them. 
+ +template +void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { + using rf_t = decltype(&ops_t::reduce); + using cf_t = decltype(&ops_t::combine); + using pf_t = decltype(&ops_t::project); + using r_traits = binary_function_traits; + using c_traits = binary_function_traits; + using p_traits = unary_function_traits; + using acc_t = typename p_traits::arg1_t; + using data_t = typename r_traits::arg2_t; + static_assert( + all_same< + acc_t, + init_t, + typename r_traits::arg1_t, + typename r_traits::result_type, + typename c_traits::arg1_t, + typename c_traits::arg2_t, + typename c_traits::result_type>::value, + "all accumulate types must match"); + static_assert( + std::is_default_constructible_v, + "the accumulate type must be default-constructible" + ); + const int num_outputs = iter.noutputs(); + iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) { + auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t { + int ntensors = sub_iter.ntensors(); + sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) { + AT_ASSERT(ntensors - num_outputs == 1); + char *in = data[ntensors - 1]; + int64_t stride = strides[ntensors - 1]; + for (const auto i : c10::irange(size)) { + acc = ops.reduce(acc, c10::load(in), begin + i); + in += stride; + } + }, {begin, end}); + return ops.translate_idx(acc, sub_iter.view_offsets()[0]); + }; + acc_t total_acc = init; + auto numel = sub_iter.numel(); + if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || + at::in_parallel_region()) { + total_acc = reduction_body(total_acc, 0, numel); + } else { + int max_threads = at::get_num_threads(); + AT_ASSERT(max_threads > 0); + static_assert( + !std::is_same_v, + "Concurrently modifying different references into std::vector is UB." 
+ ); + std::vector buffer((unsigned)max_threads, init); + at::parallel_for(0, numel, internal::GRAIN_SIZE, + [&](int64_t begin, int64_t end) { + auto& acc = buffer[at::get_thread_num()]; + acc = reduction_body(acc, begin, end); + } + ); + for (const auto i : c10::irange(max_threads)) { + total_acc = ops.combine(total_acc, buffer[i]); + } + } + set_results(ops.project(total_acc), sub_iter, num_outputs); + }); +} + +template +void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) { + using traits = binary_function_traits; + static_assert( + all_same< + typename traits::result_type, + typename traits::arg1_t, + typename traits::arg2_t>::value, + "all types must match"); + + iter.output_base().fill_(ident); + iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) { + int64_t outer_strides[] = { strides[2], strides[3] }; + if (is_contiguous_reduction(strides)) { + // input is contiguous in dim 0, output is reduced in dim 0 + UNARY_OUTER_LOOP(data, outer_strides, size1, [&] { + vectorized_inner_reduction(data, size0, op, vop); + }); + } else if (is_outer_reduction(strides)) { + // input and output are contiguous in dim 1 + int64_t inner_stride = strides[1]; // stride of input in dim 0 + vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop); + } else { + UNARY_OUTER_LOOP(data, outer_strides, size1, [&] { + char* ptrs[3] = { data[0], data[0], data[1] }; + int64_t inner_strides[3] = { strides[0], strides[0], strides[1] }; + basic_loop(ptrs, inner_strides, 0, size0, op); + }); + } + }); +} + +// when reduction is on most inner dimension (dim 0 in TensorIterator) +// and input has contiguous most inner dimension, `binary_kernel_reduce_lastdim` +// can be used. 
+inline bool is_reduce_lastdim(TensorIteratorBase& iter) { + return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0) + && iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1); +} + +template +void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) { + auto shape = iter.shape(); + int64_t dim_size = shape[0]; + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size); + TensorIterator sub_iter(iter); + // create sub iterator to parallel on all non-reduce-dims + sub_iter.narrow(0, 0, 1); + auto loop = [&](char** data, const int64_t* strides, int64_t size) { + char* out = data[0]; + char* in = data[1]; + for (int64_t i = 0; i < size; ++i) { + reduce_op(out, in, dim_size); + out += strides[0]; + in += strides[1]; + } + }; + sub_iter.for_each(loop, grain_size); +} + +}} // namespace at::native:: + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a2b3bf4061ffc690c1eb1b6a57577510403f1eeb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using weight_norm_fn = void(*)( + TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, int64_t); +using weight_norm_backward_fn = void(*)( + TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, + const TensorBase&, const TensorBase&, int64_t); + 
+DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub) +DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fa31a27e798745e9a638473a30c97ff83510de7e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h @@ -0,0 +1,216 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +template using opmath_t = at::opmath_type; + +constexpr int64_t kChunkSize = 16; + +template +void AddMoments( + int64_t m0_add, + const T& m1_add, + const T& m2_add, + int64_t& m0, + T& m1, + T& m2) { + const int64_t n = m0 + m0_add; + const T c = n == 0 ? static_cast(0) : static_cast(m0_add) / static_cast(n); + const T delta = m1_add - m1; + m1 += c * delta; + m2 += m2_add + delta * delta * c * static_cast(m0); + m0 = n; +} + +template +C10_ALWAYS_INLINE void AddMomentsVec( + int64_t m0_add, + const vec::Vectorized& m1_add, + const vec::Vectorized& m2_add, + int64_t& m0, + vec::Vectorized& m1, + vec::Vectorized& m2) { + using Vec = vec::Vectorized; + const int64_t n = m0 + m0_add; + const T c = n == 0 ? 
static_cast(0) : static_cast(m0_add) / static_cast(n); + const Vec c_vec(c); + const Vec delta = m1_add - m1; + const Vec m2_tmp = m2 + m2_add; + const Vec c_vec_delta = c_vec * delta; + const Vec m0_delta = delta * Vec(static_cast(m0)); + m1 = m1 + c_vec_delta; + m2 = fmadd(m0_delta, c_vec_delta, m2_tmp); + m0 = n; +} + +template +inline std::enable_if_t>, void> +UpdateMomentsVec( + int64_t m0, + const T* X_ptr, + const std::array>, kChunkSize>& c_vecs, + int64_t& m0_stk0, + vec::Vectorized>& m1_stk0, + vec::Vectorized>& m2_stk0) { + using Vec = vec::Vectorized>; + Vec m1_vec(0); + Vec m2_vec(0); + for (const auto j : c10::irange(m0)) { + const Vec x_vec = Vec::loadu(X_ptr + j * Vec::size()); + const Vec tmpVec = c_vecs[j]; + const Vec delta_vec = x_vec - m1_vec; + m1_vec = fmadd(tmpVec, delta_vec, m1_vec); + const Vec tmpVec2 = x_vec - m1_vec; + m2_vec = fmadd(delta_vec, tmpVec2, m2_vec); + } + AddMomentsVec(m0, m1_vec, m2_vec, m0_stk0, m1_stk0, m2_stk0); +} + +// each bfloat16/half vector will be converted to two float vectors, +// and accumulated successively on m1_stk0/m2_stk0. 
+template +inline std::enable_if_t>, void> +UpdateMomentsVec( + int64_t m0, + const T* X_ptr, + const std::array>, kChunkSize>& c_vecs, + int64_t& m0_stk0, + vec::Vectorized>& m1_stk0, + vec::Vectorized>& m2_stk0) { + using Vec = vec::Vectorized; + using fVec = vec::Vectorized>; + fVec m1_fvec0(0), m1_fvec1(0); + fVec m2_fvec0(0), m2_fvec1(0); + for (const auto j : c10::irange(m0)) { + const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size()); + const fVec tmpVec = c_vecs[j]; + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + const fVec delta_fvec0 = x_fvec0 - m1_fvec0; + const fVec delta_fvec1 = x_fvec1 - m1_fvec1; + m1_fvec0 = fmadd(delta_fvec0, tmpVec, m1_fvec0); + m1_fvec1 = fmadd(delta_fvec1, tmpVec, m1_fvec1); + const fVec delta_fvec2 = x_fvec0 - m1_fvec0; + const fVec delta_fvec3 = x_fvec1 - m1_fvec1; + m2_fvec0 = fmadd(delta_fvec0, delta_fvec2, m2_fvec0); + m2_fvec1 = fmadd(delta_fvec1, delta_fvec3, m2_fvec1); + } + AddMomentsVec(m0, m1_fvec0, m2_fvec0, m0_stk0, m1_stk0, m2_stk0); + AddMomentsVec(m0, m1_fvec1, m2_fvec1, m0_stk0, m1_stk0, m2_stk0); +} + +// Compute rowwise moments by Welford algorithm and cascade sum to improve +// numerical stability. 
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +// https://en.wikipedia.org/wiki/Pairwise_summation +template +std::pair, opmath_t> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { + using math_t = opmath_t; + + constexpr int64_t kVecSize = vec::Vectorized::size(); + constexpr int64_t kAccVecSize = vec::Vectorized::size(); + const int64_t n = N / kVecSize; + const int64_t m = divup(n, kChunkSize); + const int64_t depth = utils::CeilLog2(m); + + using Vec = vec::Vectorized; + const Vec kZeroVec(math_t(0)); + std::array m0_stk = {{0}}; + std::array m1_stk; + m1_stk.fill(kZeroVec); + std::array m2_stk; + m2_stk.fill(kZeroVec); + + for (const auto i : c10::irange(m)) { + const T* X_ptr = X + i * kChunkSize * kVecSize; + const int64_t m0 = std::min(kChunkSize, n - i * kChunkSize); + static std::array c_vecs = ([]() { + std::array result; + for (const auto i : c10::irange(kChunkSize)) { + result[i] = Vec(math_t(1) / static_cast(i + 1)); + } + return result; + })(); + UpdateMomentsVec(m0, X_ptr, c_vecs, m0_stk[0], m1_stk[0], m2_stk[0]); + + int64_t mask = i + 1; + for (int64_t j = 1; j < depth && (mask & 1) == 0; ++j) { + AddMomentsVec( + m0_stk[j - 1], + m1_stk[j - 1], + m2_stk[j - 1], + m0_stk[j], + m1_stk[j], + m2_stk[j]); + m0_stk[j - 1] = 0; + m1_stk[j - 1] = kZeroVec; + m2_stk[j - 1] = kZeroVec; + mask >>= 1; + } + } + for (const auto i : c10::irange(1, depth)) { + AddMomentsVec( + m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]); + } + + std::array m1_arr{}; + std::array m2_arr{}; + m1_stk[0].store(m1_arr.data()); + m2_stk[0].store(m2_arr.data()); + + int64_t m0 = 0; + math_t m1 = 0; + math_t m2 = 0; + for (int64_t i = n * kVecSize; i < N; ++i) { + math_t x = static_cast(X[i]); + const math_t delta = x - m1; + ++m0; + m1 += delta / static_cast(m0); + m2 += delta * (x - m1); + } + // for BFloat16, each vector in m1_arr/m2_arr holds 2*n accumulated result + int64_t m0_add = n * kVecSize / kAccVecSize; + for (const 
auto i : c10::irange(kAccVecSize)) { + AddMoments(m0_add, m1_arr[i], m2_arr[i], m0, m1, m2); + } + + return std::make_pair(m1, m2 / static_cast(N - ddof)); +} + +template +std::pair, opmath_t> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) { + using Vec = vec::Vectorized; + constexpr int64_t kVecSize = Vec::size(); + const int64_t n = N / kVecSize; + const int64_t m = divup(n, kChunkSize); + const int64_t depth = utils::CeilLog2(m); + if (depth <= 4) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 8) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 16) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 32) { + return RowwiseMomentsImpl(X, N, ddof); + } else { + return RowwiseMomentsImpl(X, N, ddof); + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1c036c8edba98f808f615ff5a522942a10b05bd3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh @@ -0,0 +1,161 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::cuda::detail { + +using Strides = std::array; + +template < + typename DtypeA, + typename DtypeB, + typename DtypeOutput, + typename DtypeScale, + typename ProblemShape, + typename StrideA, + typename StrideB, + typename StrideOutput> +__global__ void prepare_grouped_gemm_data( + DtypeA* A, + DtypeB* B, + DtypeOutput* output, + DtypeScale* scale_A, + DtypeScale* scale_B, + 
DtypeA** A_ptrs, + DtypeB** B_ptrs, + DtypeOutput** output_ptrs, + DtypeScale** inputA_scale_ptrs, + DtypeScale** inputB_scale_ptrs, + ProblemShape* problem_sizes, + // Strides for cutlass, cute::Stride + StrideA* stride_A, + StrideB* stride_B, + StrideOutput* stride_output, + const int32_t* offs, + int32_t M, + int32_t N, + int32_t K, + // Original strides of the input tensors + Strides tensor_StrideA, + Strides tensor_StrideB, + Strides tensor_StrideOutput, + Strides tensor_ShapeA, + Strides tensor_ShapeB, + int64_t a_scale_stride, + int64_t b_scale_stride, + bool a_row_major = true, + bool b_row_major = false) { + int32_t tid = threadIdx.x; + int32_t delta = 0; + int32_t offset = 0; + if (offs != nullptr) { + int32_t start = tid == 0 ? 0 : offs[tid - 1]; + offset = offs[tid]; + delta = offset - start; + CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n"); + + // TMA transfers require global memory tensor addresses to be + // aligned to 16 bytes. + if (tid < blockDim.x - 1) { + // Check this requirement for input tensors, in case group + // addresses are increased along the dynamic dimension. + if ((K < 0 && a_row_major) || // 2D/2D: check along K dimension + (M < 0 && !a_row_major)) { // 3D/2D: check along N dimension + int align = 128 / cutlass::sizeof_bits::value; + CUDA_KERNEL_ASSERT( + delta % align == 0 && + "expected input tensor dynamic dimension byte size to be non-negative multiple of 16\n"); + } + if ((K < 0 && !b_row_major) || // 2D/2D: check along K dimension + (N < 0 && b_row_major)) { // 3D/2D: check along N dimension + int align = 128 / cutlass::sizeof_bits::value; + CUDA_KERNEL_ASSERT( + delta % align == 0 && + "expected input tensor dynamic dimension byte size to be non-negative multiple of 16\n"); + } + + // Check the same requirement for output tensor (that is always + // contiguous, and in row-major layout). 
+ if (N < 0) { + int align = 128 / cutlass::sizeof_bits::value; + CUDA_KERNEL_ASSERT( + delta % align == 0 && + "expected output tensor dynamic dimension byte size to be non-negative multiple of 16\n"); + } + } + } + int64_t lda, ldb, ldoutput; + if (M < 0) { + // A and output is 2d + CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n"); + M = delta; + lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1]; + ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2]; + ldoutput = tensor_StrideOutput[0]; + A_ptrs[tid] = tid == 0 ? A : A + offs[tid - 1] * tensor_StrideA[0]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = tid == 0 ? scale_A : scale_A + offs[tid - 1]; + inputB_scale_ptrs[tid] = scale_B + tid * b_scale_stride; + } + output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput; + B_ptrs[tid] = B + tid * tensor_StrideB[0]; + } else if (N < 0) { + CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n"); + N = delta; + lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2]; + ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed + ldoutput = tensor_StrideOutput[0]; + A_ptrs[tid] = A + tid * tensor_StrideA[0]; + output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1]; + B_ptrs[tid] = tid == 0 ? B : B + offs[tid - 1] * tensor_StrideB[1]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = scale_A + tid * a_scale_stride; + inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1]; + } + } else if (K < 0) { + CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n"); + // A, B is 2d, output is 3d + K = delta; + lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1]; + ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; + ldoutput = tensor_StrideOutput[1]; + A_ptrs[tid] = tid == 0 ? 
A : A + offs[tid - 1] * tensor_StrideA[1]; + B_ptrs[tid] = tid == 0 ? B : B + offs[tid - 1] * tensor_StrideB[0]; + output_ptrs[tid] = output + tid * tensor_StrideOutput[0]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = scale_A + tid * M; + inputB_scale_ptrs[tid] = scale_B + tid * N; + } + } else { + // A, B, output are 3D + lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2]; + ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2]; + ldoutput = tensor_StrideOutput[1]; + A_ptrs[tid] = A + tid * tensor_StrideA[0]; + B_ptrs[tid] = B + tid * tensor_StrideB[0]; + output_ptrs[tid] = output + tid * tensor_StrideOutput[0]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = scale_A + tid * a_scale_stride; + inputB_scale_ptrs[tid] = scale_B + tid * b_scale_stride; + } + } + problem_sizes[tid] = ProblemShape(M, N, K); + + // make_cute_packed_stride only replaces one of the stride elements with + // one the provided values in the shape arguments + // the indices of the src/dst depend on whether A/B are row-major + // so constructing shape argument with two similar lda values + // while it looks non-sensical (and it is a nonsensical shape) + // is fine for these stride construction purposes - the one that will be used + // for replacement is correct, the other one is ignored, and we don't have to + // branch on whether A/B are row-major + stride_A[tid] = cutlass::make_cute_packed_stride(StrideA{}, {lda, lda, 1}); + stride_B[tid] = cutlass::make_cute_packed_stride(StrideB{}, {ldb, ldb, 1}); + stride_output[tid] = + cutlass::make_cute_packed_stride(StrideOutput{}, {M, ldoutput, 1}); +} +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..793922407e457e5971bd14d03cb29fb9d275655c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +// returns 2**floor(log2(n)) +static int lastPow2(unsigned int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c98379b40d11e0a293bf8b13bc547321eaebc901 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh @@ -0,0 +1,407 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace { + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension. +// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024. +// The template arguments have the following meaning: +// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples. +// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small. +// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp. 
+// This is important because it means only __shfl_ instructions are required for reductions. +// Note that this means WARP_SIZE must be a power of two and <= architecture warp size. +// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch. +// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs. +// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed. +// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed. +// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t. +// This allows SoftMax to be fused with a cast immediately following the SoftMax. +// The mask should have the same shape as input, with a boolean indicate if the value is masked. +// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D. +// For instance: +// input_t=half, acc_t=float, output_t=half => read half tensor, float accumulators, write half tensor. +// input_t=half, acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor. +// input_t_float, acc_t=float, output_t=half => read float tensor, float accumulators, write half tensor. + +template +__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 
2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + int idx_offset = first_batch * stride + local_idx; + + src += idx_offset; + dst += idx_offset; + + if (is_transformer_mask) { + mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx; + } else { + mask += idx_offset; + } + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + elements[i][it] = src[i*element_count+it*WARP_SIZE]; + } else { + elements[i][it] = -std::numeric_limits::infinity(); + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + bool is_meaningful_max = false; + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (is_masked) { + int idx = it*WARP_SIZE; + if ((idx + local_idx) < batch_element_count) { + if (!is_transformer_mask) { + idx += i*element_count; + } + if (!mask[idx]) { + max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? 
max_value[i] : elements[i][it]; + is_meaningful_max = true; + } + } + } else { + max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it]; + } + } + if (is_masked) { + if (!is_meaningful_max) { + max_value[i] = -std::numeric_limits::infinity(); + } + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + int idx = it*WARP_SIZE; + bool valid = (idx + local_idx) < batch_element_count; + if (!is_transformer_mask) { + idx += i*element_count; + } + if (valid) { + if (!mask[idx]) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and std::exp(-infinity) is 0. 
+ elements[i][it] = 0; + } + } + } else { + if (!is_log_softmax) { + elements[i][it] = 0.; + } + } + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + if (is_log_softmax) sum[i] = std::log(sum[i]); + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_log_softmax) { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); + } else { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; + } + } else { + break; + } + } + } +} + +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. 
compute the index within the batch + int local_idx = threadIdx.x % WARP_SIZE; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } + + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE]; + output_reg[i][it] = output[i*element_count+it*WARP_SIZE]; + } else { + grad_reg[i][it] = acc_t(0); + output_reg[i][it] = acc_t(0); + } + } + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } + // compute gradients + else if (is_log_softmax) { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - 
std::exp(output_reg[i][it]) * sum[i]); + } else { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); + } + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E: \ + softmax_warp_forward \ + <<>>(dst, \ + src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_FORWARD(10); // 1024 + LAUNCH_SOFTMAX_WARP_FORWARD(11); // 2048 + default: + break; + } + } +} + +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. 
+ int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ + softmax_warp_backward \ + <<>> \ + (grad_input, grad, output, batch_count, softmax_elements_stride, \ + softmax_elements, mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024 + default: + break; + } + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..9584f4710ea0657269da74889322589df83d54c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; + +namespace native { + +// NOTE: these functions require output tensors to be contiguous +void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..fc0fb8ca6043142a5c4c34b83ca1da4208878bc0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// Marks a lambda as executable on both the host and device. The __host__ +// attribute is important so that we can access static type information from +// the host, even if the function is typically only executed on the device. +#ifndef GPU_LAMBDA +#define GPU_LAMBDA __host__ __device__ +#endif + +#if defined(USE_ROCM) +constexpr int num_threads() { + return 256; +} + +constexpr int thread_work_size() { return 4; } +#else +constexpr uint32_t num_threads() { + return C10_WARP_SIZE * 4; +} + +constexpr int thread_work_size() { return 8; } +#endif + +constexpr int block_work_size() { return thread_work_size() * num_threads(); } + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..97ece5bfd2d7cba3fac04076f1abc3719d8024f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h new file mode 100644 index 0000000000000000000000000000000000000000..42118c217630a793e3dc99db81ced73f4cec6f65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_addmm_activation_out_cpu : public at::meta::structured__addmm_activation { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, const at::Tensor & out); +}; +struct TORCH_API structured_addmm_activation_out_cuda : public at::meta::structured__addmm_activation { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..d99cfde46f1f4850e4a4815ac71c18cc06762be9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) +inline at::Tensor & _amp_update_scale_(at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + return at::_ops::_amp_update_scale_::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval); +} + +// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _amp_update_scale_out(at::Tensor & out, const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out); +} +// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _amp_update_scale_outf(const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor & out) { + return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out); +} + +// aten::_amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> (Tensor, Tensor growth_tracker_out) +inline ::std::tuple _amp_update_scale(const at::Tensor & self, const at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + return at::_ops::_amp_update_scale::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h new file mode 100644 index 0000000000000000000000000000000000000000..49cf9563e00deab38c11abfb68d9b81a5c9fa885 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Char_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Char_native.h new file mode 100644 index 0000000000000000000000000000000000000000..12883720942c874ab4d6755ead3051337bfe5163 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Char_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _cast_Char(const at::Tensor & self, bool non_blocking=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dc20a697e6fb4fbb749eaca4c04de45050f5cb5a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _cast_Half(const at::Tensor & self, bool non_blocking=false); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_for_cpu_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_for_cpu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..000e41741551c82a2c2653b2f5e81cc38e117201 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_for_cpu_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _convert_weight_to_int4pack_for_cpu { + using schema = at::Tensor (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_convert_weight_to_int4pack_for_cpu"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_convert_weight_to_int4pack_for_cpu(Tensor self, int innerKTiles) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t innerKTiles); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t innerKTiles); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..80219580c7f35528cb79e40c2eb6492d98ae9ba2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cudnn_ctc_loss { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_ctc_loss"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, 
bool deterministic, bool zero_infinity); +}; + +struct TORCH_API _cudnn_ctc_loss_Tensor { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_ctc_loss"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity); +}; + +struct TORCH_API _cudnn_ctc_loss_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_ctc_loss"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) 
out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cufft_get_plan_cache_max_size.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cufft_get_plan_cache_max_size.h new file mode 100644 index 0000000000000000000000000000000000000000..a52157f65be2298de1d1c76e35533c9d7415c904 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cufft_get_plan_cache_max_size.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int +inline int64_t _cufft_get_plan_cache_max_size(at::DeviceIndex device_index) { + return at::_ops::_cufft_get_plan_cache_max_size::call(device_index); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cummin_helper_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cummin_helper_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3a0e776f5b0e4a5f001bdf36e7e5bdc5d50f12af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cummin_helper_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API void _cummin_helper(const at::Tensor & self, at::Tensor & values, at::Tensor & indices, int64_t dim); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5c438b64ce7aca7a6a231b50468e1cea1b8d8150 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API int64_t _debug_has_internal_overlap(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..7c2541f982101aad537118295f41ac7d34a57a06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? 
per_sample_weights, int padding_idx=-1) -> Tensor +inline at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); +} +namespace symint { + template >> + at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); + } +} + +// aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? 
per_sample_weights, int padding_idx=-1) -> Tensor +inline at::Tensor _embedding_bag_dense_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); +} +namespace symint { + template >> + at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _embedding_bag_dense_backward_symint_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _embedding_bag_dense_backward_symint_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d74307ee632e81fcbe5efb98546dc71bac2d4469 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _embedding_bag_forward_only_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1); +TORCH_API ::std::tuple _embedding_bag_forward_only_outf(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const ::std::optional & per_sample_weights, bool include_last_offset, int64_t padding_idx, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b24ce717374bcf9d99ab82248d6d9f47abe81796 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults 
in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1); +TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_outf(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..80ca9c4276dd8ddb545a314ffa58883ff727e74a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _fft_r2c { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fft_r2c"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided); +}; + +struct TORCH_API _fft_r2c_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, int64_t, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fft_r2c"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..3f302ef4d3f695a88175a4a0c4f3c29fbba5d496 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left.has_value() ? 
::std::make_optional(c10::SymInt(*window_size_left)) : ::std::nullopt, window_size_right.has_value() ? ::std::make_optional(c10::SymInt(*window_size_right)) : ::std::nullopt); +} +namespace symint { + template >> + ::std::tuple _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left.has_value() ? ::std::make_optional(c10::SymInt(*window_size_left)) : ::std::nullopt, window_size_right.has_value() ? ::std::make_optional(c10::SymInt(*window_size_right)) : ::std::nullopt); + } +} + +// aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? 
window_size_right=None) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _flash_attention_backward_symint(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right); +} +namespace symint { + template >> + ::std::tuple _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h new file mode 100644 index 0000000000000000000000000000000000000000..0d2f4593a28ce22251a9bec54095b180ec346467 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_asin(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_asin(at::TensorList self) { + return at::_ops::_foreach_asin::call(self); +} + +// aten::_foreach_asin_(Tensor(a!)[] self) -> () +inline void _foreach_asin_(at::TensorList self) { + return at::_ops::_foreach_asin_::call(self); +} + +// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_asin_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_asin_out::call(self, out); +} +// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_asin_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_asin_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..eebf66a88b9303f4a283f2bdcb53087d63750d01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_floor(at::TensorList self); +TORCH_API void _foreach_floor_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_floor_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_floor_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..eb4c773498213cd132e3a21e637e33a2c793d133 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_frac(at::TensorList self); +TORCH_API void _foreach_frac_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2bf4590e7cce3190f772f739e2c0c4c21d352ee9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_sign(at::TensorList self); +TORCH_API void _foreach_sign_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_sign_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_sign_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1af5d09c4e152720cb81533a7219875cba743ef2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _functional_sym_constrain_range_for_size { + using schema = at::Tensor (const at::Scalar &, ::std::optional, ::std::optional, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_functional_sym_constrain_range_for_size"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_functional_sym_constrain_range_for_size(Scalar size, int? min, int? 
max, Tensor dep_token) -> Tensor"; + static at::Tensor call(const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h new file mode 100644 index 0000000000000000000000000000000000000000..d21f9bf639992cfca76c45bac8a1fec3e70654cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h @@ -0,0 +1,69 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> () +inline void _fused_adagrad_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +// aten::_fused_adagrad_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () +inline void _fused_adagrad_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad__tensor_lr::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +// aten::_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_out(at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} +// aten::_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_outf(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out) { + return at::_ops::_fused_adagrad_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} + +// aten::_fused_adagrad(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out, Tensor[] state_steps_out) +inline ::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +// aten::_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_out(at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_tensor_lr_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} +// aten::_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_outf(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out) { + return at::_ops::_fused_adagrad_tensor_lr_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} + +// aten::_fused_adagrad.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out) +inline ::std::tuple<::std::vector,::std::vector,::std::vector> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_tensor_lr::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3add70e853927d8d9e80cecba4d68a5942489d00 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _fused_rms_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & rstd, const ::std::optional & weight, ::std::array output_mask); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7e82a8d1315e29aa2450793e5de14442d79f44a5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _fused_rms_norm_backward_cuda(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & rstd, const ::std::optional & weight, ::std::array output_mask); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fw_primal_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fw_primal_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5c952ad3b5af8c9d69705226fe50cf75ec93ca94 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fw_primal_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _fw_primal_copy_out(at::Tensor & out, const at::Tensor & self, int64_t level); +TORCH_API at::Tensor & _fw_primal_copy_outf(const at::Tensor & self, int64_t level, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_gather_sparse_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_gather_sparse_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8393df9dee2248d540eb77c90bc2ed9f2a82657f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_gather_sparse_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _gather_sparse_backward(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & grad); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4ddfbdf06414d0cc7133b58ddf1096dbf6f6f3a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & _index_put_impl_(at::Tensor & self, const c10::List<::std::optional> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ad43c636a72fd962b80fd51acabc0d044e9333df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_log_softmax_backward_cpu_out : public at::meta::structured__log_softmax_backward_data { +void impl(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, const at::Tensor & out); +}; +struct TORCH_API structured_log_softmax_backward_cuda_out : public at::meta::structured__log_softmax_backward_data { +void impl(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..13442417e38ef45875e9da399ac74601c018ea66 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _logcumsumexp(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..bcb38bb0e9868c094e8f68f4495c7e55ab8f553e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _make_per_tensor_quantized_tensor_out(const at::Tensor & self, double scale, int64_t zero_point, at::Tensor & out); +TORCH_API at::Tensor make_per_tensor_quantized_tensor_cpu(const at::Tensor & self, double scale, int64_t zero_point); +TORCH_API at::Tensor make_per_tensor_quantized_tensor_cuda(const at::Tensor & self, double scale, int64_t zero_point); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3f9cfb8c53f58e7f261b2a22c1c4c7eff5ad0ad0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _mkldnn_reshape_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef shape); +TORCH_API at::Tensor & _mkldnn_reshape_outf(const at::Tensor & self, at::IntArrayRef shape, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_transpose.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_transpose.h new file mode 100644 index 0000000000000000000000000000000000000000..41a15aa5c7727a84409e6368a48798fa7f151d75 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_transpose.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor +inline at::Tensor _mps_convolution_transpose(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) { + return at::_ops::_mps_convolution_transpose::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups); +} +namespace symint { + template >> + at::Tensor _mps_convolution_transpose(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) { + return at::_ops::_mps_convolution_transpose::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups); + } +} + +// 
aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor +inline at::Tensor _mps_convolution_transpose_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) { + return at::_ops::_mps_convolution_transpose::call(self, weight, padding, output_padding, stride, dilation, groups); +} +namespace symint { + template >> + at::Tensor _mps_convolution_transpose(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) { + return at::_ops::_mps_convolution_transpose::call(self, weight, padding, output_padding, stride, dilation, groups); + } +} + +// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _mps_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out); +} +namespace symint { + template >> + at::Tensor & _mps_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out); + } +} + +// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _mps_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out); +} +namespace symint { + template >> + at::Tensor & _mps_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out); + } +} + +// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _mps_convolution_transpose_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out); +} +namespace symint { + template >> + at::Tensor & _mps_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out); + } +} + +// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _mps_convolution_transpose_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out); +} +namespace symint { + template >> + at::Tensor & _mps_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) { + return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..57ab178108d3a34403ae4fa7b9da11cc03b41e49 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _native_batch_norm_legit_out(at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps); +TORCH_API ::std::tuple _native_batch_norm_legit_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd); +TORCH_API ::std::tuple _native_batch_norm_legit(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps); +TORCH_API ::std::tuple _native_batch_norm_legit(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, bool training, double momentum, double eps); +TORCH_API ::std::tuple _native_batch_norm_legit_out(at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, bool training, double momentum, double eps); +TORCH_API ::std::tuple _native_batch_norm_legit_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ae773bd7d5b671669035611a14696b38a1275505 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _pdist_forward_out(const at::Tensor & self, double p, at::Tensor & out); +TORCH_API at::Tensor _pdist_forward(const at::Tensor & self, double p=2); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_print_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_print_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e85728ec26480e5cd20ea8c052e4ec92cbe768d4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_print_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API void _print(c10::string_view s); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d7e4d85e0c0fa9589b11b0d6e858a32752de9e0d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _scaled_dot_product_flash_attention_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, c10::SymInt, c10::SymInt, double, bool, const at::Tensor &, const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_scaled_dot_product_flash_attention_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)"; + static ::std::tuple call(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, ::std::optional scale); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, ::std::optional scale); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either 
TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8d166a3dd1b0fac5450b2185fec166f0a7a012e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_cpu(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, const ::std::optional & attn_mask={}, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_fused_attention_overrideable_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_fused_attention_overrideable_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe1a4c7aecef2ddb3655a34a879343aeb342956 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_fused_attention_overrideable_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_fused_attention_overrideable(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & attn_bias={}, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_slow_conv2d_forward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_slow_conv2d_forward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9ecb42910d2beacebd87847ecd466dfd00dbe015 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_slow_conv2d_forward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _slow_conv2d_forward_output { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_slow_conv2d_forward"; + static constexpr const char* overload_name = "output"; + static constexpr const char* schema_str = "_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) 
output) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & output); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & output); +}; + +struct TORCH_API _slow_conv2d_forward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_slow_conv2d_forward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..f2745c3ffc6efb4a77af718e2a95cb20e7915895 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__softmax_backward_data : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c8904318ac09d06dfc2675dcfbb4cf0110902f4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}); +TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={}); +TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9d5db02db2381c6ad24205ff14e29c66be4c1c3f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_log_softmax_int { + using schema = at::Tensor (const at::Tensor &, int64_t, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_log_softmax"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "_sparse_log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, ::std::optional dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, ::std::optional dtype); +}; + +struct TORCH_API _sparse_log_softmax_Dimname { + using schema = at::Tensor (const at::Tensor &, at::Dimname, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_log_softmax"; + static constexpr const char* overload_name = "Dimname"; + static constexpr const char* schema_str = "_sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::Dimname dim, ::std::optional dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, ::std::optional dtype); +}; + +struct TORCH_API _sparse_log_softmax { + using schema = at::Tensor (const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_log_softmax"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, bool half_to_float); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float); +}; + +struct TORCH_API _sparse_log_softmax_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_log_softmax"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = 
"_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..be054e403728f881d3ea5a073c6c151e40a4f08c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _sparse_mm_reduce_impl_backward_sparse_csr_cpu(const at::Tensor & self, const at::Tensor & grad_out, const at::Tensor & weight, c10::string_view reduce, const at::Tensor & arg_out, ::std::array output_mask); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8ba9054ff7481c1502e229ec74d57e621450ffe7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _sparse_semi_structured_addmm(const at::Tensor & input, const at::Tensor & mat1, const at::Tensor & mat1_meta, const at::Tensor & mat2, const at::Scalar & alpha=1, const at::Scalar & beta=1, ::std::optional out_dtype=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..dadd446e1558cdd30f8916690c307be7f38e78ef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_softmax_int::call(self, dim, dtype); +} + +// aten::_sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_softmax_Dimname::call(self, dim, dtype); +} + +// aten::_sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor +inline at::Tensor _sparse_softmax(const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_softmax::call(self, dim, half_to_float); +} + +// aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _sparse_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_softmax_out::call(self, dim, half_to_float, out); +} +// aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) { + return at::_ops::_sparse_softmax_out::call(self, dim, half_to_float, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d9270da98fca4dc399ae91cdaa6644841de9b95b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_serialization_subcmul { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_test_serialization_subcmul"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ebc37dbc78804386bbea23d0abd0fe00d28939ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_warn_in_autograd { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_test_warn_in_autograd"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_test_warn_in_autograd(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _test_warn_in_autograd_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_test_warn_in_autograd"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_test_warn_in_autograd.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5d2586820a918d37e3b4d7bf43d3af4e05b4edca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _thnn_fused_gru_cell { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_thnn_fused_gru_cell"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? 
hidden_bias=None) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias); +}; + +struct TORCH_API _thnn_fused_gru_cell_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_thnn_fused_gru_cell"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cfb3cb03ff9a8114fd6e4c087e6915f18cbd93c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _to_sparse_bsr(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cc469abb37d75ebad8b18be6da4e2679bdb4a1af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _to_sparse_csr(const at::Tensor & self, ::std::optional dense_dim=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8af14d73c569b0cba19c8d49f694ab8c068fff3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _triton_scaled_dot_attention { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_triton_scaled_dot_attention"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor"; + static at::Tensor call(const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p); +}; + +struct TORCH_API _triton_scaled_dot_attention_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, double, at::Tensor &); + using 
ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_triton_scaled_dot_attention"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_triton_scaled_dot_attention.out(Tensor q, Tensor k, Tensor v, float dropout_p=0.0, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..36172229b4fc54d1a0e0ad09f64f589570324127 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d26e88f4d0dd26288c607b5d12b3824cb1cc7975 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_bicubic2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..32a94a5a500d88c3d390761e8aeab05074f4aba8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_nearest_exact1d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact1d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact1d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales, at::Tensor & out); +TORCH_API at::Tensor & _upsample_nearest_exact1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h new file mode 100644 index 0000000000000000000000000000000000000000..d52eb044dd747fbf60232e6c36a5a7c044eb772a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor +inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*output_size)) : ::std::nullopt, scale_factors); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*output_size)) : ::std::nullopt, scale_factors); + } +} + +// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? 
scale_factors) -> Tensor +inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor +inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w); + } +} + +// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? 
scales_w=None) -> Tensor +inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..0c3bf8234734e728d7c8885c5c59954cc8c73edc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__upsample_nearest_exact3d : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, at::ArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w); +}; + +} // namespace native +} // namespace 
at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h new file mode 100644 index 0000000000000000000000000000000000000000..143ebaf74c8ba2b8b54159a1ae5586287301f4d4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool +inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank) { + return at::_ops::_use_cudnn_ctc_loss::call(log_probs, targets, input_lengths, target_lengths, blank); +} + +// aten::_use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool +inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank) { + return at::_ops::_use_cudnn_ctc_loss_Tensor::call(log_probs, targets, input_lengths, target_lengths, blank); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c7d6708303629d8bab44c9e45f0f0e6ca3337256 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_bsr_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_bsr_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? 
check_pinning=None) -> ()"; + static void call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cbc189450438d8ded976143386c9d8bd87ffd971 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _weight_norm(const at::Tensor & v, const at::Tensor & g, int64_t dim=0); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9f2401ac35082d0798cc3ec6b34a9babdeecee25 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _weight_norm_interface_backward_out(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim, at::Tensor & out0, at::Tensor & out1); +TORCH_API ::std::tuple weight_norm_backward_cpu(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim); +TORCH_API ::std::tuple weight_norm_backward_cuda(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ddc82b6fb6b64aa734c70bd4aed92de92411f972 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _wrapped_linear_prepack(const at::Tensor & weight, const at::Tensor & weight_scale, const at::Tensor & weight_zero_point, const at::Tensor & bias); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..98bc32b7020176294200047a1b4d50348297ae3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _wrapped_quantized_linear_prepacked { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_wrapped_quantized_linear_prepacked"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & input_scale, const at::Tensor & input_zero_point, const at::Tensor & packed_weight, const at::Tensor & output_scale, const at::Tensor & output_zero_point, int64_t out_channel); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & input_scale, const at::Tensor & input_zero_point, const at::Tensor & packed_weight, const at::Tensor & output_scale, const at::Tensor & output_zero_point, int64_t out_channel); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv.h new file mode 100644 index 0000000000000000000000000000000000000000..332e9284731804c6e80dfca8275a72ada7bd5d1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) { + return at::_ops::addcdiv_out::call(self, tensor1, tensor2, value, out); +} +// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out) { + return at::_ops::addcdiv_out::call(self, tensor1, tensor2, value, out); +} + +// aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor +inline at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) { + return at::_ops::addcdiv::call(self, tensor1, tensor2, value); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/align_tensors.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/align_tensors.h new file mode 100644 index 0000000000000000000000000000000000000000..bdfeaae63201db89c45e052362ff367d93587717 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/align_tensors.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::align_tensors(Tensor[] tensors) -> Tensor[] +inline ::std::vector align_tensors(at::TensorList tensors) { + return at::_ops::align_tensors::call(tensors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d98ecc829e952fb085603756920c6379c29c267a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API arctanh { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::arctanh"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "arctanh(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API arctanh_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::arctanh_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "arctanh_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API arctanh_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::arctanh"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ae5e637068bf39a33a78767ca007532e941aa2e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor argmin(const at::Tensor & self, ::std::optional dim=::std::nullopt, bool keepdim=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..92f62ded996c33504b4a68f8084d7cfb82308ec8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_atanh : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9336475ca68a0798779ac558e3fa01cccf4ce01d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor atleast_3d(const at::Tensor & self); +TORCH_API ::std::vector atleast_3d(at::TensorList tensors); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool1d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool1d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..da53ab8c9c4bbf3c17af663b7dca05550c0508f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool1d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API avg_pool1d { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::avg_pool1d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad); +}; + +struct TORCH_API avg_pool1d_out { + using schema = at::Tensor & (const at::Tensor 
&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::avg_pool1d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "avg_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..e0d17657ba5d3f84d66d8845f8a56e621b2575bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & avg_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override) { + return at::_ops::avg_pool2d_backward_grad_input::call(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input); +} +// aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & grad_input) { + return at::_ops::avg_pool2d_backward_grad_input::call(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input); +} + +// aten::avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor +inline at::Tensor avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override) { + return at::_ops::avg_pool2d_backward::call(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..47d50a3a94b7846bd188c146f28221d1d8b269de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor batch_norm_elemt(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps); +TORCH_API at::Tensor & batch_norm_elemt_out(at::Tensor & out, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps); +TORCH_API at::Tensor & batch_norm_elemt_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or.h new file mode 100644 index 0000000000000000000000000000000000000000..92a00d968d26a39e66cdc1a6282b779484f331db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::bitwise_or_Tensor_out::call(self, other, out); +} +// aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::bitwise_or_Tensor_out::call(self, other, out); +} + +// aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) { + return at::_ops::bitwise_or_Scalar_out::call(self, other, out); +} +// aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) { + return at::_ops::bitwise_or_Scalar_out::call(self, other, out); +} + +// aten::bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor +inline at::Tensor bitwise_or(const at::Tensor & self, const at::Scalar & other) { + return at::_ops::bitwise_or_Scalar::call(self, other); +} + +// aten::bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor +inline at::Tensor bitwise_or(const at::Scalar & self, const at::Tensor & other) { + return at::_ops::bitwise_or_Scalar_Tensor::call(self, other); +} + +// aten::bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor +inline at::Tensor bitwise_or(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::bitwise_or_Tensor::call(self, other); +} + +// aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & bitwise_or_out(at::Tensor & out, const at::Scalar & self, const at::Tensor & other) { + return at::_ops::bitwise_or_Scalar_Tensor_out::call(self, other, out); +} +// aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & bitwise_or_outf(const at::Scalar & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::bitwise_or_Scalar_Tensor_out::call(self, other, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..93d0456a3071dbee9ca22831a46d34ce89bb4c01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor bitwise_or(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & bitwise_or_(at::Tensor & self, const at::Tensor & other); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5f50b11858ae9b250220560f392e9227a373b0c5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor ccol_indices_default(const at::Tensor & self); +TORCH_API at::Tensor ccol_indices_sparse_csr(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..272a44791d5ac1b585d6767eb4bf08b5952e35df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API chunk { + using schema = ::std::vector (const at::Tensor &, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::chunk"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, int64_t chunks, int64_t dim); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t chunks, int64_t dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7fa32c1c64f5659a5fe2396c97288ff9abb2a5cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor concat(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & concat_out(at::Tensor & out, at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & concat_outf(at::TensorList tensors, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor concat(at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & concat_out(at::Tensor & out, at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & concat_outf(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d4d2151ff261579f164d7b0c872f587e0e34c5e0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API conv_transpose2d_input { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::conv_transpose2d"; + static constexpr const char* overload_name = "input"; + static constexpr const char* schema_str = "conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copysign_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copysign_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..bc9ce492c781bce2d82d0d5a81f8f99827d7db9a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copysign_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor copysign(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & copysign_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & copysign_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & copysign_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices.h new file mode 100644 index 0000000000000000000000000000000000000000..807162ef475fe5f91c30c624d51af68fda23dac4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8f672eadac3c2572b68f5592a1f97d1fa1727abc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & crow_indices_copy_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & crow_indices_copy_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b8edd9fa6500d41801fb8bd5b40b5cc800e926ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_compositeimplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor cumprod(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & cumprod_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & cumprod_outf(const at::Tensor & self, at::Dimname dim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor & cumprod_(at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9d287b0ffa62abb12ab9e9e954626a1b88c4e4f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor diagonal(const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1); +TORCH_API at::Tensor diagonal(const at::Tensor & self, at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset=0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/div.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/div.h new file mode 100644 index 0000000000000000000000000000000000000000..1d141b737498ee3d889f18e58c8effe7b93dd574 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/div.h @@ -0,0 +1,87 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::div.Tensor(Tensor self, Tensor other) -> Tensor +inline at::Tensor div(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::div_Tensor::call(self, other); +} + +// aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::div_out::call(self, other, out); +} +// aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & div_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::div_out::call(self, other, out); +} + +// aten::div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor +inline at::Tensor div(const at::Tensor & self, const at::Tensor & other, ::std::optional rounding_mode) { + return at::_ops::div_Tensor_mode::call(self, other, rounding_mode); +} + +// aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, ::std::optional rounding_mode) { + return at::_ops::div_out_mode::call(self, other, rounding_mode, out); +} +// aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & div_outf(const at::Tensor & self, const at::Tensor & other, ::std::optional rounding_mode, at::Tensor & out) { + return at::_ops::div_out_mode::call(self, other, rounding_mode, out); +} + +// aten::div.Scalar(Tensor self, Scalar other) -> Tensor +inline at::Tensor div(const at::Tensor & self, const at::Scalar & other) { + return at::_ops::div_Scalar::call(self, other); +} + +// aten::div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor +inline at::Tensor div(const at::Tensor & self, const at::Scalar & other, ::std::optional rounding_mode) { + return at::_ops::div_Scalar_mode::call(self, other, rounding_mode); +} + +// aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) { + return at::_ops::div_Scalar_out::call(self, other, out); +} +// aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & div_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) { + return at::_ops::div_Scalar_out::call(self, other, out); +} + +// aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other, ::std::optional rounding_mode) { + return at::_ops::div_Scalar_mode_out::call(self, other, rounding_mode, out); +} +// aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & div_outf(const at::Tensor & self, const at::Scalar & other, ::std::optional rounding_mode, at::Tensor & out) { + return at::_ops::div_Scalar_mode_out::call(self, other, rounding_mode, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f9d815f19182a50598b9f9bee08feb6948b5ada6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API embedding { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymInt, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::embedding"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor"; + static at::Tensor call(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse); +}; + +struct TORCH_API embedding_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymInt, bool, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::embedding"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_renorm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_renorm.h new file mode 100644 index 0000000000000000000000000000000000000000..884429fe107b4da3d41b36063ad27a1473d37d20 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_renorm.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) +inline at::Tensor & embedding_renorm_(at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) { + return at::_ops::embedding_renorm_::call(self, indices, max_norm, norm_type); +} + +// aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & embedding_renorm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) { + return at::_ops::embedding_renorm_out::call(self, indices, max_norm, norm_type, out); +} +// aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & embedding_renorm_outf(const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type, at::Tensor & out) { + return at::_ops::embedding_renorm_out::call(self, indices, max_norm, norm_type, out); +} + +// aten::embedding_renorm(Tensor self, Tensor indices, float max_norm, float norm_type) -> Tensor +inline at::Tensor embedding_renorm(const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) { + return at::_ops::embedding_renorm::call(self, indices, max_norm, norm_type); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ab0f2932d6233a6e98fb9bc799c8ab63e32b42aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor empty(at::IntArrayRef size, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty(at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); +TORCH_API at::Tensor empty_symint(c10::SymIntArrayRef size, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_symint(c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a1c1de4b1aaa29a53eeb472bf56ad20e32393fa1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options={}); +TORCH_API at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options={}); +TORCH_API at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e7a620bbb4388e3bd07d3394374fd62805f5ce84 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & empty_strided_out_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); +TORCH_API at::Tensor empty_strided_cpu(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API 
at::Tensor empty_strided_cuda(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor empty_strided_meta_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor empty_strided_unknown_quantized(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e3a573c421272df7801a2e543af7258f83f30d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API bool equal(const at::Tensor & self, const at::Tensor & other); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erf_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erf_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..80fdd2fda4ae2f7f62fe4cf34bf9394604f4d322 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erf_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor erf(const at::Tensor & self); +TORCH_API at::Tensor & erf_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c73c50b315a4081638a84b92913dec1ea184cb9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fft_fft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_fft2_symint_out(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..973228e10f49ee4f467b18b51678653e6743b4b4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fft_irfft2 { + using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_irfft2"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? 
norm=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm); +}; + +struct TORCH_API fft_irfft2_out { + using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_irfft2"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft.h new file mode 100644 index 0000000000000000000000000000000000000000..4981643fd116edd8e23cd307fd8033135d1ceccf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor +inline at::Tensor fft_rfft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm); +} +namespace symint { + template >> + at::Tensor fft_rfft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm); + } +} + +// aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? 
norm=None) -> Tensor +inline at::Tensor fft_rfft_symint(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft::call(self, n, dim, norm); +} +namespace symint { + template >> + at::Tensor fft_rfft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft::call(self, n, dim, norm); + } +} + +// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_rfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_rfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); + } +} + +// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_rfft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_rfft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); + } +} + +// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & fft_rfft_symint_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft_out::call(self, n, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_rfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_rfft_out::call(self, n, dim, norm, out); + } +} + +// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_rfft_symint_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_rfft_out::call(self, n, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_rfft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_rfft_out::call(self, n, dim, norm, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft2_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft2_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d7cdf9f3fe42adc6e91ca4e8bcc2d59af88abc82 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft2_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fft_rfft2(const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor fft_rfft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_rfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_rfft2_outf(const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +TORCH_API at::Tensor & fft_rfft2_symint_out(at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_rfft2_symint_outf(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2c6c37bb8c172d61fbf62b81dfcb569917dab01f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple fractional_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples); +TORCH_API ::std::tuple fractional_max_pool2d_out(at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples); +TORCH_API ::std::tuple fractional_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b4e5766304ee387cbd823d5bfd42b162de8c8250 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fractional_max_pool3d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fractional_max_pool3d_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input); +}; + +struct TORCH_API fractional_max_pool3d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fractional_max_pool3d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..3eff09091d9093cb254b7f68c2ff42fa39caf47f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_gather : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4fe0b341091ae1fa0cd1292458e411b2b9da9436 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor gcd(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gcd_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gcd_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & gcd_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c6b347793670678fd73a7d6a1b6623465d714f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor gelu(const at::Tensor & self, c10::string_view approximate="none"); +TORCH_API at::Tensor & gelu_out(at::Tensor & out, const at::Tensor & self, c10::string_view approximate="none"); +TORCH_API at::Tensor & gelu_outf(const at::Tensor & self, c10::string_view approximate, at::Tensor & out); +TORCH_API at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate="none"); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ger.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ger.h new file mode 100644 index 0000000000000000000000000000000000000000..9f031c7c8b84858b071de8b1fb4eea18dc4ab4fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ger.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::ger(Tensor self, Tensor vec2) -> Tensor +inline at::Tensor ger(const at::Tensor & self, const at::Tensor & vec2) { + return at::_ops::ger::call(self, vec2); +} + +// aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & ger_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & vec2) { + return at::_ops::ger_out::call(self, vec2, out); +} +// aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & ger_outf(const at::Tensor & self, const at::Tensor & vec2, at::Tensor & out) { + return at::_ops::ger_out::call(self, vec2, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c4de7dfcee9f1485c9250491919bd048ec7110 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API greater_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +}; + +struct TORCH_API greater_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "greater.Scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API greater_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Tensor_out"; + static constexpr const char* schema_str = "greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API greater_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "greater.Tensor(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API greater__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API greater__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "greater_.Tensor(Tensor(a!) 
self, Tensor other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a737519c3ff3a27f5511b2882871fd70c0979561 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & grid_sampler_2d_out(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out); +TORCH_API at::Tensor grid_sampler_2d_cpu(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); +TORCH_API at::Tensor grid_sampler_2d_cuda(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..37d273dc8cdbac5af6dc0b8c9c531597028d1e9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardsigmoid_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::hardsigmoid_backward_grad_input::call(grad_output, self, grad_input); +} +// aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardsigmoid_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) { + return at::_ops::hardsigmoid_backward_grad_input::call(grad_output, self, grad_input); +} + +// aten::hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor +inline at::Tensor hardsigmoid_backward(const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::hardsigmoid_backward::call(grad_output, self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..e7b49a3e190fcf4929dde4b1dd3ecdfec803db8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_hardsigmoid_backward : public TensorIteratorBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..892ece6f84ae8ca9f80e528372bb932cbfdb66c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor hardswish(const at::Tensor & self); +TORCH_API at::Tensor & hardswish_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & hardswish_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & hardswish_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..a7709ef73430311dc50a236a864e7c249821f600 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) { + return at::_ops::hardtanh_backward_grad_input::call(grad_output, self, min_val, max_val, grad_input); +} +// aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input) { + return at::_ops::hardtanh_backward_grad_input::call(grad_output, self, min_val, max_val, grad_input); +} + +// aten::hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor +inline at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) { + return at::_ops::hardtanh_backward::call(grad_output, self, min_val, max_val); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..144e59c679abdd10096eb55ce08399d902953a4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor hash_tensor(const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false, int64_t mode=0); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hstack.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hstack.h new file mode 100644 index 0000000000000000000000000000000000000000..3afc8b577723d664a639158fa802b006e5c69a8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hstack.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hstack(Tensor[] tensors) -> Tensor +inline at::Tensor hstack(at::TensorList tensors) { + return at::_ops::hstack::call(tensors); +} + +// aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & hstack_out(at::Tensor & out, at::TensorList tensors) { + return at::_ops::hstack_out::call(tensors, out); +} +// aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & hstack_outf(at::TensorList tensors, at::Tensor & out) { + return at::_ops::hstack_out::call(tensors, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..17d4f3472f9050511a1c1520ffc533c332c43a3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss_backward_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & huber_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta); +TORCH_API at::Tensor & huber_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..e94757084730f1d57cf366f293fcd0d40a91c365 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h @@ -0,0 +1,44 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_index_add : public at::impl::MetaBase { + + template + struct TORCH_API precompute_out { + + precompute_out set_dim(int64_t value) { + static_assert(DIM == false, "dim already set"); + precompute_out ret; +ret.dim = value; +return ret; + } + + int64_t dim; + }; + using meta_return_ty = precompute_out ; + meta_return_ty meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..a5cd9056522b1cd97c04df9979e441d014ba1b92 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor +inline at::Tensor instance_norm(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { + return at::_ops::instance_norm::call(input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0c2790b1388072cc79512fd3a2fc204963f3226d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor inverse(const at::Tensor & self); +TORCH_API at::Tensor & inverse_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & inverse_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0e6ca3c2824ed95472376cc405c81c6a26590954 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API bool is_inference(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_pinned_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_pinned_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2e024c5d39f0f9686ec781bff3dd0ced0b57f1c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_pinned_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API bool is_pinned(const at::Tensor & self, ::std::optional device=::std::nullopt); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isclose_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isclose_native.h new file mode 100644 index 0000000000000000000000000000000000000000..29f56db48e94344501185cee43825347bde69b5c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isclose_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor isclose(const at::Tensor & self, const at::Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c15158f09213113bed4a3d760ec562413210f4b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor isneginf(const at::Tensor & self); +TORCH_API at::Tensor & isneginf_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & isneginf_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..14951b5ccbe251aa7c99244b6a7d89c8fe3d7e10 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor isreal(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fd191a208873053d7b4671337f5b4b9882fd47ae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API leaky_relu_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::leaky_relu_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result, at::Tensor & grad_input); +}; + +struct TORCH_API leaky_relu_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::leaky_relu_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..25429a732d69d19811a4a8cdf0fa104ff4fe5ddf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & lift_fresh_copy_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & lift_fresh_copy_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fe07b2a5eb9fcdec33ce921f20af02cc13fb7e93 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor linalg_cross(const at::Tensor & self, const at::Tensor & other, int64_t dim=-1); +TORCH_API at::Tensor & linalg_cross_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t dim=-1); +TORCH_API at::Tensor & linalg_cross_outf(const at::Tensor & self, const at::Tensor & other, int64_t dim, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..4f3368dff66b454c7fdfa868b723e9fc6add9fbc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_linalg_cross : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & other, int64_t dim); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h new file mode 100644 index 0000000000000000000000000000000000000000..ba5bcb116dd51221203fcc67e73b135aa83768e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) +inline ::std::tuple linalg_eigh(const at::Tensor & self, c10::string_view UPLO="L") { + return at::_ops::linalg_eigh::call(self, UPLO); +} + +// aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) +inline ::std::tuple linalg_eigh_out(at::Tensor & eigvals, at::Tensor & eigvecs, const at::Tensor & self, c10::string_view UPLO="L") { + return at::_ops::linalg_eigh_eigvals::call(self, UPLO, eigvals, eigvecs); +} +// aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) +inline ::std::tuple linalg_eigh_outf(const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs) { + return at::_ops::linalg_eigh_eigvals::call(self, UPLO, eigvals, eigvecs); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h new file mode 100644 index 0000000000000000000000000000000000000000..3a15412312fe0d962a841853dfc4eda8f4ac3bb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) +inline ::std::tuple linalg_ldl_factor(const at::Tensor & self, bool hermitian=false) { + return at::_ops::linalg_ldl_factor::call(self, hermitian); +} + +// aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) +inline ::std::tuple linalg_ldl_factor_out(at::Tensor & LD, at::Tensor & pivots, const at::Tensor & self, bool hermitian=false) { + return at::_ops::linalg_ldl_factor_out::call(self, hermitian, LD, pivots); +} +// aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) +inline ::std::tuple linalg_ldl_factor_outf(const at::Tensor & self, bool hermitian, at::Tensor & LD, at::Tensor & pivots) { + return at::_ops::linalg_ldl_factor_out::call(self, hermitian, LD, pivots); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h new file mode 100644 index 0000000000000000000000000000000000000000..85f62a7c75b1076e23ef9fd3c2c147bd1675fdf5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_multi_dot(Tensor[] tensors) -> Tensor +inline at::Tensor linalg_multi_dot(at::TensorList tensors) { + return at::_ops::linalg_multi_dot::call(tensors); +} + +// aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_multi_dot_out(at::Tensor & out, at::TensorList tensors) { + return at::_ops::linalg_multi_dot_out::call(tensors, out); +} +// aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_multi_dot_outf(at::TensorList tensors, at::Tensor & out) { + return at::_ops::linalg_multi_dot_out::call(tensors, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9464d45a7751087c8c387341fcdd9cd3128445a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, double rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, double rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, double rcond, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, const at::Tensor & rcond, bool hermitian, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_slogdet_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_slogdet_native.h new file mode 100644 index 0000000000000000000000000000000000000000..295f38fcd3e70fe992ea43595bf9082285b88149 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_slogdet_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple linalg_slogdet(const at::Tensor & A); +TORCH_API ::std::tuple linalg_slogdet_out(const at::Tensor & A, at::Tensor & sign, at::Tensor & logabsdet); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..f3b8ce9c20e90bd2fd7cd33ad1596422f0ab5669 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_log1p : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h new file mode 100644 index 0000000000000000000000000000000000000000..8bbe6f9a3fcf089cc13fb1e35bb9adabc0d2df27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & log_sigmoid_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::log_sigmoid_out::call(self, out); +} +// aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_sigmoid_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::log_sigmoid_out::call(self, out); +} + +// aten::log_sigmoid(Tensor self) -> Tensor +inline at::Tensor log_sigmoid(const at::Tensor & self) { + return at::_ops::log_sigmoid::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fe3ccb46eb4082be9da8936764e63812541fd2de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor log_sigmoid_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer); +TORCH_API at::Tensor & log_sigmoid_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer); +TORCH_API at::Tensor & log_sigmoid_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2.h new file mode 100644 index 0000000000000000000000000000000000000000..34b6706df7fc2f21b0e063143a195248d9b7f43f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logaddexp2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::logaddexp2_out::call(self, other, out); +} +// aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & logaddexp2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::logaddexp2_out::call(self, other, out); +} + +// aten::logaddexp2(Tensor self, Tensor other) -> Tensor +inline at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::logaddexp2::call(self, other); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b12c09dd34eca0e5badcd52e906a82b01b45a9f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..34fc2a808efa0afdaf1571f6c09b775d1715c623 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor logcumsumexp(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & logcumsumexp_out(const at::Tensor & self, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor logcumsumexp(const at::Tensor & self, at::Dimname dim); +TORCH_API at::Tensor & logcumsumexp_out(const at::Tensor & self, at::Dimname dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..29e594c7dfdb46fb0fcd474187b291ff97588823 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor logical_and(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logical_and_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_xor_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_xor_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9cea312ae4ba6b55d3a33bd4225e306f1a75d8b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_xor_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & logical_xor_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logical_xor_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3730bcb3b0673d0a65036a90891af1b99c2a2a1c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor logit(const at::Tensor & self, ::std::optional eps=::std::nullopt); +TORCH_API at::Tensor & logit_out(at::Tensor & out, const at::Tensor & self, ::std::optional eps=::std::nullopt); +TORCH_API at::Tensor & logit_outf(const at::Tensor & self, ::std::optional eps, at::Tensor & out); +TORCH_API at::Tensor & logit_(at::Tensor & self, ::std::optional eps=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h new file mode 100644 index 0000000000000000000000000000000000000000..3f5e6583ecc8b91fa3300c50dd00112e002736f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace_Tensor_Tensor::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace_Tensor_Tensor::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace_Tensor_Scalar::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace_Tensor_Scalar::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace_Scalar_Tensor::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace_Scalar_Tensor::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_out::call(start, end, steps, base, out); +} +// aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & logspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_out::call(start, end, steps, base, out); +} + +// aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_Tensor_Tensor_out::call(start, end, steps, base, out); +} +// aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_Tensor_Tensor_out::call(start, end, steps, base, out); +} + +// aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_Tensor_Scalar_out::call(start, end, steps, base, out); +} +// aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_Tensor_Scalar_out::call(start, end, steps, base, out); +} + +// aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_Scalar_Tensor_out::call(start, end, steps, base, out); +} +// aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_Scalar_Tensor_out::call(start, end, steps, base, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ffcc5a3501559667389b789183a7e38319000c1b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor masked_scatter(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source); +TORCH_API at::Tensor & masked_scatter_out(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out); +TORCH_API at::Tensor & masked_scatter__cpu(at::Tensor & self, const at::Tensor & mask, const at::Tensor & source); +TORCH_API at::Tensor & masked_scatter__cuda(at::Tensor & self, const at::Tensor & mask, const at::Tensor 
& source); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7c350897170b58f9cbcf9b33af34b9ff1abc02cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API matmul { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::matmul"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "matmul(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API matmul_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::matmul"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5a1b521df218186e05aac90ad748f3a676ee6c3f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API max_pool2d_with_indices_out { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::max_pool2d_with_indices"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices); +}; + +struct TORCH_API max_pool2d_with_indices { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::max_pool2d_with_indices"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c23a22e039712ca29fc5167bfc8ad56f87cef493 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor miopen_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional & alpha, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..85f0e0f71c0eabf5c60e970ed075cfa158c550e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor miopen_convolution(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic); +TORCH_API at::Tensor miopen_convolution_symint(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bee78a94be892d967760dfcb280055fc93866e23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API miopen_convolution { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::miopen_convolution"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "miopen_convolution(Tensor self, Tensor weight, Tensor? 
bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); +}; + +struct TORCH_API miopen_convolution_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::miopen_convolution"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0ed35c78d273e01fc25cc5619ca2bf6ce9a19df5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor math_mish_backward(const at::Tensor & grad_output, const at::Tensor & self); +TORCH_API at::Tensor mish_backward(const at::Tensor & grad_output, const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h new file mode 100644 index 0000000000000000000000000000000000000000..ca183b58f8bee439911855a302823bbfea12c013 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) +inline ::std::tuple mkldnn_linear_backward_weights(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) { + return at::_ops::mkldnn_linear_backward_weights::call(grad_output, input, weight, bias_defined); +} + +// aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple mkldnn_linear_backward_weights_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) { + return at::_ops::mkldnn_linear_backward_weights_out::call(grad_output, input, weight, bias_defined, out0, out1); +} +// aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) 
out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple mkldnn_linear_backward_weights_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::mkldnn_linear_backward_weights_out::call(grad_output, input, weight, bias_defined, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0f2860569a561e95875cdd8a62ac606b60514c8a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple mps_convolution_transpose_backward_out_symint(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ac11224ac580d337b8e356e2fdf8ca22390cb6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_mse_loss : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & target, int64_t reduction); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c21f17eaee5c12646779c857df5137098ec8b6dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_mse_loss_out : public at::meta::structured_mse_loss { +void impl(const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multi_margin_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multi_margin_loss_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..569dbe7a9092b4533a08e55777c29fb4855bb8e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multi_margin_loss_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API multi_margin_loss_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, const ::std::optional &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::multi_margin_loss"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional & weight, int64_t reduction, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional & weight, int64_t reduction, at::Tensor & out); +}; + +struct TORCH_API multi_margin_loss { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, const ::std::optional &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::multi_margin_loss"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? 
weight=None, int reduction=Mean) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional & weight, int64_t reduction); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional & weight, int64_t reduction); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a903e11f66b5159099f50633fb789810ef92d31c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor multilabel_margin_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target); +TORCH_API at::Tensor & multilabel_margin_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target); +TORCH_API at::Tensor & multilabel_margin_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multiply_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multiply_native.h new file mode 100644 index 0000000000000000000000000000000000000000..82775810c09aa3e8051265f0b839141b399fffb7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multiply_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor multiply(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & multiply_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & multiply_(at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor multiply(const at::Tensor & self, const at::Scalar & 
other); +TORCH_API at::Tensor & multiply_(at::Tensor & self, const at::Scalar & other); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e4873dd71df73b8cb90e4bbebbb44670fb1337ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor nansum(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nansum_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nansum_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fbb064726cc38b64e0547ef51525f555cc9ca732 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple native_batch_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_invstd, bool train, double eps, ::std::array output_mask); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..803c442144d231099ef3abb4af3a5850a06311a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API native_layer_norm_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, ::std::array); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::native_layer_norm_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional & weight, const ::std::optional & bias, ::std::array output_mask); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional & weight, const ::std::optional & bias, ::std::array output_mask); +}; + +struct TORCH_API native_layer_norm_backward_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, ::std::array, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::native_layer_norm_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? 
weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))"; + static ::std::tuple call(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional & weight, const ::std::optional & bias, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional & weight, const ::std::optional & bias, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_norm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..22746c258d7705a2e5c23aa935c0ae5448554e7b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_norm.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::native_norm(Tensor self, Scalar p=2) -> Tensor +inline at::Tensor native_norm(const at::Tensor & self, const at::Scalar & p=2) { + return at::_ops::native_norm::call(self, p); +} + +// 
aten::native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor +inline at::Tensor native_norm(const at::Tensor & self, const ::std::optional & p, at::IntArrayRef dim, bool keepdim, ::std::optional dtype) { + return at::_ops::native_norm_ScalarOpt_dim_dtype::call(self, p, dim, keepdim, dtype); +} + +// aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & p=2) { + return at::_ops::native_norm_out::call(self, p, out); +} +// aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_outf(const at::Tensor & self, const at::Scalar & p, at::Tensor & out) { + return at::_ops::native_norm_out::call(self, p, out); +} + +// aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p, at::IntArrayRef dim, bool keepdim, ::std::optional dtype) { + return at::_ops::native_norm_ScalarOpt_dim_dtype_out::call(self, p, dim, keepdim, dtype, out); +} +// aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_outf(const at::Tensor & self, const ::std::optional & p, at::IntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::native_norm_ScalarOpt_dim_dtype_out::call(self, p, dim, keepdim, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9a563f2c91b7e7623cb8349f08a1370bd3e16061 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor new_full(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & new_full_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_zeros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_zeros.h new file mode 100644 index 0000000000000000000000000000000000000000..c2734b3f39712e2641a9cf44a702817722bcb034 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_zeros.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +namespace symint { + template >> + at::Tensor new_zeros(const at::Tensor & self, at::IntArrayRef size, at::TensorOptions options={}) { + return at::_ops::new_zeros::call(self, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); + } +} + +namespace symint { + template >> + at::Tensor new_zeros(const at::Tensor & self, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::new_zeros::call(self, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory); + } +} + +namespace symint { + template >> + at::Tensor new_zeros(const at::Tensor & self, c10::SymIntArrayRef size, at::TensorOptions options={}) { + return at::_ops::new_zeros::call(self, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); + } +} + +namespace symint { + template >> + at::Tensor new_zeros(const at::Tensor & self, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, 
::std::optional pin_memory) { + return at::_ops::new_zeros::call(self, size, dtype, layout, device, pin_memory); + } +} + +// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & new_zeros_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out); +} +namespace symint { + template >> + at::Tensor & new_zeros_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out); + } +} + +// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & new_zeros_outf(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) { + return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out); +} +namespace symint { + template >> + at::Tensor & new_zeros_outf(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) { + return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out); + } +} + +// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & new_zeros_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) { + return at::_ops::new_zeros_out::call(self, size, out); +} +namespace symint { + template >> + at::Tensor & new_zeros_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) { + return at::_ops::new_zeros_out::call(self, size, out); + } +} + +// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & new_zeros_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) { + return at::_ops::new_zeros_out::call(self, size, out); +} +namespace symint { + template >> + at::Tensor & new_zeros_outf(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) { + return at::_ops::new_zeros_out::call(self, size, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pad_sequence_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pad_sequence_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2daf30c26a82433de871943c16cfbf3ba6c4a02c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pad_sequence_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor pad_sequence(at::TensorList sequences, bool batch_first=false, double padding_value=0.0, c10::string_view padding_side="right"); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..b10009bb2594e43389be17ae319a2e144ba1e4b7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor +inline at::Tensor pixel_shuffle(const at::Tensor & self, int64_t upscale_factor) { + return at::_ops::pixel_shuffle::call(self, upscale_factor); +} + +// aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & pixel_shuffle_out(at::Tensor & out, const at::Tensor & self, int64_t upscale_factor) { + return at::_ops::pixel_shuffle_out::call(self, upscale_factor, out); +} +// aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & pixel_shuffle_outf(const at::Tensor & self, int64_t upscale_factor, at::Tensor & out) { + return at::_ops::pixel_shuffle_out::call(self, upscale_factor, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..afd8ffc3aa48640cfbd697fcfe4c54a6991c10b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor pixel_shuffle(const at::Tensor & self, int64_t upscale_factor); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5519e52dc4a77d3dbb1381fbf0e504b902c50c0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor poisson(const at::Tensor & self, ::std::optional generator=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f5094d41c3fd272b7244d514365d1344725e2df0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & poisson_out(const at::Tensor & self, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor _s_poisson_cpu(const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor _s_poisson_cuda(const at::Tensor & self, ::std::optional generator=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..0d78ac59e59dc61b49ace81989edb0059fed0f57 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor +inline at::Tensor quantized_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::quantized_max_pool2d::call(self, kernel_size, stride, padding, dilation, ceil_mode); +} + +// aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & quantized_max_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::quantized_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} +// aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & quantized_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) { + return at::_ops::quantized_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..09a116a24bc596fdc4833164b11e441ccf49f91b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API randn { + using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API randn_generator { + using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "generator"; + static constexpr const char* schema_str = "randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, ::std::optional generator, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional generator, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API randn_names { + using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "names"; + static constexpr const char* schema_str = "randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API randn_generator_with_names { + using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "generator_with_names"; + static constexpr const char* schema_str = "randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, ::std::optional generator, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional generator, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API randn_out { + using schema = at::Tensor & (c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "randn.out(SymInt[] size, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out); +}; + +struct TORCH_API randn_generator_out { + using schema = at::Tensor & (c10::SymIntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "generator_out"; + static constexpr const char* schema_str = "randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, ::std::optional generator, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional generator, at::Tensor & out); +}; + +struct TORCH_API randn_names_out { + using schema = at::Tensor & (c10::SymIntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "names_out"; + static constexpr const char* schema_str = "randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, ::std::optional names, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional names, at::Tensor & out); +}; + +struct TORCH_API randn_generator_with_names_out { + using schema = at::Tensor & (c10::SymIntArrayRef, ::std::optional, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::randn"; + static constexpr const char* overload_name = "generator_with_names_out"; + static constexpr const char* schema_str = "randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, ::std::optional generator, ::std::optional names, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional generator, ::std::optional names, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..97ac74755dff5e11d9f7ce51133b63c0463d3256 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor reflection_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor reflection_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & reflection_pad1d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor & reflection_pad1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad1d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8d17e59fbe477843c687e0c5d17708e5ff819643 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor reflection_pad3d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor & reflection_pad3d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad3d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7be4f7bc996fe76403a9e9aa71cb120604516df0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu_native.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & relu_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor relu(const at::Tensor & self); +TORCH_API at::Tensor & relu_(at::Tensor & self); +TORCH_API at::Tensor NestedTensor_relu(const at::Tensor & self); +TORCH_API at::Tensor & NestedTensor_relu_(at::Tensor & self); +TORCH_API at::Tensor relu_sparse(const at::Tensor & self); +TORCH_API at::Tensor & relu_sparse_(at::Tensor & self); +TORCH_API at::Tensor relu_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & relu_sparse_csr_(at::Tensor & self); +TORCH_API at::Tensor mkldnn_relu(const at::Tensor & self); +TORCH_API at::Tensor & mkldnn_relu_(at::Tensor & self); +TORCH_API at::Tensor relu_quantized_cpu(const at::Tensor & self); +TORCH_API at::Tensor & relu_quantized_cpu_(at::Tensor & self); +TORCH_API at::Tensor relu_quantized_cuda(const at::Tensor & self); +TORCH_API at::Tensor & relu_quantized_cuda_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1fe323549a841c89f619351ee507d998f68b5752 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API repeat { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::repeat"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "repeat(Tensor self, SymInt[] repeats) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef repeats); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef repeats); +}; + +struct TORCH_API repeat_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::repeat"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "repeat.out(Tensor self, SymInt[] repeats, *, 
Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef repeats, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef repeats, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h new file mode 100644 index 0000000000000000000000000000000000000000..1716001e1491d005a32bd07f1841e47d1869323b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::resolve_conj(Tensor(a) self) -> Tensor(a) +inline at::Tensor resolve_conj(const at::Tensor & self) { + return at::_ops::resolve_conj::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ef36ac996710062c0e5b9be66a4d05008562e75b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor resolve_conj(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..568f16aa3c8682ee9a04f704f3bf3fad827e427d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple rnn_relu(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); +TORCH_API ::std::tuple rnn_relu(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h new file mode 100644 index 0000000000000000000000000000000000000000..119c29f2182ab840e7d8b89cf835ce4e9189da52 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) +inline ::std::tuple rnn_tanh(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::rnn_tanh_input::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) +inline ::std::tuple rnn_tanh(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) { + return at::_ops::rnn_tanh_data::call(data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h new file mode 100644 index 0000000000000000000000000000000000000000..15e7ebc65529d456a9a29245e94a010433129e6e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor rnn_tanh_cell(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const ::std::optional & b_ih={}, const ::std::optional & b_hh={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e07f05ce2d8a210161e28d79ccb1f5b577b42a40 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor row_indices(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scaled_dot_product_attention.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scaled_dot_product_attention.h new file mode 100644 index 0000000000000000000000000000000000000000..5dabbc6a65a867bd952f67eaaa766425dbec614b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scaled_dot_product_attention.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> Tensor +inline at::Tensor scaled_dot_product_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & attn_mask={}, double dropout_p=0.0, bool is_causal=false, ::std::optional scale=::std::nullopt, bool enable_gqa=false) { + return at::_ops::scaled_dot_product_attention::call(query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7ece27168ab5549fd1bf7fbf882113e2a282a2ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API scatter_reduce_two { + using schema = at::Tensor (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_reduce"; + static constexpr const char* overload_name = "two"; + static constexpr const char* schema_str = "scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); +}; + +struct TORCH_API scatter_reduce__two { + using schema = at::Tensor & (at::Tensor &, int64_t, 
const at::Tensor &, const at::Tensor &, c10::string_view, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_reduce_"; + static constexpr const char* overload_name = "two"; + static constexpr const char* schema_str = "scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); +}; + +struct TORCH_API scatter_reduce_two_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_reduce"; + static constexpr const char* overload_name = "two_out"; + static constexpr const char* schema_str = "scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/searchsorted_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/searchsorted_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f5d7f31da8cd46fbd4b4f55a459321e4f338d650 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/searchsorted_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor searchsorted(const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, ::std::optional side=::std::nullopt, const ::std::optional & sorter={}); +TORCH_API at::Tensor & searchsorted_out(at::Tensor & out, const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, ::std::optional side=::std::nullopt, const ::std::optional & sorter={}); +TORCH_API at::Tensor & searchsorted_outf(const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32, bool right, ::std::optional side, const ::std::optional & sorter, at::Tensor & out); +TORCH_API at::Tensor searchsorted(const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, ::std::optional side=::std::nullopt, const ::std::optional & sorter={}); +TORCH_API at::Tensor & searchsorted_out(at::Tensor & out, const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, ::std::optional side=::std::nullopt, const ::std::optional & sorter={}); +TORCH_API at::Tensor & searchsorted_outf(const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32, bool right, ::std::optional side, const ::std::optional & sorter, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..74f4e4aa25f333e542c60f160eb6a765dad787e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API sigmoid_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sigmoid_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input); +}; + +struct TORCH_API sigmoid_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sigmoid_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & output); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h new file mode 100644 index 0000000000000000000000000000000000000000..e40f5ef108e0b9cef30c1afe8703e248aedda5f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::silu(Tensor self) -> Tensor +inline at::Tensor silu(const at::Tensor & self) { + return at::_ops::silu::call(self); +} + +// aten::silu_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & silu_(at::Tensor & self) { + return at::_ops::silu_::call(self); +} + +// aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & silu_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::silu_out::call(self, out); +} +// aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & silu_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::silu_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b2251a63ac29ccf06494eb8df8622736a754a08b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor sin(const at::Tensor & self); +TORCH_API at::Tensor & sin_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & sin_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & sin_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..62d1c634bf47293125583b913fd9edf96bbfa316 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API sin { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sin"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "sin(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API sin_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sin_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "sin_(Tensor(a!) 
self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API sin_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sin"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h new file mode 100644 index 0000000000000000000000000000000000000000..29a86e4eef1f7309606cb092256b05211845e84f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? 
end=None, SymInt step=1) -> Tensor(a) +inline at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step); +} +namespace symint { + template >> + at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step); + } +} + +// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) +inline at::Tensor slice_inverse_symint(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, c10::SymInt step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start, end, step); +} +namespace symint { + template >> + at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, c10::SymInt step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start, end, step); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6494b3d433d7ef3aa3a5435170bdcfc9685034bc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor slow_conv3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0); +TORCH_API at::Tensor & slow_conv3d_out(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..6888047ba23da2fa9e80978015744e34ea5d6675 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & softplus_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold) { + return at::_ops::softplus_backward_grad_input::call(grad_output, self, beta, threshold, grad_input); +} +// aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & softplus_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input) { + return at::_ops::softplus_backward_grad_input::call(grad_output, self, beta, threshold, grad_input); +} + +// aten::softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor +inline at::Tensor softplus_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold) { + return at::_ops::softplus_backward::call(grad_output, self, beta, threshold); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_and_clear_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_and_clear_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2aae61de1bd0a0ef6ca7688d1b9dc241509ac471 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_and_clear_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor sparse_resize_and_clear(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); +TORCH_API const at::Tensor & sparse_resize_and_clear_out(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out); +TORCH_API const at::Tensor & sparse_resize_and_clear_(const at::Tensor & self, 
at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..986d60cb7e9facb2ab3859fa3a5ce800f6493194 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_bessel_y0(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y0_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y0_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1bc66a6ebc2ab6f913fd8c9032857ba1238418cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_bessel_y1(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d1c555c260e2d9158ec2b61f87d19c2be9c37042 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_bessel_y1 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_y1"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_bessel_y1(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_bessel_y1_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_y1"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_bessel_y1.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..65e27a1cce3e2e180ab32ddd8d94b10c1a292555 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..12161eef9dc647334bdfc539d73bc6b2defb3e00 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_digamma(const at::Tensor & self); +TORCH_API at::Tensor & special_digamma_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_digamma_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6176e342151b38208607c276c4b9e138718d17bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor special_exp2(const at::Tensor & self); +TORCH_API at::Tensor & special_exp2_out(const at::Tensor & self, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..96f8c2be0776afd38c7b00d8f65108aa2378a869 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_modified_bessel_k0(const at::Tensor & self); +TORCH_API at::Tensor & special_modified_bessel_k0_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_modified_bessel_k0_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ec3c35b9e42a558ce480834879c955e672214292 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_ndtri(const at::Tensor & self); +TORCH_API at::Tensor & special_ndtri_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_ndtri_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h new file mode 100644 index 0000000000000000000000000000000000000000..4f15a5d8b45211e41a5e4554c57ce9bf27e3fc8e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_sinc(Tensor self) -> Tensor +inline at::Tensor special_sinc(const at::Tensor & self) { + return at::_ops::special_sinc::call(self); +} + +// aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_sinc_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_sinc_out::call(self, out); +} +// aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_sinc_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_sinc_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_native.h new file mode 100644 index 0000000000000000000000000000000000000000..65c5b4205074ea6c5b75896d7df8e840b1b7ae3c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_native.h @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor squeeze(const at::Tensor & self); +TORCH_API at::Tensor squeeze_nested(const at::Tensor & self); +TORCH_API at::Tensor squeeze_quantized(const at::Tensor & self); +TORCH_API at::Tensor & squeeze_(at::Tensor & self); +TORCH_API at::Tensor squeeze(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor squeeze_dim_nested(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor squeeze_quantized(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & squeeze_(at::Tensor & self, int64_t dim); +TORCH_API at::Tensor squeeze(const at::Tensor & self, at::Dimname dim); +TORCH_API at::Tensor & squeeze_(at::Tensor & self, at::Dimname dim); +TORCH_API at::Tensor squeeze(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor squeeze_dim_nested(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor squeeze_quantized(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor & squeeze_(at::Tensor & self, at::IntArrayRef dim); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f1ac1fa61468f0b667435b2e48390ac1bb87ee0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API squeeze { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "squeeze(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API squeeze_dim { + using schema = at::Tensor (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze"; + static constexpr const char* overload_name = "dim"; + static constexpr const char* schema_str = "squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, int64_t dim); + static at::Tensor 
redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim); +}; + +struct TORCH_API squeeze_dimname { + using schema = at::Tensor (const at::Tensor &, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze"; + static constexpr const char* overload_name = "dimname"; + static constexpr const char* schema_str = "squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, at::Dimname dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim); +}; + +struct TORCH_API squeeze_dims { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze"; + static constexpr const char* overload_name = "dims"; + static constexpr const char* schema_str = "squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim); +}; + +struct TORCH_API squeeze_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "squeeze_(Tensor(a!) 
self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API squeeze__dim { + using schema = at::Tensor & (at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze_"; + static constexpr const char* overload_name = "dim"; + static constexpr const char* schema_str = "squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t dim); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim); +}; + +struct TORCH_API squeeze__dims { + using schema = at::Tensor & (at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze_"; + static constexpr const char* overload_name = "dims"; + static constexpr const char* schema_str = "squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, at::IntArrayRef dim); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::IntArrayRef dim); +}; + +struct TORCH_API squeeze__dimname { + using schema = at::Tensor & (at::Tensor &, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::squeeze_"; + static constexpr const char* overload_name = "dimname"; + static constexpr const char* schema_str = "squeeze_.dimname(Tensor(a!) 
self, Dimname dim) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, at::Dimname dim); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9c85851c80c6cb5e9e92c30cbaf7b19474a45ab3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API void sym_constrain_range_for_size(const at::Scalar & size, ::std::optional min=::std::nullopt, ::std::optional max=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1f5630e6a8687594ccd381d35d334730ed61c0a1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor tanh(const at::Tensor & self); +TORCH_API at::Tensor & tanh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & tanh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & tanh_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..526f7e21b55768396de83aa0bde4a3e550867f16 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API threshold { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); +}; + +struct TORCH_API threshold_ { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold_"; + static constexpr const char* 
overload_name = ""; + static constexpr const char* schema_str = "threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); +}; + +struct TORCH_API threshold_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e18c2984c2b94046761531a117e47ffcfe880902 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple topk(const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_symint(const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_outf(const at::Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple topk_symint_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_symint_outf(const at::Tensor & self, c10::SymInt k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a9f458df316e1475a7f375f6c02e232685d3dadf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API transpose_int { + using schema = at::Tensor (const at::Tensor &, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::transpose"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, int64_t dim0, int64_t dim1); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1); +}; + +struct TORCH_API transpose_Dimname { + using schema = at::Tensor (const at::Tensor &, at::Dimname, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::transpose"; + static constexpr const char* overload_name = "Dimname"; + static constexpr const char* schema_str = "transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, at::Dimname dim0, at::Dimname dim1); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim0, at::Dimname dim1); +}; + +struct TORCH_API transpose_ { + using schema = at::Tensor & (at::Tensor &, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::transpose_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "transpose_(Tensor(a!) 
self, int dim0, int dim1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t dim0, int64_t dim1); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim0, int64_t dim1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3fbf6401b983fbd12ee15fd82cf01b45f00b224c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API tril_ { + using schema = at::Tensor & (at::Tensor &, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::tril_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "tril_(Tensor(a!) 
self, SymInt diagonal=0) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, c10::SymInt diagonal); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::SymInt diagonal); +}; + +struct TORCH_API tril_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::tril"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +}; + +struct TORCH_API tril { + using schema = at::Tensor (const at::Tensor &, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::tril"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "tril(Tensor self, SymInt diagonal=0) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymInt diagonal); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt diagonal); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f53872f4a5bfab95e042e0043b8c9187945bc303 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor triplet_margin_loss(const at::Tensor & anchor, const at::Tensor & positive, const at::Tensor & negative, double margin=1.0, double p=2, double eps=1e-06, bool swap=false, int64_t reduction=at::Reduction::Mean); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4c2a6f786014abebc1b4ec403dd0720f80801f5f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API true_divide_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "true_divide.Tensor(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API true_divide__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = 
"true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API true_divide_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API true_divide_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "true_divide.Scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API true_divide__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "true_divide_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h new file mode 100644 index 0000000000000000000000000000000000000000..ef1fa5feb1eaaaf1d1f908c2d6a852997180d29c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] +inline ::std::vector unflatten_dense_tensors(const at::Tensor & flat, at::TensorList tensors) { + return at::_ops::unflatten_dense_tensors::call(flat, tensors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d0379fcfa0d812120afa20bbd52453af91ac58e4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor unfold_backward(const at::Tensor & grad_in, at::IntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step); +TORCH_API at::Tensor unfold_backward_symint(const at::Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1a3d3b6008169b231cf3a0f9f842db6c542be3b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bc67d28ed4dab093dd813b25d332783bd35bd793 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API unfold { + using schema = at::Tensor (const at::Tensor &, int64_t, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unfold"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d33ffa20727f7847813886dd89e13497b796ecb1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API uniform_ { + using schema = at::Tensor & (at::Tensor &, double, double, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::uniform_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? 
generator=None) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, double from, double to, ::std::optional generator); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double from, double to, ::std::optional generator); +}; + +struct TORCH_API uniform_out { + using schema = at::Tensor & (const at::Tensor &, double, double, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::uniform"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, double from, double to, ::std::optional generator, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, ::std::optional generator, at::Tensor & out); +}; + +struct TORCH_API uniform { + using schema = at::Tensor (const at::Tensor &, double, double, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::uniform"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "uniform(Tensor self, float from=0, float to=1, *, Generator? generator=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, double from, double to, ::std::optional generator); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, ::std::optional generator); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ae3167dbb12abf13532dd80f7b89f2ef9f73a61f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple unique_dim_out(const at::Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +TORCH_API ::std::tuple unique_dim_cpu(const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false); +TORCH_API ::std::tuple unique_dim_cuda(const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..785055511a03466b9d59da907b1fffbf6f1d4a2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API unsafe_split_Tensor { + using schema = ::std::vector (const at::Tensor &, c10::SymInt, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unsafe_split"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]"; + static ::std::vector call(const at::Tensor & self, c10::SymInt split_size, int64_t dim); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim); +}; + +struct TORCH_API unsafe_split_Tensor_out { + using schema = void (const at::Tensor &, c10::SymInt, int64_t, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unsafe_split"; + static constexpr const char* 
overload_name = "Tensor_out"; + static constexpr const char* schema_str = "unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()"; + static void call(const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_with_sizes.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_with_sizes.h new file mode 100644 index 0000000000000000000000000000000000000000..aba5e57625ec88ec2145a1677d40d509fed4a65c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_with_sizes.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[] +inline ::std::vector unsafe_split_with_sizes(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes::call(self, c10::fromIntArrayRefSlow(split_sizes), dim); +} +namespace symint { + template >> + ::std::vector unsafe_split_with_sizes(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes::call(self, c10::fromIntArrayRefSlow(split_sizes), dim); + } +} + +// 
aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[] +inline ::std::vector unsafe_split_with_sizes_symint(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes::call(self, split_sizes, dim); +} +namespace symint { + template >> + ::std::vector unsafe_split_with_sizes(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes::call(self, split_sizes, dim); + } +} + +// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () +inline void unsafe_split_with_sizes_out(at::TensorList out, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out); +} +namespace symint { + template >> + void unsafe_split_with_sizes_out(at::TensorList out, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out); + } +} + +// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () +inline void unsafe_split_with_sizes_outf(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) { + return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out); +} +namespace symint { + template >> + void unsafe_split_with_sizes_outf(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) { + return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out); + } +} + +// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () +inline void unsafe_split_with_sizes_symint_out(at::TensorList out, const at::Tensor & self, 
c10::SymIntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out); +} +namespace symint { + template >> + void unsafe_split_with_sizes_out(at::TensorList out, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) { + return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out); + } +} + +// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () +inline void unsafe_split_with_sizes_symint_outf(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out) { + return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out); +} +namespace symint { + template >> + void unsafe_split_with_sizes_outf(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out) { + return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8d2905d80cd04a9e19a7f5f0ff07dba8e4bd87c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_meta_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor upsample_bilinear2d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_bilinear2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & upsample_bilinear2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1079ca478abeab92be748e6e7f96271b9f6caf41 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor upsample_nearest2d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_nearest2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest2d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & upsample_nearest2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..f509fe9a7b4b775ac8ae85a5694b33416b333a97 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_upsample_nearest3d_backward : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, at::ArrayRef output_size, at::ArrayRef input_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero.h new file mode 100644 index 0000000000000000000000000000000000000000..9cb89bf96a592c91aedfd4fad33f25b2ddbef542 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::zero_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & zero_(at::Tensor & self) { + return at::_ops::zero_::call(self); +} + +// aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & zero_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::zero_out::call(self, out); +} +// aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & zero_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::zero_out::call(self, out); +} + +// aten::zero(Tensor self) -> Tensor +inline at::Tensor zero(const at::Tensor & self) { + return at::_ops::zero::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..c1771b96ff4094c38527df0e3e08e3796637654a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::xpu::detail { + +// The real implementation of XPUHooksInterface +struct XPUHooks : public at::XPUHooksInterface { + XPUHooks(at::XPUHooksArgs) {} + void init() const override; + bool hasXPU() const override; + std::string showConfig() const override; + int32_t getGlobalIdxFromDevice(const at::Device& device) const override; + const Generator& getDefaultGenerator( + DeviceIndex device_index = -1) const override; + Generator getNewGenerator(DeviceIndex device_index = -1) const override; + Device getDeviceFromPtr(void* data) const override; + c10::DeviceIndex getNumGPUs() const override; + DeviceIndex current_device() const override; + void deviceSynchronize(DeviceIndex device_index) const override; + Allocator* getPinnedMemoryAllocator() const override; + + bool isBuilt() const override { + return true; + } + bool isAvailable() const override; + bool isPinnedPtr(const void* data) const override; + bool hasPrimaryContext(DeviceIndex device_index) const override; + DeviceIndex deviceCount() const override; + DeviceIndex getCurrentDevice() const override; +}; + +} // namespace at::xpu::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Allocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..b66f075ec73fb77290e317e911c66e4497ca1469 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Allocator.h @@ -0,0 +1,455 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +using CaptureId_t = unsigned long long; +// first is set if the instance is created by CUDAGraph::capture_begin. +// second is set if the instance is created by at::cuda::graph_pool_handle. +using MempoolId_t = std::pair; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + +// A DataPtr is a unique pointer (with an attached deleter and some +// context for the deleter) to some memory, which also records what +// device is for its data. +// +// nullptr DataPtrs can still have a nontrivial device; this allows +// us to treat zero-size allocations uniformly with non-zero allocations. 
+// +class C10_API DataPtr { + private: + c10::detail::UniqueVoidPtr ptr_; + Device device_; + + public: + // Choice of CPU here is arbitrary; if there's an "undefined" device + // we could use that too + DataPtr() : device_(DeviceType::CPU) {} + DataPtr(void* data, Device device) : ptr_(data), device_(device) {} + DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) + : ptr_(data, ctx, ctx_deleter), device_(device) {} + void* operator->() const { + return ptr_.get(); + } + C10_ALWAYS_INLINE bool /* success */ unsafe_reset_data_and_ctx( + void* new_data_and_ctx) { + return ptr_.unsafe_reset_data_and_ctx(new_data_and_ctx); + } + void clear() { + ptr_.clear(); + } + void* get() const { + return ptr_.get(); + } + void* mutable_get() { + return ptr_.get(); + } + void* get_context() const { + return ptr_.get_context(); + } + void* release_context() { + return ptr_.release_context(); + } + std::unique_ptr&& move_context() { + return ptr_.move_context(); + } + operator bool() const { + return static_cast(ptr_); + } + template + T* cast_context(DeleterFnPtr expected_deleter) const { + return ptr_.cast_context(expected_deleter); + } + DeleterFnPtr get_deleter() const { + return ptr_.get_deleter(); + } + /** + * Compare the deleter in a DataPtr to expected_deleter. + * If it matches, replace the deleter with new_deleter + * and return true; otherwise, does nothing and returns + * false. + * + * In general, it is not safe to unconditionally set the + * deleter on a DataPtr, because you don't know what + * the deleter is, and thus will have a hard time properly + * disposing of the deleter without storing the original + * deleter (this is difficult to do, because DeleterFnPtr + * is not a closure, and because the context on DataPtr is + * only a single word, you generally don't have enough + * space to store both the original deleter and its context). 
+ * However, in some cases, you know /exactly/ what the deleter + * is, and you have a new deleter that manually wraps + * the old one. In this case, you can safely swap the deleter + * after asserting that the deleters line up. + * + * What are the requirements on new_deleter? It must still + * properly dispose of the void* pointer passed in as its argument, + * where void* is whatever the context of the original deleter + * is. So in general, you expect the new deleter to look something + * like this: + * + * [](void* ptr) { + * some_new_stuff(ptr); + * get_orig_allocator()->raw_deleter(ptr); + * } + * + * Note that it won't work to close over the original + * allocator; you don't have enough space to do that! Also, + * it's unsafe to assume that the passed in pointer in + * question is the memory pointer in question; it might not + * be; be sure to read the source code of the Allocator + * in question to confirm this. + */ + [[nodiscard]] bool compare_exchange_deleter( + DeleterFnPtr expected_deleter, + DeleterFnPtr new_deleter) { + return ptr_.compare_exchange_deleter(expected_deleter, new_deleter); + } + Device device() const { + return device_; + } + // Unsafely mutates the device on a DataPtr. Under normal use, + // you should never actually need to call this function. 
+ // We need this for the implementation of the hack detailed + // in Note [Masquerading as CUDA] + void unsafe_set_device(Device device) { + device_ = device; + } +}; + +// NB: Device is NOT tested for here; a CUDA nullptr is as much a nullptr as a +// CPU nullptr + +inline bool operator==(const DataPtr& dp, std::nullptr_t) noexcept { + return !dp; +} +inline bool operator==(std::nullptr_t, const DataPtr& dp) noexcept { + return !dp; +} +inline bool operator!=(const DataPtr& dp, std::nullptr_t) noexcept { + return dp; +} +inline bool operator!=(std::nullptr_t, const DataPtr& dp) noexcept { + return dp; +} + +// Note [raw_allocate/raw_deallocate and Thrust] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Thrust's support for custom allocators requires us to write something +// like this: +// +// class ThrustAllocator { +// char* allocate(size_t); +// void deallocate(char*, size_t); +// }; +// +// This is not good for our unique_ptr based allocator interface, as +// there is no way to get to the context when we free. +// +// However, in some cases the context is exactly the same as +// the data pointer. In this case, we can support the "raw" +// allocate and deallocate interface. This is what +// raw_deleter signifies. By default, it returns a nullptr, which means that +// the raw interface is not implemented. Be sure to implement it whenever +// possible, or the raw interface will incorrectly reported as unsupported, +// when it is actually possible. + +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +struct C10_API Allocator { + virtual ~Allocator() = default; + + virtual DataPtr allocate(size_t n) = 0; + + // Clones an allocation that came from this allocator. + // + // To perform the copy, this function calls `copy_data`, which + // must be implemented by derived classes. + // + // Note that this explicitly ignores any context that may have been + // attached to the input data. 
+ // + // Requires: input data was allocated by the same allocator. + DataPtr clone(const void* data, std::size_t n); + + // Checks if DataPtr has a simple context, not wrapped with any out of the + // ordinary contexts. + virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const; + + // If this returns a non nullptr, it means that allocate() + // is guaranteed to return a unique_ptr with this deleter attached; + // it means the rawAllocate and rawDeallocate APIs are safe to use. + // This function MUST always return the same BoundDeleter. + virtual DeleterFnPtr raw_deleter() const { + return nullptr; + } + void* raw_allocate(size_t n) { + auto dptr = allocate(n); + AT_ASSERT(dptr.get() == dptr.get_context()); + return dptr.release_context(); + } + void raw_deallocate(void* ptr) { + auto d = raw_deleter(); + AT_ASSERT(d); + d(ptr); + } + + // Copies data from one allocation to another. + // Pure virtual, so derived classes must define behavior. + // Derived class implementation can simply call `default_copy_data` + // to use `std::memcpy`. + // + // Requires: src and dest were allocated by this allocator + // Requires: src and dest both have length >= count + virtual void copy_data(void* dest, const void* src, std::size_t count) + const = 0; + + protected: + // Uses `std::memcpy` to copy data. + // Child classes can use this as `copy_data` when an alternative copy + // API is not needed. + void default_copy_data(void* dest, const void* src, std::size_t count) const; +}; + +// This context is used to generate DataPtr which have arbitrary +// std::function deleters associated with them. In some user facing +// functions, we give a (user-friendly) interface for constructing +// tensors from external data which take an arbitrary std::function +// deleter. Grep for InefficientStdFunctionContext to find these +// occurrences. 
+// +// This context is inefficient because we have to do a dynamic +// allocation InefficientStdFunctionContext, on top of the dynamic +// allocation which is implied by std::function itself. +struct C10_API InefficientStdFunctionContext { + void* ptr_{nullptr}; + std::function deleter_; + InefficientStdFunctionContext(void* ptr, std::function deleter) + : ptr_(ptr), deleter_(std::move(deleter)) {} + InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete; + InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept + : ptr_(std::exchange(rhs.ptr_, nullptr)), + deleter_(std::move(rhs.deleter_)) {} + InefficientStdFunctionContext& operator=( + const InefficientStdFunctionContext&) = delete; + // NOLINTNEXTLINE(*-noexcept-move-*) + InefficientStdFunctionContext& operator=( + InefficientStdFunctionContext&& rhs) { + this->~InefficientStdFunctionContext(); + ptr_ = std::exchange(rhs.ptr_, nullptr); + deleter_ = std::move(rhs.deleter_); + return *this; + } + ~InefficientStdFunctionContext() { + if (deleter_) { + deleter_(ptr_); + } + } + static DataPtr makeDataPtr( + void* ptr, + std::function deleter, + Device device); +}; + +/** Set the allocator for DeviceType `t`. The passed in allocator pointer is + * expected to have static lifetime; this function does NOT take ownership + * of the raw pointer. (The reason for this is to prevent existing pointers + * to an allocator of a particular device from being invalidated when + * SetAllocator is called.) + * + * Also note that this is not thread-safe, and we assume this function will + * only be called during initialization. + * + * The 'priority' flag is introduced when we want to overwrite the default + * allocator, since the allocators are set statically. The default priority + * is 0, which means the lowest. Only higher or equal priority can overwrite + * existing ones. 
+ */ +C10_API void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0); +C10_API Allocator* GetAllocator(const DeviceType& t); + +template +struct AllocatorRegisterer { + explicit AllocatorRegisterer(Allocator* alloc) { + SetAllocator(t, alloc); + } +}; + +#define REGISTER_ALLOCATOR(t, f) \ + namespace { \ + static c10::AllocatorRegisterer g_allocator_d(f); \ + } + +// An interface for reporting thread local memory usage +// per device +struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase { + /** + * alloc_size corresponds to the size of the ptr. + * + * total_allocated corresponds to total allocated memory. + * + * total_reserved corresponds to total size of memory pool, both used and + * unused, if applicable. + */ + virtual void reportMemoryUsage( + void* ptr, + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device) = 0; + + virtual void reportOutOfMemory( + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device); + + virtual bool memoryProfilingEnabled() const = 0; +}; + +C10_API bool memoryProfilingEnabled(); +C10_API void reportMemoryUsageToProfiler( + void* ptr, + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device); + +C10_API void reportOutOfMemoryToProfiler( + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device); + +// used to hold traceback information in allocators +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +struct GatheredContext { + virtual ~GatheredContext() = default; +}; + +namespace CachingAllocator { +struct Stat { + void increase(size_t amount) { + current += static_cast(amount); + peak = std::max(current, peak); + allocated += static_cast(amount); + } + + void decrease(size_t amount) { + current -= static_cast(amount); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + current >= 0, + "Negative tracked stat in device allocator (likely logic error)."); + freed += 
static_cast(amount); + } + + void reset_accumulated() { + allocated = 0; + freed = 0; + } + + void reset_peak() { + peak = current; + } + + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; +}; + +enum struct StatType : uint64_t { + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added +}; + +using StatArray = std::array(StatType::NUM_TYPES)>; +using StatTypes = std::array(StatType::NUM_TYPES)>; + +template +void for_each_selected_stat_type(const StatTypes& stat_types, Func f) { + for (const auto stat_type : c10::irange(stat_types.size())) { + if (stat_types[stat_type]) { + f(stat_type); + } + } +} + +// Structure for keeping timing information +struct DurationStat { + void increase(int64_t amount) { + total += amount; + count += 1; + max = std::max(amount, max); + if (min == 0) { + min = amount; + } else { + min = std::min(amount, min); + } + } + + void reset_accumulated() { + total = 0; + count = 0; + } + + void reset_peak() { + min = 0; + max = 0; + } + + int64_t total = 0; + int64_t max = 0; + int64_t min = 0; + int64_t count = 0; +}; + +// Size pretty-printer +inline std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024) { + os << size << " bytes"; + } else if (size <= 1048576) { + os << (static_cast(size) / 1024.0); + os << " KiB"; + } else if (size <= 1073741824ULL) { + os << static_cast(size) / 1048576.0; + os << " MiB"; + } else { + os << static_cast(size) / 1073741824.0; + os << " GiB"; + } + return os.str(); +} + +} // namespace CachingAllocator +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AllocatorConfig.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AllocatorConfig.h new file mode 100644 index 0000000000000000000000000000000000000000..ab6a23d24d0884d72c869947857c02c22584b9c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AllocatorConfig.h @@ -0,0 +1,390 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10::CachingAllocator { + +// "large" allocations may be packed in 20 MiB blocks +constexpr size_t kLargeBuffer = 20971520; +// "small" allocations are packed in 2 MiB blocks +constexpr size_t kSmallBuffer = 2097152; +// all sizes are rounded to at least 512 bytes +constexpr size_t kMinBlockSize = 512; +// largest "small" allocation is 1 MiB +constexpr size_t kSmallSize = 1048576; +// allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kMinLargeAlloc = 10485760; +// round up large allocations to 2 MiB +constexpr size_t kRoundLarge = 2097152; + +// A utility class for tokenizing allocator configuration strings into discrete +// parts. For example, the config string: +// "key1:val1,key2:[val2,val3]" +// is tokenized into: +// "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]", +// +// Tokens include keys, values, and special characters (':', ',', '[', ']'). +// Whitespace is ignored. 
+class ConfigTokenizer { + public: + explicit ConfigTokenizer(const std::string& env) { + std::string buffer; + for (char ch : env) { + if (ch == ',' || ch == ':' || ch == '[' || ch == ']') { + if (!buffer.empty()) { + config_.emplace_back(std::move(buffer)); + buffer.clear(); + } + config_.emplace_back(1, ch); + } else if (!std::isspace(static_cast(ch))) { + buffer += ch; + } + } + if (!buffer.empty()) { + config_.emplace_back(std::move(buffer)); + } + } + + const std::string& operator[](size_t i) const { + TORCH_INTERNAL_ASSERT( + i < config_.size(), "Index out of bounds in ConfigTokenizer"); + return config_[i]; + } + + size_t size() const { + return config_.size(); + } + + bool checkToken(size_t i, const std::string& token) const { + checkIndex(i); + return config_[i] == token; + } + + size_t toSizeT(size_t i) const { + checkIndex(i); + return std::stoull(config_[i]); + } + + double toDouble(size_t i) const { + checkIndex(i); + return std::stod(config_[i]); + } + + bool toBool(size_t i) const { + checkIndex(i); + const auto& token = config_[i]; + if (token == "True") { + return true; + } else if (token == "False") { + return false; + } else { + TORCH_CHECK_VALUE( + false, + "Expected 'True' or 'False' at index ", + i, + " in ConfigTokenizer but got '", + token, + "'"); + } + } + + // Skips the current token group and returns the index of the value token. + // Assumes the current index `i` points to a key name in a key-value pair. 
+ size_t skipKey(size_t i) const { + // Expect a colon after the key + checkToken(++i, ":"); + + ++i; // Move to the value + checkIndex(i); + if (config_[i] != "[") { + // Value is a single token (not a list) -> return its index + return i; + } + + // Skip tokens inside the list until matching ']' + // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions) + while (++i < config_.size() && config_[i] != "]") { + } + + TORCH_INTERNAL_ASSERT( + i < config_.size(), + "Expected closing bracket ']' in ConfigTokenizer but reached end of config"); + + return i; // Return the index of the closing ']' + } + + private: + void checkIndex(size_t i) const { + TORCH_INTERNAL_ASSERT( + i < config_.size(), "Index out of bounds in ConfigTokenizer"); + } + + std::vector config_; +}; + +/** + * Note [AcceleratorAllocatorConfig design] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * This class configures memory allocation for both device and host memory. A + * single `AcceleratorAllocatorConfig` instance is shared across all accelerator + * backends, such as CUDA and XPU, under the assumption that relevant + * environment variables apply uniformly to all accelerators. Device-specific + * configuration extensions are supported via hooks (see + * `registerDeviceConfigParserHook`). + * + * Recommended design: + * - Place common configurations in `AcceleratorAllocatorConfig`. + * - Extend backend-specific configurations in corresponding device-specific + * classes, such as `CUDAAllocatorConfig`, etc. + * + * Scope: + * - Configuration options must be environment-variable driven. + * + * Naming Convention: + * - Public API names in `AcceleratorAllocatorConfig` should be device-generic. + * - Members prefixed with `pinned_` are specific to the host/pinned allocator. + * - Environment variable names should be generic across backends. + * - Comma-separated key-value pairs in the format: `key:value`. 
Use square + * brackets `[]` for list values Example: `key1:123, key2:[val1,val2]` + * + * Environment Variables: + * - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`. + * - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported + * with lower priority. + */ + +class C10_API AcceleratorAllocatorConfig { + public: + static AcceleratorAllocatorConfig& instance(); + + C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig); + AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete; + AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete; + ~AcceleratorAllocatorConfig() = default; + + /* Device allocator settings */ + + // Returns the maximum block size (in MB) that is allowed to be split. The + // default is unlimited (all blocks can be split). + static size_t max_split_size() { + return instance().max_split_size_; + } + + // Returns the maximum block size (in MB) that is allowed to be rounded up + // without requiring splitting when searching for a free block. The default is + // 20 MiB. + static size_t max_non_split_rounding_size() { + return instance().max_non_split_rounding_size_; + } + + // Return the number of divisions used when rounding up allocation sizes (in + // MB) to the nearest power-of-2 boundary. + static size_t roundup_power2_divisions(size_t size); + + // Returns the vector of division factors used for rounding up allocation + // sizes. These divisions apply to size intervals between 1MB and 64GB. + static const std::vector& roundup_power2_divisions() { + return instance().roundup_power2_divisions_; + } + + // Returns the threshold that triggers garbage collection when the ratio of + // used memory to maximum allowed memory exceeds this value. The default is 0, + // meaning no garbage collection is triggered. The value should be in the + // range (0.0, 1.0). 
+ static double garbage_collection_threshold() { + return instance().garbage_collection_threshold_; + } + + // Returns whether the expandable segment feature is enabled. This allows the + // allocator to start with one segment that grows as needed, rather than + // creating a new segment for each allocation. Default is false (expandable + // segments disabled). + static bool use_expandable_segments() { + return instance().use_expandable_segments_; + } + + /* Host allocator settings */ + + // Returns whether the pinned host allocator uses background threads for + // processing events. This is useful for improving performance in scenarios + // where many small allocations are made. Default is false (background threads + // disabled). + static bool pinned_use_background_threads() { + return instance().pinned_use_background_threads_; + } + + /* Settings for both device and host allocator */ + + // Returns the current allocator settings as a string. This string is useful + // to expand device-specific allocator configurations + static std::string last_allocator_settings() { + std::lock_guard lock(instance().last_allocator_settings_mutex_); + return instance().last_allocator_settings_; + } + + // Use `Construct On First Use Idiom` to avoid `Static Initialization Order` + // issue. + static std::unordered_set& getMutableKeys() { + static std::unordered_set keys{ + "max_split_size_mb", + "max_non_split_rounding_mb", + "garbage_collection_threshold", + "roundup_power2_divisions", + "expandable_segments", + "pinned_use_background_threads"}; + return keys; + } + + // Returns the set of valid keys for the allocator configuration. + // This set is used to validate the presence and correctness of keys in + // device-specific configuration parsers. + static const std::unordered_set& getKeys() { + return getMutableKeys(); + } + + // Registers a device-specific configuration parser hook and its key. 
This + // allows backends to parse additional device-specific configuration options + // from the environment variable. The hook should be a function that takes a + // string (the environment variable value) and parses it to set + // device-specific configuration options. The hook will be called when the + // environment variable is parsed. If a hook is already registered, it will be + // replaced with the new one. + static void registerDeviceConfigParserHook( + std::function&& hook, + const std::unordered_set& keys) { + device_config_parser_hook_ = std::move(hook); + auto& mutable_keys = getMutableKeys(); + for (auto& key : keys) { + TORCH_CHECK_VALUE( + mutable_keys.insert(key).second, + "Duplicated key '", + key, + "' found in device-specific configuration parser hook registration"); + } + } + + // Calls the registered device-specific configuration parser hook with the + // provided environment string. This allows backends to parse additional + // device-specific configuration options from the environment variable. + // If no hook is registered, this function does nothing. + static void callDeviceConfigParserHook(const std::string& env) { + if (device_config_parser_hook_) { + device_config_parser_hook_(env); + } + } + + // Parses the environment variable `env` to update the allocator settings. + // If the environment variable is not set, it does nothing. + // The configuration string should be a comma-separated list of key-value + // pairs, where each key is a configuration option and the value is the + // corresponding setting. For example: + // "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true" + void parseArgs(const std::string& env); + + private: + AcceleratorAllocatorConfig(); + + /* Internal functions for device allocator */ + + // Parse `max_split_size_mb` from environment variable. 
+ size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i); + // Parse `max_non_split_rounding_mb` from environment variable. + size_t parseMaxNonSplitRoundingSize( + const ConfigTokenizer& tokenizer, + size_t i); + // Parse `garbage_collection_threshold` from environment variable. + size_t parseGarbageCollectionThreshold( + const ConfigTokenizer& tokenizer, + size_t i); + // Parse `roundup_power2_divisions` from environment variable. + size_t parseRoundUpPower2Divisions( + const ConfigTokenizer& tokenizer, + size_t i); + // Parse `expandable_segments` from environment variable. + size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i); + + /* Internal functions for host allocator */ + + // Parse `pinned_use_background_threads` from environment variable. + size_t parsePinnedUseBackgroundThreads( + const ConfigTokenizer& tokenizer, + size_t i); + + /* The following members are specifically used for the device allocator. */ + + // The maximum block size that is allowed to be split. + std::atomic max_split_size_{std::numeric_limits::max()}; + // The maximum allowable extra size of a memory block without requiring + // splitting when searching for a free block. + std::atomic max_non_split_rounding_size_{kLargeBuffer}; + // Used to store how memory allocations of different sizes should be rounded + // up to the nearest power of 2 divisions. + std::vector roundup_power2_divisions_; + // The threshold that triggers garbage collection when the ratio of used + // memory to maximum allowed memory exceeds this value. + std::atomic garbage_collection_threshold_{0}; + // A flag to enable expandable segments feature. + std::atomic use_expandable_segments_{false}; + + /* The following members are specifically used for the host allocator. */ + + // A flag to enable background thread for processing events. + std::atomic pinned_use_background_threads_{false}; + + /* The following members are used for both device and host allocator. 
*/ + + // Record the last allocator config environment setting. + std::mutex last_allocator_settings_mutex_; + std::string last_allocator_settings_; + + // Optional hook for parsing additional device-specific allocator settings. + // This allows backends (e.g., CUDA, XPU) to register a custom parser for + // their own environment configuration extensions. + inline static std::function + device_config_parser_hook_{nullptr}; +}; + +C10_API inline void setAllocatorSettings(const std::string& env) { + AcceleratorAllocatorConfig::instance().parseArgs(env); + AcceleratorAllocatorConfig::callDeviceConfigParserHook(env); +} + +C10_API inline std::string getAllocatorSettings() { + return AcceleratorAllocatorConfig::instance().last_allocator_settings(); +} + +struct DeviceConfigParserHookRegistry { + explicit DeviceConfigParserHookRegistry( + std::function&& hook, + const std::unordered_set& keys) { + // Use static method to avoid static initialization order fiasco issues + AcceleratorAllocatorConfig::registerDeviceConfigParserHook( + std::move(hook), keys); + } +}; + +// Assume each config parser has `parseArgs` and `getKeys` methods +#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(parser_cls) \ + namespace { \ + static at::CachingAllocator::DeviceConfigParserHookRegistry \ + g_device_config_parse_hook_registry_instance( \ + [](const std::string& env) { \ + parser_cls::instance().parseArgs(env); \ + }, \ + parser_cls::getKeys()); \ + } + +} // namespace c10::CachingAllocator + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AutogradState.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AutogradState.h new file mode 100644 index 0000000000000000000000000000000000000000..9d596b01d233dad00702dcad5269f146672861c5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AutogradState.h @@ -0,0 +1,90 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +// Structure used to pack all the thread local boolean +// flags used by autograd +struct C10_API AutogradState { + static AutogradState& get_tls_state(); + static void set_tls_state(AutogradState state); + + AutogradState( + bool grad_mode, + bool inference_mode, + bool fw_grad_mode, + bool multithreading_enabled) + : graph_exec_group_(std::nullopt), + grad_mode_(grad_mode), + inference_mode_(inference_mode), + fw_grad_mode_(fw_grad_mode), + multithreading_enabled_(multithreading_enabled), + view_replay_enabled_(false) {} + + void set_grad_mode(bool enabled) { + grad_mode_ = enabled; + } + + void set_fw_grad_mode(bool enabled) { + fw_grad_mode_ = enabled; + } + + void set_inference_mode(bool enabled) { + inference_mode_ = enabled; + } + + void set_multithreading_enabled(bool multithreading_enabled) { + multithreading_enabled_ = multithreading_enabled; + } + + void set_view_replay_enabled(bool view_replay_enabled) { + view_replay_enabled_ = view_replay_enabled; + } + + void set_graph_exec_group(std::optional group) { + graph_exec_group_ = std::move(group); + } + + bool get_grad_mode() const { + return grad_mode_; + } + + bool get_fw_grad_mode() const { + return fw_grad_mode_; + } + + bool get_inference_mode() const { + return inference_mode_; + } + + bool get_multithreading_enabled() const { + return 
multithreading_enabled_; + } + + bool get_view_replay_enabled() const { + return view_replay_enabled_; + } + + const std::optional& get_graph_exec_group() const { + return graph_exec_group_; + } + + private: + std::optional graph_exec_group_; + bool grad_mode_ : 1; + bool inference_mode_ : 1; + bool fw_grad_mode_ : 1; + bool multithreading_enabled_ : 1; + // NOLINTNEXTLINE(cppcoreguidelines-use-default-member-init) + bool view_replay_enabled_ : 1; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Backend.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Backend.h new file mode 100644 index 0000000000000000000000000000000000000000..d26c0089ae024b876be0df2821e3f562737ff35d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Backend.h @@ -0,0 +1,414 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + +namespace c10 { + +/** + * This legacy enum class defines the set of backends supported by old school, + * code generated Type-based ATen. A "backend" in this sense roughly + * corresponds to the cartesian product of (device type, layout), but restricted + * only to combinations which we actually have kernels for. Backend does NOT + * include dtype. + * + * The reason we are sunsetting this enum class is because it doesn't allow for + * open registration; e.g., if you want to add SparseXLA, you'd have to + * edit this enum; you wouldn't be able to do it out of tree. DispatchKey is + * the replacement for Backend which supports open registration. 
+ * + * NB: The concept of 'Backend' here disagrees with the notion of backend + * exposed to users in torch.backends. Backend here is something like "CPU" + * or "SparseCUDA"; backend in torch.backends is something like "MKL" or + * "CUDNN". + */ +enum class Backend { + CPU, + CUDA, + HIP, + VE, + FPGA, + IPU, + XPU, + SparseCPU, + SparseCUDA, + SparseCsrCPU, + SparseCsrCUDA, + SparseCsrMPS, + SparseMPS, + SparseHIP, + SparseVE, + SparseXPU, + SparsePrivateUse1, + SparseCsrHIP, + SparseCsrVE, + SparseCsrXPU, + SparseCsrPrivateUse1, + MAIA, + XLA, + Vulkan, + Metal, + Meta, + QuantizedCPU, + QuantizedCUDA, + QuantizedXPU, + QuantizedPrivateUse1, + Undefined, + MkldnnCPU, + MPS, + HPU, + Lazy, + MTIA, + PrivateUse1, + NumOptions +}; + +inline Backend dispatchKeyToBackend(DispatchKey t) { + if (t == DispatchKey::CPU || t == DispatchKey::AutogradCPU) { + return Backend::CPU; + } else if (t == DispatchKey::CUDA || t == DispatchKey::AutogradCUDA) { + return Backend::CUDA; + } else if (t == DispatchKey::HIP) { + return Backend::HIP; + } else if (t == DispatchKey::VE) { + return Backend::VE; + } else if (t == DispatchKey::FPGA) { + return Backend::FPGA; + } else if (t == DispatchKey::MAIA || t == DispatchKey::AutogradMAIA) { + return Backend::MAIA; + } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) { + return Backend::XLA; + } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) { + return Backend::Lazy; + } else if (t == DispatchKey::MPS || t == DispatchKey::AutogradMPS) { + return Backend::MPS; + } else if (t == DispatchKey::Vulkan) { + return Backend::Vulkan; + } else if (t == DispatchKey::Metal) { + return Backend::Metal; + } else if (t == DispatchKey::Meta) { + return Backend::Meta; + } else if (t == DispatchKey::SparseCPU) { + return Backend::SparseCPU; + } else if (t == DispatchKey::SparseCUDA) { + return Backend::SparseCUDA; + } else if (t == DispatchKey::SparseMPS) { + return Backend::SparseMPS; + } else if (t == 
DispatchKey::SparseCsrMPS) { + return Backend::SparseCsrMPS; + } else if (t == DispatchKey::SparseHIP) { + return Backend::SparseHIP; + } else if (t == DispatchKey::SparseVE) { + return Backend::SparseVE; + } else if (t == DispatchKey::SparsePrivateUse1) { + return Backend::SparsePrivateUse1; + } else if (t == DispatchKey::SparseCsrCPU) { + return Backend::SparseCsrCPU; + } else if (t == DispatchKey::SparseCsrCUDA) { + return Backend::SparseCsrCUDA; + } else if (t == DispatchKey::SparseCsrHIP) { + return Backend::SparseCsrHIP; + } else if (t == DispatchKey::SparseCsrVE) { + return Backend::SparseCsrVE; + } else if (t == DispatchKey::SparseCsrPrivateUse1) { + return Backend::SparseCsrPrivateUse1; + } else if (t == DispatchKey::MkldnnCPU) { + return Backend::MkldnnCPU; + } else if (t == DispatchKey::QuantizedCPU) { + return Backend::QuantizedCPU; + } else if (t == DispatchKey::QuantizedCUDA) { + return Backend::QuantizedCUDA; + } else if (t == DispatchKey::IPU || t == DispatchKey::AutogradIPU) { + return Backend::IPU; + } else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) { + return Backend::XPU; + } else if (t == DispatchKey::SparseXPU) { + return Backend::SparseXPU; + } else if (t == DispatchKey::SparseCsrXPU) { + return Backend::SparseCsrXPU; + } else if (t == DispatchKey::QuantizedXPU) { + return Backend::QuantizedXPU; + } else if (t == DispatchKey::QuantizedPrivateUse1) { + return Backend::QuantizedPrivateUse1; + } else if (t == DispatchKey::HPU || t == DispatchKey::AutogradHPU) { + return Backend::HPU; + } else if (t == DispatchKey::MTIA || t == DispatchKey::AutogradMTIA) { + return Backend::MTIA; + } else if ( + t == DispatchKey::PrivateUse1 || t == DispatchKey::AutogradPrivateUse1) { + return Backend::PrivateUse1; + } else if (t == DispatchKey::Undefined) { + return Backend::Undefined; + } else { + TORCH_CHECK(false, "Unrecognized tensor type ID: ", t); + } +} + +inline DispatchKey backendToDispatchKey(Backend b) { + switch (b) { + case 
Backend::CPU: + return DispatchKey::CPU; + case Backend::CUDA: + return DispatchKey::CUDA; + case Backend::HIP: + return DispatchKey::HIP; + case Backend::VE: + return DispatchKey::VE; + case Backend::FPGA: + return DispatchKey::FPGA; + case Backend::MAIA: + return DispatchKey::MAIA; + case Backend::XLA: + return DispatchKey::XLA; + case Backend::Lazy: + return DispatchKey::Lazy; + case Backend::IPU: + return DispatchKey::IPU; + case Backend::XPU: + return DispatchKey::XPU; + case Backend::SparseXPU: + return DispatchKey::SparseXPU; + case Backend::SparseCsrXPU: + return DispatchKey::SparseCsrXPU; + case Backend::SparseCPU: + return DispatchKey::SparseCPU; + case Backend::SparseCUDA: + return DispatchKey::SparseCUDA; + case Backend::SparseMPS: + return DispatchKey::SparseMPS; + case Backend::SparseCsrMPS: + return DispatchKey::SparseCsrMPS; + case Backend::SparseHIP: + return DispatchKey::SparseHIP; + case Backend::SparseVE: + return DispatchKey::SparseVE; + case Backend::SparsePrivateUse1: + return DispatchKey::SparsePrivateUse1; + case Backend::SparseCsrCPU: + return DispatchKey::SparseCsrCPU; + case Backend::SparseCsrCUDA: + return DispatchKey::SparseCsrCUDA; + case Backend::SparseCsrHIP: + return DispatchKey::SparseCsrHIP; + case Backend::SparseCsrVE: + return DispatchKey::SparseCsrVE; + case Backend::SparseCsrPrivateUse1: + return DispatchKey::SparseCsrPrivateUse1; + case Backend::MkldnnCPU: + return DispatchKey::MkldnnCPU; + case Backend::Vulkan: + return DispatchKey::Vulkan; + case Backend::Metal: + return DispatchKey::Metal; + case Backend::Meta: + return DispatchKey::Meta; + case Backend::QuantizedCPU: + return DispatchKey::QuantizedCPU; + case Backend::QuantizedCUDA: + return DispatchKey::QuantizedCUDA; + case Backend::QuantizedPrivateUse1: + return DispatchKey::QuantizedPrivateUse1; + case Backend::Undefined: + return DispatchKey::Undefined; + case Backend::MPS: + return DispatchKey::MPS; + case Backend::HPU: + return DispatchKey::HPU; + case 
Backend::MTIA: + return DispatchKey::MTIA; + case Backend::PrivateUse1: + return DispatchKey::PrivateUse1; + default: + TORCH_CHECK(false, "Unknown backend"); + } +} + +inline DeviceType backendToDeviceType(Backend b) { + switch (b) { + case Backend::CPU: + case Backend::MkldnnCPU: + case Backend::SparseCPU: + case Backend::SparseCsrCPU: + case Backend::QuantizedCPU: + return DeviceType::CPU; + case Backend::CUDA: + case Backend::SparseCUDA: + case Backend::QuantizedCUDA: + case Backend::SparseCsrCUDA: + return DeviceType::CUDA; + case Backend::HIP: + return DeviceType::HIP; + case Backend::VE: + return DeviceType::VE; + case Backend::FPGA: + return DeviceType::FPGA; + case Backend::MAIA: + return DeviceType::MAIA; + case Backend::XLA: + return DeviceType::XLA; + case Backend::Lazy: + return DeviceType::Lazy; + case Backend::SparseHIP: + return DeviceType::HIP; + case Backend::SparseVE: + return DeviceType::VE; + case Backend::SparseCsrHIP: + return DeviceType::HIP; + case Backend::SparseCsrVE: + return DeviceType::VE; + case Backend::IPU: + return DeviceType::IPU; + case Backend::XPU: + case Backend::SparseXPU: + case Backend::SparseCsrXPU: + case Backend::QuantizedXPU: + return DeviceType::XPU; + case Backend::Vulkan: + return DeviceType::Vulkan; + case Backend::Metal: + return DeviceType::Metal; + case Backend::Meta: + return DeviceType::Meta; + case Backend::MPS: + case Backend::SparseMPS: + case Backend::SparseCsrMPS: + return DeviceType::MPS; + case Backend::HPU: + return DeviceType::HPU; + case Backend::MTIA: + return DeviceType::MTIA; + case Backend::PrivateUse1: + case Backend::SparsePrivateUse1: + case Backend::SparseCsrPrivateUse1: + case Backend::QuantizedPrivateUse1: + return DeviceType::PrivateUse1; + case Backend::Undefined: + TORCH_CHECK(false, "Undefined backend is not a valid device type"); + default: + TORCH_CHECK(false, "Unknown backend"); + } +} + +inline const char* toString(Backend b) { + switch (b) { + case Backend::CPU: + return "CPU"; + 
case Backend::CUDA: + return "CUDA"; + case Backend::HIP: + return "HIP"; + case Backend::VE: + return "VE"; + case Backend::FPGA: + return "FPGA"; + case Backend::XPU: + return "XPU"; + case Backend::IPU: + return "IPU"; + case Backend::MAIA: + return "MAIA"; + case Backend::XLA: + return "XLA"; + case Backend::Lazy: + return "Lazy"; + case Backend::MPS: + return "MPS"; + case Backend::SparseCPU: + return "SparseCPU"; + case Backend::SparseCUDA: + return "SparseCUDA"; + case Backend::SparseMPS: + return "SparseMPS"; + case Backend::SparseCsrMPS: + return "SparseCsrMPS"; + case Backend::SparseHIP: + return "SparseHIP"; + case Backend::SparseVE: + return "SparseVE"; + case Backend::SparseXPU: + return "SparseXPU"; + case Backend::SparsePrivateUse1: + return "SparsePrivateUse1"; + case Backend::SparseCsrCPU: + return "SparseCsrCPU"; + case Backend::SparseCsrCUDA: + return "SparseCsrCUDA"; + case Backend::SparseCsrHIP: + return "SparseCsrHIP"; + case Backend::SparseCsrVE: + return "SparseCsrVE"; + case Backend::SparseCsrXPU: + return "SparseCsrXPU"; + case Backend::SparseCsrPrivateUse1: + return "SparseCsrPrivateUse1"; + case Backend::MkldnnCPU: + return "MkldnnCPU"; + case Backend::Vulkan: + return "Vulkan"; + case Backend::Metal: + return "Metal"; + case Backend::Meta: + return "Meta"; + case Backend::QuantizedCPU: + return "QuantizedCPU"; + case Backend::QuantizedCUDA: + return "QuantizedCUDA"; + case Backend::QuantizedXPU: + return "QuantizedXPU"; + case Backend::QuantizedPrivateUse1: + return "QuantizedPrivateUse1"; + case Backend::HPU: + return "HPU"; + case Backend::MTIA: + return "MTIA"; + case Backend::PrivateUse1: + return "PrivateUseOne"; + default: + return "UNKNOWN_BACKEND"; + } +} + +inline bool isSparse(Backend b) { + switch (b) { + case Backend::SparseXPU: + case Backend::SparseCPU: + case Backend::SparseCUDA: + case Backend::SparseMPS: + case Backend::SparseHIP: + case Backend::SparseVE: + case Backend::SparsePrivateUse1: + return true; + default: + 
return false; + } +} + +inline bool isSparseCsr(Backend b) { + switch (b) { + case Backend::SparseCsrXPU: + case Backend::SparseCsrCPU: + case Backend::SparseCsrCUDA: + case Backend::SparseCsrHIP: + case Backend::SparseCsrVE: + case Backend::SparseCsrPrivateUse1: + return true; + default: + return false; + } +} + +} // namespace c10 + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CPUAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CPUAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..d43d48e32ee794092b23a488cbb8518a6d5d2623 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CPUAllocator.h @@ -0,0 +1,64 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +// TODO: rename to c10 +C10_DECLARE_bool(caffe2_report_cpu_memory_usage); + +namespace c10 { + +using MemoryDeleter = void (*)(void*); + +// A helper function that is basically doing nothing. +C10_API void NoDelete(void* /*unused*/); + +// A simple struct that is used to report C10's memory allocation, +// deallocation status and out-of-memory events to the profiler +class C10_API ProfiledCPUMemoryReporter { + public: + ProfiledCPUMemoryReporter() = default; + void New(void* ptr, size_t nbytes); + void OutOfMemory(size_t nbytes); + void Delete(void* ptr); + + private: + std::mutex mutex_; + std::unordered_map size_table_; + size_t allocated_ = 0; + size_t log_cnt_ = 0; +}; + +C10_API ProfiledCPUMemoryReporter& profiledCPUMemoryReporter(); + +// Get the CPU Allocator. 
+C10_API at::Allocator* GetCPUAllocator(); +// Sets the CPU allocator to the given allocator: the caller gives away the +// ownership of the pointer. +C10_API void SetCPUAllocator(at::Allocator* alloc, uint8_t priority = 0); + +// Get the Default CPU Allocator +C10_API at::Allocator* GetDefaultCPUAllocator(); + +// Get the Default Mobile CPU Allocator +C10_API at::Allocator* GetDefaultMobileCPUAllocator(); + +// The CPUCachingAllocator is experimental and might disappear in the future. +// The only place that uses it is in StaticRuntime. +// Set the CPU Caching Allocator +C10_API void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority = 0); +// Get the CPU Caching Allocator +C10_API Allocator* GetCPUCachingAllocator(); + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CachingDeviceAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CachingDeviceAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..23b413de834aae788e8f763f60cd75ec7750dbea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CachingDeviceAllocator.h @@ -0,0 +1,126 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::CachingDeviceAllocator { + +using namespace c10::CachingAllocator; + +// Struct containing memory allocator summary statistics for a device. +struct DeviceStats { + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from device memory allocation. 
+ StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be + // released via device memory deallocation) + StatArray inactive_split; + + // SUM: bytes allocated by this memory allocator + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + // SUM: bytes requested by client code + StatArray requested_bytes; + + // COUNT: total number of failed calls to device malloc necessitating cache + // flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. failed calls to device memory allocation + // after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // COUNT: total number of synchronize_and_free_events() calls + int64_t num_sync_all_streams = 0; + + // COUNT: total number of device memory allocation calls. This includes both + // mapped and malloced memory. + int64_t num_device_alloc = 0; + + // COUNT: total number of device memory deallocation calls. This includes both + // un-mapped and free memory. + int64_t num_device_free = 0; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; +}; + +} // namespace c10::CachingDeviceAllocator + +namespace c10 { + +using CaptureId_t = unsigned long long; + +// first is set if the instance is created by Graph mode capture_begin. +// second is set if the instance is created by Graph mode graph_pool_handle. 
+using MempoolId_t = std::pair; + +struct C10_API DeviceAllocator : public c10::Allocator { + DeviceAllocator(); + ~DeviceAllocator() override; + + // Returns true if the allocator has been properly initialized and is ready + // for use + virtual bool initialized() = 0; + + // Releases all cached device memory from the specified memory pool back to + // the system + virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0; + + // Associates a memory allocation with a stream to establish dependency + // tracking. Prevents memory reuse until all operations on the specified + // stream complete + virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0; + + // Retrieves comprehensive memory statistics for the specified device, + // including allocation patterns, usage metrics + virtual CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) = 0; + + // Resets cumulative allocation statistics for the specified device to zero + virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; + + // Resets peak memory usage statistics for the specified device + virtual void resetPeakStats(c10::DeviceIndex device) = 0; + + // Return the free memory size and total memory size in bytes for the + // specified device. + virtual std::pair getMemoryInfo(c10::DeviceIndex device) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "getMemoryInfo is not implemented for this allocator yet."); + } +}; + +// This function is used to get the DeviceAllocator for a specific device type +// and keep backward compatibility with c10::GetAllocator. 
+C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) { + TORCH_CHECK( + t != DeviceType::CPU, + "getDeviceAllocator is not supported for CPU device type."); + auto* allocator = c10::GetAllocator(t); + auto* device_allocator = dynamic_cast(allocator); + TORCH_INTERNAL_ASSERT( + device_allocator, "Allocator for ", t, " is not a DeviceAllocator."); + return device_allocator; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h new file mode 100644 index 0000000000000000000000000000000000000000..28dd52759e8de0f4f2f2947e96ccd0dd7467a95c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h @@ -0,0 +1,62 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { + +/** + * Represent a function pointer as a C++ type. + * This allows using the function pointer as a type + * in a template and calling it from inside the template + * allows the compiler to inline the call because it + * knows the function pointer at compile time. 
+ * + * Example 1: + * int add(int a, int b) {return a + b;} + * using Add = TORCH_FN_TYPE(add); + * template struct Executor { + * int execute(int a, int b) { + * return Func::func_ptr()(a, b); + * } + * }; + * Executor executor; + * EXPECT_EQ(3, executor.execute(1, 2)); + * + * Example 2: + * int add(int a, int b) {return a + b;} + * template int execute(Func, int a, int b) { + * return Func::func_ptr()(a, b); + * } + * EXPECT_EQ(3, execute(TORCH_FN(add), 1, 2)); + */ +template +struct CompileTimeFunctionPointer final { + static_assert( + guts::is_function_type::value, + "TORCH_FN can only wrap function types."); + using FuncType = FuncType_; + + static constexpr FuncType* func_ptr() { + return func_ptr_; + } +}; + +template +struct is_compile_time_function_pointer : std::false_type {}; +template +struct is_compile_time_function_pointer< + CompileTimeFunctionPointer> : std::true_type {}; + +} // namespace c10 + +#define TORCH_FN_TYPE(func) \ + ::c10::CompileTimeFunctionPointer< \ + std::remove_pointer_t>, \ + func> +#define TORCH_FN(func) TORCH_FN_TYPE(func)() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..22a3cf2104d1c55c0d18681906cc4ae9c2c85800 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h @@ -0,0 +1,115 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Unlike other SymNodeImpl, this cannot be "dispatched" conventionally, +// as it typically needs to defer to another SymNodeImpl +// +// Can either represent a bool, int (don't support float yet) this is useful +// for representing otherwise unrepresentable large negative integer constant. 
+template +class C10_API ConstantSymNodeImpl : public SymNodeImpl { + static_assert( + ::std::is_same_v || ::std::is_same_v, + "ConstantSymNodeImpl can only accept int64_t or bool types"); + + public: + ConstantSymNodeImpl(T val) : value_(val) {} + + bool is_int() override { + return is_int_(); + } + bool is_bool() override { + return is_bool_(); + } + bool is_float() override { + return false; + } + int64_t guard_int( + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) override { + TORCH_CHECK(is_int(), "not an int"); + return int_(); + } + bool guard_bool( + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) override { + TORCH_CHECK(is_bool(), "not a bool"); + return bool_(); + } + double guard_float( + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) override { + TORCH_CHECK(false, "not a float"); + } + int64_t int_() override { + TORCH_CHECK(is_int(), "not an int"); + return ::std::get(value_); + } + bool bool_() override { + TORCH_CHECK(is_bool(), "not a bool"); + return ::std::get(value_); + } + bool has_hint() override { + return true; + } + c10::SymNode eq(const c10::SymNode& other) override; + c10::SymNode ne(const c10::SymNode& other) override; + c10::SymNode ge(const c10::SymNode& other) override; + c10::SymNode le(const c10::SymNode& other) override; + c10::SymNode lt(const c10::SymNode& other) override; + c10::SymNode gt(const c10::SymNode& other) override; + c10::SymNode mul(const c10::SymNode& other) override; + ::std::string str() override { + if constexpr (is_int_()) { + return ::std::to_string(::std::get(value_)); + } else { + return ::std::get(value_) ? 
"true" : "false"; + } + } + std::optional constant_int() override { + if constexpr (is_int_()) { + return ::std::get(value_); + } else { + return std::nullopt; + } + } + std::optional constant_bool() override { + if constexpr (is_bool_()) { + return ::std::get(value_); + } else { + return std::nullopt; + } + } + bool is_constant() override { + return true; + } + bool is_symbolic() override { + return false; + } + + private: + ::std::variant value_; + + static constexpr bool is_int_() { + return ::std::is_same_v; + } + static constexpr bool is_bool_() { + return ::std::is_same_v; + } +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Contiguity.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Contiguity.h new file mode 100644 index 0000000000000000000000000000000000000000..014903df018c3db2b2df40ca72ee4cd40ebf21c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Contiguity.h @@ -0,0 +1,314 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +#include +#include + +namespace c10 { + +template +bool _compute_contiguous(ArrayRef sizes, ArrayRef strides, T numel) { + if (numel == 0) { + return true; + } + + T expected_stride = 1; + // NB: make sure we do signed arithmetic + for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) { + const auto& size_d = sizes[d]; + if (size_d == 1) { + continue; + } + + if (strides[d] != expected_stride) { + return false; + } + expected_stride *= size_d; + } + return true; +} + +// Return a SymBool with underlying symbolic expression that represents +// contiguity. 
Guaranteed not to throw DDE, may returns a symbolic expressions +// or symbolic True. +inline static c10::SymBool _compute_contiguous_sym( + ArrayRef sizes, + ArrayRef strides, + const c10::SymInt& numel) { + // If this return true, the tensor is contiguous indeed. Otherwise it could be + // either. + auto is_contiguous_or_false = [&]() { + if (TORCH_GUARD_OR_FALSE(sym_eq(numel, 0))) { + return true; + } + + // When calculating the expected stride, we can choose to multiply + // with max(1, size[d]) or size[d]. Regardless, this is ok for this + // function. Why? + // (1) If size[d] == 0, then the tensor is contiguous and if + // we return true or false it won't break this function. + // (2) If size[d] is not 0, then max(1,size[d]) and size[d] are equal. + // Therefore, if we choose to use max(1, size[d]) or size[d] to + // calculate the expected stride, the result is the same. + // + // We symbolically check both paths to maximize the cases where this + // function returns true. This is because make_contiguous_strides_for adds + // the max symbolically, and in some other situations the max might not be + // there. And we want to ensure we return true in both cases. + c10::SymInt expected_stride = 1; + c10::SymInt expected_stride_max = 1; + // NB: make sure we do signed arithmetic + for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) { + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected_stride)) && + TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected_stride_max))) { + return false; + } + expected_stride_max *= sizes[d].max(1); + expected_stride *= sizes[d]; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed to + // avoid symbolic evaluation perf issues. + if (is_contiguous_or_false()) { + return c10::SymBool(true); + } + + // Build a single expression that represents contiguity and return it. 
+ c10::SymBool is_empty = sym_eq(numel, 0); + c10::SymBool is_contiguous_cond = true; + + c10::SymInt expected_stride = 1; + for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) { + const auto& size_d = sizes[d]; + is_contiguous_cond = is_contiguous_cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride = expected_stride * size_d; + } + return is_contiguous_cond.sym_or(is_empty); +} + +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_2d_sym does not. Only use this function +// when inputs are hinted. +template +bool _compute_channels_last_contiguous_2d( + ArrayRef sizes, + ArrayRef strides) { + // Please don't combine these code, constant array is used here to let + // compiler fully unroll the loop to get better performance + switch (sizes.size()) { + case 4: { + T expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + if (size_d != 1) { + if (strides[d] != expected) { + return false; + } + expected *= size_d; + } + } + return true; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +// Return a SymBool with underlying symbolic expression that represents +// contiguity. Guaranteed not to throw DDE, may returns a symbolic expressions +// or symbolic True. +inline static c10::SymBool _compute_channels_last_contiguous_2d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 4: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. 
+ if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_3d_sym does not. Only use this function +// when inputs are hinted. 
+template +bool _compute_channels_last_contiguous_3d( + ArrayRef sizes, + ArrayRef strides) { + // Please don't combine these code, constant array is used here to let + // compiler fully unroll the loop to get better performance + switch (sizes.size()) { + case 5: { + T expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + if (size_d != 1) { + if (strides[d] != expected) { + return false; + } + expected *= size_d; + } + } + return true; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +inline static c10::SymBool _compute_channels_last_contiguous_3d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 5: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. 
+ c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + +template +bool _compute_non_overlapping_and_dense( + ArrayRef sizes, + ArrayRef strides) { + auto dim = sizes.size(); + if (dim == 1) { + return sizes[0] < 2 || strides[0] == 1; + } + SmallVector perm; + perm.resize(dim); + for (const auto i : c10::irange(dim)) { + perm[i] = i; + } + // Sort by strides, leaving 0 and 1 sized dims at the end of the array + std::sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) { + if (sizes[a] < 2) { + return false; + } else if (sizes[b] < 2) { + return true; + } + return strides[a] < strides[b]; + }); + T require_stride = 1; + for (const auto i : c10::irange(dim)) { + const auto& size_perm_i = sizes[perm[i]]; + if (size_perm_i < 2) { + return true; + } + if (strides[perm[i]] != require_stride) { + return false; + } + require_stride *= size_perm_i; + } + return true; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CopyBytes.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CopyBytes.h new file mode 100644 index 0000000000000000000000000000000000000000..bc2632794299da5a6c9c5d30be0b4591600bab2a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CopyBytes.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +using CopyBytesFunction = void (*)( + size_t nbytes, + const void* src, + Device src_device, + void* dst, + Device dst_device); + +struct C10_API _CopyBytesFunctionRegisterer { + _CopyBytesFunctionRegisterer( + DeviceType from, + DeviceType to, + CopyBytesFunction func_sync, + CopyBytesFunction func_async = nullptr); +}; + +#define REGISTER_COPY_BYTES_FUNCTION(from, to, ...) \ + namespace { \ + static _CopyBytesFunctionRegisterer C10_ANONYMOUS_VARIABLE( \ + g_copy_function)(from, to, __VA_ARGS__); \ + } + +/* + * WARNING: Implementations for this function are currently registered from + * ATen and caffe2, not yet from c10. Don't use this if not either ATen + * or caffe2 is present as well. + * We can't move them yet, because the CUDA implementations aren't unified yet + * between ATen and caffe2. + * We're planning to move the implementations into c10/backend/xxx + * to make c10 self contained again. + */ +C10_API void CopyBytes( + size_t nbytes, + const void* src, + Device src_device, + void* dst, + Device dst_device, + bool async); +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultDtype.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultDtype.h new file mode 100644 index 0000000000000000000000000000000000000000..240c173ca22ae28ab20e243890b2f8a054156fa5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultDtype.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace caffe2 { +class TypeMeta; +} // namespace caffe2 + +namespace c10 { +C10_API void set_default_dtype(caffe2::TypeMeta dtype); +C10_API const caffe2::TypeMeta get_default_dtype(); +C10_API ScalarType get_default_dtype_as_scalartype(); +C10_API const caffe2::TypeMeta get_default_complex_dtype(); +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultTensorOptions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultTensorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..8d5e66ec405ddeb1494d987a034cf1b945663667 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultTensorOptions.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +struct TensorOptions; + +/// Like TensorOptions, but all fields are guaranteed to be filled. 
+struct DefaultTensorOptions { + DefaultTensorOptions() = default; + + caffe2::TypeMeta dtype() const noexcept { + return dtype_; + } + Device device() const noexcept { + return device_; + } + Layout layout() const noexcept { + return layout_; + } + bool requires_grad() const noexcept { + return requires_grad_; + } + + // Defined in TensorOptions.h + inline DefaultTensorOptions& merge(const TensorOptions& options); + + private: + caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make(); // 64-bit + Device device_ = at::kCPU; // 32-bit + Layout layout_ = at::kStrided; // 8-bit + bool requires_grad_ = false; // 8-bit +}; + +inline const DefaultTensorOptions& getDefaultTensorOptions() { + static const auto options = DefaultTensorOptions(); + return options; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Device.h new file mode 100644 index 0000000000000000000000000000000000000000..d3380f434c6c8284476ac3bc662fd88e10289a86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Device.h @@ -0,0 +1,221 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10 { + +/// An index representing a specific device; e.g., the 1 in GPU 1. +/// A DeviceIndex is not independently meaningful without knowing +/// the DeviceType it is associated; try to use Device rather than +/// DeviceIndex directly. +using DeviceIndex = int8_t; + +/// Represents a compute device on which a tensor is located. 
A device is +/// uniquely identified by a type, which specifies the type of machine it is +/// (e.g. CPU or CUDA GPU), and a device index or ordinal, which identifies the +/// specific compute device when there is more than one of a certain type. The +/// device index is optional, and in its defaulted state represents (abstractly) +/// "the current device". Further, there are two constraints on the value of the +/// device index, if one is explicitly stored: +/// 1. A negative index represents the current device, a non-negative index +/// represents a specific, concrete device, +/// 2. When the device type is CPU, the device index must be zero. +struct C10_API Device final { + using Type = DeviceType; + + /// Constructs a new `Device` from a `DeviceType` and an optional device + /// index. + /* implicit */ Device(DeviceType type, DeviceIndex index = -1) + : type_(type), index_(index) { + validate(); + } + + /// Constructs a `Device` from a string description, for convenience. + /// The string supplied must follow the following schema: + /// `(cpu|cuda)[:]` + /// where `cpu` or `cuda` specifies the device type, and + /// `:` optionally specifies a device index. + /* implicit */ Device(const std::string& device_string); + + /// Returns true if the type and index of this `Device` matches that of + /// `other`. + bool operator==(const Device& other) const noexcept { + return this->type_ == other.type_ && this->index_ == other.index_; + } + + /// Returns true if the type or index of this `Device` differs from that of + /// `other`. + bool operator!=(const Device& other) const noexcept { + return !(*this == other); + } + + /// Sets the device index. + void set_index(DeviceIndex index) { + index_ = index; + } + + /// Returns the type of device this is. + DeviceType type() const noexcept { + return type_; + } + + /// Returns the optional index. + DeviceIndex index() const noexcept { + return index_; + } + + /// Returns true if the device has a non-default index. 
+ bool has_index() const noexcept { + return index_ != -1; + } + + /// Return true if the device is of CUDA type. + bool is_cuda() const noexcept { + return type_ == DeviceType::CUDA; + } + + /// Return true if the device is of PrivateUse1 type. + bool is_privateuseone() const noexcept { + return type_ == DeviceType::PrivateUse1; + } + + /// Return true if the device is of MPS type. + bool is_mps() const noexcept { + return type_ == DeviceType::MPS; + } + + /// Return true if the device is of HIP type. + bool is_hip() const noexcept { + return type_ == DeviceType::HIP; + } + + /// Return true if the device is of VE type. + bool is_ve() const noexcept { + return type_ == DeviceType::VE; + } + + /// Return true if the device is of XPU type. + bool is_xpu() const noexcept { + return type_ == DeviceType::XPU; + } + + /// Return true if the device is of IPU type. + bool is_ipu() const noexcept { + return type_ == DeviceType::IPU; + } + + /// Return true if the device is of XLA type. + bool is_xla() const noexcept { + return type_ == DeviceType::XLA; + } + + /// Return true if the device is of MTIA type. + bool is_mtia() const noexcept { + return type_ == DeviceType::MTIA; + } + + /// Return true if the device is of HPU type. + bool is_hpu() const noexcept { + return type_ == DeviceType::HPU; + } + + /// Return true if the device is of Lazy type. + bool is_lazy() const noexcept { + return type_ == DeviceType::Lazy; + } + + /// Return true if the device is of Vulkan type. + bool is_vulkan() const noexcept { + return type_ == DeviceType::Vulkan; + } + + /// Return true if the device is of Metal type. + bool is_metal() const noexcept { + return type_ == DeviceType::Metal; + } + + /// Return true if the device is of MAIA type. + bool is_maia() const noexcept { + return type_ == DeviceType::MAIA; + } + + /// Return true if the device is of META type. + bool is_meta() const noexcept { + return type_ == DeviceType::Meta; + } + + /// Return true if the device is of CPU type. 
+ bool is_cpu() const noexcept { + return type_ == DeviceType::CPU; + } + + /// Return true if the device supports arbitrary strides. + bool supports_as_strided() const noexcept { + return type_ != DeviceType::IPU && type_ != DeviceType::XLA && + type_ != DeviceType::Lazy; + } + + /// Same string as returned from operator<<. + std::string str() const; + + private: + DeviceType type_; + DeviceIndex index_ = -1; + void validate() { + // Removing these checks in release builds noticeably improves + // performance in micro-benchmarks. + // This is safe to do, because backends that use the DeviceIndex + // have a later check when we actually try to switch to that device. + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + index_ >= -1, + "Device index must be -1 or non-negative, got ", + static_cast(index_)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_cpu() || index_ <= 0, + "CPU device index must be -1 or zero, got ", + static_cast(index_)); + } +}; + +C10_API std::ostream& operator<<(std::ostream& stream, const Device& device); + +} // namespace c10 + +namespace std { +template <> +struct hash { + size_t operator()(c10::Device d) const noexcept { + // Are you here because this static assert failed? Make sure you ensure + // that the bitmasking code below is updated accordingly! + static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit"); + static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit"); + // Note [Hazard when concatenating signed integers] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // We must first convert to a same-sized unsigned type, before promoting to + // the result type, to prevent sign extension when any of the values is -1. + // If sign extension occurs, you'll clobber all of the values in the MSB + // half of the resulting integer. + // + // Technically, by C/C++ integer promotion rules, we only need one of the + // uint32_t casts to the result type, but we put in both for explicitness's + // sake. 
+ uint32_t bits = static_cast(static_cast(d.type())) + << 16 | + static_cast(static_cast(d.index())); + return std::hash{}(bits); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceArray.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceArray.h new file mode 100644 index 0000000000000000000000000000000000000000..b2b179b4d2d82385aefe1f1b79cb2069120500d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceArray.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include + +namespace c10 { + +template +class DeviceArray { + public: + DeviceArray(c10::Allocator& allocator, size_t size) + : data_ptr_(allocator.allocate(size * sizeof(T))) { + static_assert(std::is_trivial_v, "T must be a trivial type"); + TORCH_INTERNAL_ASSERT( + 0 == (reinterpret_cast(data_ptr_.get()) % alignof(T)), + "c10::DeviceArray: Allocated memory is not aligned for this data type"); + } + + T* get() { + return static_cast(data_ptr_.get()); + } + + private: + c10::DataPtr data_ptr_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceCapability.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceCapability.h new file mode 100644 index 0000000000000000000000000000000000000000..85477281261bed35e2652ddc471c9bae4042707a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceCapability.h @@ -0,0 +1,81 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +constexpr size_t NUMBER_OF_DEVICE_CAPABILITIES = NumScalarTypes; + +// Generate bitfields for each scalar type +#define DEFINE_SCALAR_TYPE(_1, n) unsigned int has_##n : 1; + +// Generate enum indices for each scalar type +#define DEFINE_SCALAR_ENUM(_1, name) kIndex_##name, + +enum ScalarTypeIndex { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_SCALAR_ENUM) +}; + +/** + * @brief DeviceCapability represents the the common capabilities that all + * devices should support. + * + * This struct provides a compact way to represent the common capabilities that + * all devices should support. Includes the following capabilities: + * - Supported data types + * + * Purpose + * - Enable device-specific optimizations based on supported capabilities + * + * Contract + * + * Supported data types: + * - Each bitfield represents support for one device capability + * - Bit value 1 means the capability is supported, 0 means not supported + * - The struct is initialized with all capabilities enabled by default + * + * @note Adding New Capabilities + * + * 1. Define the new capability in the `DeviceCapability` struct + * 2. Update the support of the new capability in each accelerator + * implementation + * 3. 
Add the new capability to the returned PyObject Dictionary + */ +struct C10_API DeviceCapability { + union { + struct { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_SCALAR_TYPE) + } supported_scalar_types; + uint64_t capability_bits; // Allow direct bit manipulation + } capability_data; + + // Default constructor with all capabilities enabled. + DeviceCapability() { + capability_data.capability_bits = + ((1ULL << NUMBER_OF_DEVICE_CAPABILITIES) - 1); + } + + // Iterate supported ScalarTypes without allocating a vector + template + void forEachSupportedScalarType(F&& visitor) const { +#define VISIT_SCALAR_TYPE(_1, n) \ + if (capability_data.supported_scalar_types.has_##n) { \ + visitor(ScalarType::n); \ + } + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(VISIT_SCALAR_TYPE) + +#undef VISIT_SCALAR_TYPE + } +}; + +#undef DEFINE_SCALAR_ENUM +#undef DEFINE_SCALAR_TYPE +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..389ac29d10029d915279857f4fb4e2ffeb880307 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceGuard.h @@ -0,0 +1,207 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +/// RAII guard that sets a certain default device in its constructor, and +/// changes it back to the device that was originally active upon destruction. +/// +/// The device is always reset to the one that was active at the time of +/// construction of the guard. 
Even if you `set_device` after construction, the +/// destructor will still reset the device to the one that was active at +/// construction time. +/// +/// This device guard does NOT have an uninitialized state; it is guaranteed +/// to reset a device on exit. If you are in a situation where you *might* +/// want to setup a guard (i.e., are looking for the moral equivalent +/// of std::optional), see OptionalDeviceGuard. +class DeviceGuard { + public: + /// No default constructor; see Note [Omitted default constructor from RAII] + explicit DeviceGuard() = delete; + + /// Set the current device to the passed Device. + explicit DeviceGuard(Device device) : guard_(device) {} + + /// This constructor is for testing only. + explicit DeviceGuard( + Device device, + const impl::DeviceGuardImplInterface* impl) + : guard_(device, impl) {} + + ~DeviceGuard() = default; + + /// Copy is disallowed + DeviceGuard(const DeviceGuard&) = delete; + DeviceGuard& operator=(const DeviceGuard&) = delete; + + /// Move is disallowed, as DeviceGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + DeviceGuard(DeviceGuard&& other) = delete; + DeviceGuard& operator=(DeviceGuard&& other) = delete; + + /// Sets the device to the given one. The specified device must be consistent + /// with the device type originally specified during guard construction. + /// + /// TODO: The consistency check here is inconsistent with StreamGuard's + /// behavior with set_stream, where a stream on a different device than + /// the original one isn't an error; we just reset the stream and then + /// switch devices. + void reset_device(at::Device device) { + guard_.reset_device(device); + } + + /// This method is for testing only. + void reset_device( + at::Device device, + const impl::DeviceGuardImplInterface* impl) { + guard_.reset_device(device, impl); + } + + /// Sets the device index to the given one. 
The device type is inferred + /// from the original device type the guard was constructed with. + void set_index(DeviceIndex index) { + guard_.set_index(index); + } + + /// Returns the device that was set at the time the guard was constructed. + Device original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device. + Device current_device() const { + return guard_.current_device(); + } + + private: + impl::InlineDeviceGuard guard_; +}; + +/** + * A OptionalDeviceGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * Morally, a OptionalDeviceGuard is equivalent to std::optional, + * but with extra constructors and methods as appropriate. + * + * Besides its obvious use (optionally applying a DeviceGuard), + * OptionalDeviceGuard is often also used for the following idiom: + * + * OptionalDeviceGuard g; + * for (const auto& t : tensors) { + * g.set_device(t.device()); + * do_something_with(t); + * } + * + * This usage is marginally more efficient than constructing a DeviceGuard every + * iteration of the for loop, as it avoids an unnecessary device reset. + * + * Unlike DeviceGuard, a OptionalDeviceGuard may be uninitialized. This occurs + * when you use the nullary constructor, or pass a nullopt to the constructor. + * Uninitialized OptionalDeviceGuards do *nothing*; they do not know what the + * original device was and they do not reset on destruction. This is why + * original_device() and current_device() return std::optional rather + * than Device (as they do in DeviceGuard), and also is why we didn't just + * provide OptionalDeviceGuard by default and hide DeviceGuard from users. + * + * The semantics of an OptionalDeviceGuard are exactly explained by thinking + * of it as an std::optional. 
In particular, an initialized + * OptionalDeviceGuard doesn't restore device to its value at construction; it + * restores device to its value *at initialization*. So if you have the + * program: + * + * setDevice(1); + * OptionalDeviceGuard g; + * setDevice(2); + * g.reset_device(Device(DeviceType::CUDA, 3)); // initializes! + * + * On destruction, g will reset device to 2, rather than 1. + * + * An uninitialized OptionalDeviceGuard is distinct from a (initialized) + * DeviceGuard whose original_device_ and current_device_ match, since the + * DeviceGuard will still reset the device to original_device_. + */ +class OptionalDeviceGuard { + public: + /// Create an uninitialized guard. Set the guard later using reset_device. + explicit OptionalDeviceGuard() = default; + + /// Initialize the guard, setting the current device to the passed Device. + explicit OptionalDeviceGuard(Device device) : guard_(device) {} + + /// Initialize the guard if a Device is passed; otherwise leave the + /// guard uninitialized. + explicit OptionalDeviceGuard(std::optional device) : guard_(device) {} + + /// Constructor for testing only. + explicit OptionalDeviceGuard( + Device device, + const impl::DeviceGuardImplInterface* impl) + : guard_(device, impl) {} + + ~OptionalDeviceGuard() = default; + /// Copy is disallowed + OptionalDeviceGuard(const OptionalDeviceGuard&) = delete; + OptionalDeviceGuard& operator=(const OptionalDeviceGuard&) = delete; + + /// Move is disallowed + /// See Note [Explicit initialization of optional fields] + /// and // Note [Move construction for RAII guards is tricky] + /// for rationale. + OptionalDeviceGuard(OptionalDeviceGuard&& other) = delete; + OptionalDeviceGuard& operator=(OptionalDeviceGuard&& other) = delete; + + /// Sets the device to the given one. The specified device must be consistent + /// with the device type originally specified during guard construction. 
+ void reset_device(at::Device device) { + guard_.reset_device(device); + } + + /// For testing only + void reset_device( + at::Device device, + const impl::DeviceGuardImplInterface* impl) { + guard_.reset_device(device, impl); + } + + /// Returns the device that was set at the time the guard was constructed. + std::optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via reset_device. + std::optional current_device() const { + return guard_.current_device(); + } + + private: + impl::InlineOptionalDeviceGuard guard_; +}; + +// Note [Whither the DeviceGuard boilerplate] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Design note: in principle, we could avoid these wrappers using: +// +// using DeviceGuard = impl::InlineDeviceGuard; +// using OptionalDeviceGuard = +// impl::InlineOptionalDeviceGuard; +// +// But the error messages are worse, and our users can't just look at the +// header file to find out what's going on. Furthermore, for specializations +// like CUDAStreamGuard, it can be profitable to replace some interfaces with +// refined types (e.g., return CUDAStream instead of Stream). So, we eat +// the boilerplate and write out the API explicitly. + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceType.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceType.h new file mode 100644 index 0000000000000000000000000000000000000000..3847b5e2650e4100d19dc0031747769f709b92f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceType.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// If you modified DeviceType in caffe2/proto/caffe2.proto, please also sync +// your changes into torch/headeronly/core/DeviceType.h. +#include + +#include +#include + +namespace c10 { + +C10_API std::string DeviceTypeName(DeviceType d, bool lower_case = false); + +C10_API bool isValidDeviceType(DeviceType d); + +C10_API std::ostream& operator<<(std::ostream& stream, DeviceType type); + +C10_API void register_privateuse1_backend(const std::string& backend_name); +C10_API std::string get_privateuse1_backend(bool lower_case = true); + +C10_API bool is_privateuse1_backend_registered(); + +} // namespace c10 + +namespace torch { +// NOLINTNEXTLINE(misc-unused-using-decls) +using c10::DeviceType; +} // namespace torch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKey.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKey.h new file mode 100644 index 0000000000000000000000000000000000000000..2aa647574ccbc1112d10a5558255d9a5b625a9b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKey.h @@ -0,0 +1,750 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Semantically, each value of BackendComponent identifies a "backend" for our +// dispatch. Some functionalities that we may dispatch to are allowed to +// register different handlers for each backend. The BackendComponent is then +// used to figure out which backend implementation to dispatch to. + +// In implementation terms, the backend component identifies a specific "bit" in +// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom +// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to +// functionalities. When we encounter a functionality bit that is known to be +// customizable per-backend, then we also look at the lower BackendComponent +// bits and take the highest bit to determine which backend's implementation to +// use. + +// WARNING! If you add a new backend component to the end of this list, +// make sure you register it before Meta. +// Meta must be at the end so that meta key in tls triggers meta kernels. 
+// (But you shouldn't: private use keys should have higher precedence than all +// built-in keys) + +// If you add a new (non-privateuse) backend here, +// make sure to add an Autograd fallthrough kernel +// in aten/src/ATen/core/VariableFallbackKernel.cpp + +#define C10_FORALL_BACKEND_COMPONENTS(_, extra) \ + _(CPU, extra) \ + _(CUDA, extra) \ + _(HIP, extra) \ + _(XLA, extra) \ + _(MPS, extra) \ + _(IPU, extra) \ + _(XPU, extra) \ + _(HPU, extra) \ + _(VE, extra) \ + _(Lazy, extra) \ + _(MTIA, extra) \ + _(MAIA, extra) \ + _(PrivateUse1, extra) \ + _(PrivateUse2, extra) \ + _(PrivateUse3, extra) \ + _(Meta, extra) + +// WARNING! If we add a new per-backend functionality key that has higher +// priority than Autograd, then make sure you update EndOfRuntimeBackendKeys + +#define C10_FORALL_FUNCTIONALITY_KEYS(_) \ + _(Dense, ) \ + _(Quantized, Quantized) \ + _(Sparse, Sparse) \ + _(SparseCsr, SparseCsr) \ + _(NestedTensor, NestedTensor) \ + _(AutogradFunctionality, Autograd) + +enum class BackendComponent : uint8_t { + + // A "backend" is colloquially used to refer to handlers for dispatch + // which actually implement the numerics of an operation in question. + // + // Due to the nature of the enum, these backends are specified in + // an ordered way, but for most backends this order is not semantically + // meaningful (e.g., it's valid to reorder these backends without changing + // semantics). The only situation when backend ordering is meaningful + // is when the backend participates in multiple dispatch with another + // backend; e.g., CPU and CUDA (cuda must have higher priority). + + // These keys don't correspond to individual kernels. + // Instead, they represent the backends that are allowed to override specific + // pieces of functionality: + // - dense kernels (e.g. DispatchKey::CPU) + // - sparse kernels (e.g. DispatchKey::SparseCPU) + // - quantized kernels (e.g. DispatchKey::QuantizedCPU) + // - autograd kernels (e.g. 
DispatchKey::AutogradCPU) + // We reserve space in the runtime operator table for this full cross product + // of + // [backends in this enum] x [keys below that are explicitly marked as having + // per-backend functionality] + // + // A meta tensor is a tensor without any data associated with it. (They + // have also colloquially been referred to as tensors on the "null" device). + // A meta tensor can be used to dry run operators without actually doing any + // computation, e.g., add on two meta tensors would give you another meta + // tensor with the output shape and dtype, but wouldn't actually add anything. + + InvalidBit = 0, +#define DEFINE_BACKEND_COMPONENT(n, _) n##Bit, + C10_FORALL_BACKEND_COMPONENTS(DEFINE_BACKEND_COMPONENT, unused) +#undef DEFINE_BACKEND_COMPONENT + + // Define an alias to represent end of backend dispatch keys. + // If you add new backend keys after PrivateUse3, please also update it here. + EndOfBackendKeys = MetaBit, +}; + +// Semantically, a dispatch key identifies a possible "level" in our +// dispatch, for which a handler may be registered. Each handler corresponds +// to a type of functionality. +// +// In implementation terms, the dispatch key identifies a specific "bit" in a +// DispatchKeySet. Higher bit indexes get handled by dispatching first (because +// we "count leading zeros" when we extract the highest priority dispatch +// key.) +// +// Note [DispatchKey Classification] +// This enum actually contains several types of keys, which are explained +// in more detail further down: +// (1) non-customizable backends (e.g. FPGA) +// (2) non-customizable functionalities (e.g. Functionalize) +// (3) functionalized that are customizable per backend (e.g. Dense, Sparse, +// AutogradFunctionality) (4) per-backend instances of customizable +// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g. 
+// CompositeImplicitAutograd) +// +// Of the categories above, it's important to note: +// (a) which keys are assigned individual bits in a DispatchKeySet +// (b) which keys are assigned individual slots in the runtime operator table +// ("Runtime keys") +// +// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. +// (1), (2) and (4) all get their own dedicated slots in the runtime operator +// table. + +// See Note [DispatchKeySet Internal Representation] for more details. +// +// NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py +enum class DispatchKey : uint16_t { + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // This is not a "real" functionality, but it exists to give us a "nullopt" + // element we can return for cases when a DispatchKeySet contains no elements. + // You can think a more semantically accurate definition of DispatchKey is: + // + // using DispatchKey = std::optional + // + // and Undefined == nullopt. We didn't actually represent + // it this way because std::optional would take two + // words, when DispatchKey fits in sixteen bits. + + Undefined = 0, + + // Define an alias for Undefined to represent CatchAll (long term + // this will get eliminated, but for now it's convenient) + CatchAll = Undefined, + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // + // Every value in the enum (up to EndOfFunctionalityKeys) + // corresponds to an individual "functionality" that can be dispatched to. + // This is represented in the DispatchKeySet by assigning each of these enum + // values + // to each of the remaining (64 - len(BackendComponent)) bits. + // + // Most of these functionalities have a single handler assigned to them, + // making them "runtime keys". + // That map to a single slot in the runtime operator table. + // + // A few functionalities are allowed to be customizable per backend.
+ // See [Note: Per-Backend Functionality Dispatch Keys] for details. + + // See [Note: Per-Backend Functionality Dispatch Keys] + Dense, + + // Below are non-extensible backends. + // These are backends that currently don't have their own overrides for + // Autograd/Sparse/Quantized kernels, + // and we therefore don't waste space in the runtime operator table allocating + // space for them. + // If any of these backends ever need to customize, e.g., Autograd, then we'll + // need to add a DispatchKey::*Bit for them. + + // TODO: put this in BackendComponents + FPGA, // Xilinx support lives out of tree at + // https://gitlab.com/pytorch-complex/vitis_kernels + + Vulkan, // TODO: put this in BackendComponents + Metal, // TODO: put this in BackendComponents + + // See [Note: Per-Backend Functionality Dispatch Keys] + Quantized, + + // This backend is to support custom RNGs; it lets you go + // to a different kernel if you pass in a generator that is not a + // traditional CPUGeneratorImpl/CUDAGeneratorImpl. To make use of this + // key: + // 1) set it as a second parameter of at::Generator constructor call in + // the user-defined PRNG class. + // 2) use it as a dispatch key while registering custom kernels + // (templatized kernels specialized for user-defined PRNG class) + // intended for out of tree use; tested by aten/src/ATen/test/rng_test.cpp + CustomRNGKeyId, + + // TODO: Make Mkldnn a functionality key, so we can give it Meta + // support + // Here are backends which specify more specialized operators + // based on the layout of the tensor. Note that the sparse backends + // are one case where ordering matters: sparse multi-dispatches with + // the corresponding dense tensors, and must be handled before them. 
+ MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp + // NB: not to be confused with MKLDNN, which is Caffe2 only + + // See [Note: Per-Backend Functionality Dispatch Keys] + Sparse, + + SparseCsr, + + NestedTensor, + + // In some situations, it is not immediately obvious what the correct + // backend for function is, because the function in question doesn't + // have any "tensor" arguments. In this case, a BackendSelect function + // can be registered to implement the custom determination of the + // correct backend. + BackendSelect, + + Python, + + // Out-of-core key for Fake Tensor in torchdistx. + // See https://pytorch.org/torchdistx/latest/fake_tensor.html + // TODO: delete this in favor of Python-implemented fake tensor + Fake, + // See Note [Out-of-tree vmap+grad prototype]. The purpose of this key + // is to insert code after the "autograd subsystem" runs, so this key should + // be directly after ADInplaceOrView and all of the autograd keys. + FuncTorchDynamicLayerBackMode, + + // Alias and mutation removal. + // If some backends want to opt into only alias removal or only mutation + // removal, + // we can consider adding separate keys dedicated to those individual passes. + // See Note [Functionalization Pass In Core] for details. + Functionalize, + + // The named dispatch key is set for any tensors with named dimensions. + // Although we have a dispatch key for named tensors, for historical reasons, + // this dispatch key doesn't do any of the substantive functionality for named + // tensor (though, hypothetically, it could!) At the moment, it's just + // responsible for letting us give good error messages when operations + // don't support named tensors. 
+ // + // NB: If you ever consider moving named tensor functionality into + // this dispatch key, note that it might be necessary add another dispatch + // key that triggers before composite operators, in case a composite operator + // has named dimension propagation that doesn't match that of its + // constituent parts. + // TODO: delete this once torchdim lands in functorch + Named, + + // The Conjugate dispatch key is set for any tensors that need to perform + // conjugation + // This is implemented at a dispatch level right before any backends run + Conjugate, + + // The Negative dispatch key is set for any tensors that need to perform + // negation + // This is implemented at a dispatch level right before any backends run + Negative, + + ZeroTensor, // registered at build/aten/src/ATen/RegisterZeroTensor.cpp + + // Note [ADInplaceOrView key] + // ADInplaceOrView key is used by inplace or view ops to register a kernel + // that does additional setup for future autograd computation. + // + // 1. For inplace ops this kernel does version bump + // 2. For view ops this kernel does `as_view` setup where we properly setup + // DifferentiableViewMeta on the view tensors. + // + // For other ops it's fallthrough kernel since there's no extra + // work to do. + // + // Note [Dream: skip VariableType kernel when requires_grad=false] + // + // In an ideal world where we can skip VariableType kernel for inputs + // with requires_grad=false, instead of a fallthrough kernel, we'll + // register a kernel shown below to all functional ops as well: + // torch::Tensor my_functional_op(...) { + // { + // // Note for every op in VariableType, you need to go through + // // `AutoDispatchBelowADInplaceOrView` guard exactly once to add the + // // key to TLS excluded set. If you don't go through it at all, + // // inplace/view ops called through `at::` inside your backend + // // kernel will dispatch to ADInplaceOrView kernels and do a lot + // // of extra work. 
+ // at::AutoDispatchBelowADInplaceOrView guard; + // at::redispatch::my_functional_op(...); + // } + // } + // But this work is currently blocked since it adds an extra dispatch + // for all ops and it's non-trivial overhead at model level(a few percents). + // Thus our current approach takes advantage of the fact every kernel go + // through VariableType kernel first and pulls the + // `at::AutoDispatchBelowADInplaceOrView` guard of functional ops + // up to the `VariableType` kernel. Thus we only add the extra dispatch + // to view/inplace ops to minimize its perf impact to real models. + ADInplaceOrView, + // Note [Alias Dispatch Key : Autograd] + // All backends are oblivious to autograd; autograd is handled as a + // layer which happens on top of all backends. It inspects the autograd + // metadata of all inputs, determines what autograd metadata should be + // constructed by the output, and otherwise defers to the backend to + // actually do the numeric computation. Autograd contains + // the bulk of this logic. + + // Autograd is now an alias dispatch key which by default maps to all + // backend-specific autograd keys. + // Backend-specific allow backends to override the default kernel registered + // to Autograd key as needed. + // For example, XLA wants to define autograd for einsum directly. + // Registering a custom autograd implementation at the XLA key won't work + // because we process Autograd before XLA. This key has higher priority and + // gets processed first. You generally should NOT redispatch after handling + // autograd here (since that would result in execution of the Autograd + // operator, which you're trying to skip). In AutogradXLA implementations, + // you are responsible for handling autograd yourself, or deferring to other + // operators which support autograd. + + // Currently we only have backend-specific autograd keys for CPU/CUDA/XLA and + // reserved user-defined backends. 
All other in-tree backends share the + // AutogradOther key. We can add specific autograd key for those backends + // upon request. + AutogradOther, + + // See [Note: Per-Backend Functionality Dispatch Keys] + AutogradFunctionality, + + // NestedTensor is an example of something that isn't a "real backend" + // (because it mostly consists of redispatching kernels) + // but it would like to override autograd functionality in C++. + // We can handle cases like this by adding an extra functionality key + // exclusively for handling autograd for NestedTensor. + // lives out of tree at + // https://github.com/pytorch/nestedtensor + AutogradNestedTensor, + + Tracer, + + // TODO: make Autocast a functionality key + // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed + // and inputs are saved for backward in the post-autocast type. + AutocastCPU, + AutocastMTIA, + AutocastMAIA, + AutocastXPU, + AutocastIPU, + AutocastHPU, + AutocastXLA, + // AutocastXLA is only being used for TPUs. XLA GPUs continue to use + // AutocastCUDA. + AutocastMPS, + AutocastCUDA, + AutocastPrivateUse1, + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // There are a number of alternative modes which may want to handle before + // autograd; for example, error checking, tracing, profiling or vmap. They + // go here. + + FuncTorchBatched, // See Note [Out-of-tree vmap+grad prototype] + + // Dispatch key for BatchedTensorImpl wrapping a nested tensor. + BatchedNestedTensor, + + FuncTorchVmapMode, // See Note [Out-of-tree vmap+grad prototype] + + // This is the dispatch key for BatchedTensorImpl, which is used to implement + // batching rules for vmap. + Batched, + + // When we are inside a vmap, all tensors dispatch on this key. + // See Note: [DispatchKey::VmapMode usage] for more details. + VmapMode, + + FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype] + + // Out-of-core key for Deferred Module Initialization in torchdistx. 
+ // See https://pytorch.org/torchdistx/latest/deferred_init.html + DeferredInit, + + // Used by Python key logic to know the set of tls on entry to the dispatcher + // This kernel assumes it is the top-most non-functorch-related DispatchKey. + // If you add a key above, make sure to update the fallback implementation for + // this. + PythonTLSSnapshot, + + // This key should be at the very top of the dispatcher + FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] + + // TESTING: This is intended to be a generic testing tensor type id. + // Don't use it for anything real; its only acceptable use is within a single + // process test. Use it by creating a TensorImpl with this DispatchKey, and + // then registering operators to operate on this type id. See + // aten/src/ATen/core/dispatch/backend_fallback_test.cpp for a usage example. + TESTING_ONLY_GenericWrapper, + + // TESTING: This is intended to be a generic testing tensor type id. + // Don't use it for anything real; its only acceptable use is within a single + // process test. Use it by toggling the mode on and off via + // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators + // to operate on this type id. See + // aten/src/ATen/core/dispatch/backend_fallback_test.cpp + // for a usage example + TESTING_ONLY_GenericMode, + + // This key is used for pre-dispatch tracing in make_fx. + // It has lower priority than the PythonDispatcher key + // because we use the PythonDispatcher to intercept the key from python, + // and avoid having to implement it in C++. + PreDispatch, + + // This is a bypass that allows you to skip running the C++ dispatcher + // entirely + PythonDispatcher, + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + EndOfFunctionalityKeys, // End of functionality keys.
+ +// ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // +// Here are backends which you think of as traditionally specifying +// how to implement operations on some device. + +#define DEFINE_PER_BACKEND_KEYS_FOR_BACKEND(n, prefix) prefix##n, + +#define DEFINE_PER_BACKEND_KEYS(fullname, prefix) \ + StartOf##fullname##Backends, \ + C10_FORALL_BACKEND_COMPONENTS( \ + DEFINE_PER_BACKEND_KEYS_FOR_BACKEND, prefix) \ + EndOf##fullname##Backends = prefix##Meta, + + C10_FORALL_FUNCTIONALITY_KEYS(DEFINE_PER_BACKEND_KEYS) + +#undef DEFINE_PER_BACKEND_KEYS +#undef DEFINE_PER_BACKEND_KEYS_FOR_BACKEND + + EndOfRuntimeBackendKeys = EndOfAutogradFunctionalityBackends, + + // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // Note [Alias Dispatch Keys] + // Alias dispatch keys are synthetic dispatch keys which map to multiple + // runtime dispatch keys. Alias keys have precedence, but they are always + // lower precedence than runtime keys. You can register a kernel to an + // alias key, the kernel might be populated to the mapped runtime keys + // during dispatch table computation. + // If a runtime dispatch key has multiple kernels from alias keys, which + // kernel wins is done based on the precedence of alias keys (but runtime + // keys always have precedence over alias keys). + // Alias keys won't be directly called during runtime.
+ + // See Note [Alias Dispatch Key : Autograd] + Autograd, + CompositeImplicitAutograd, // registered at + // build/aten/src/ATen/RegisterCompositeImplicitAutograd.cpp + + // Note: The alias keyset for FuncTorchBatchedDecomposition is disjoint from + // all + // other alias keysets + // and so precedence order doesn't matter + FuncTorchBatchedDecomposition, // registered at + // build/aten/src/ATen/RegisterFuncTorchBatchedDecomposition.cpp + // Note: The alias keyset for CompositeImplicitAutogradNestedTensor is + // disjoint from all other alias keysets + CompositeImplicitAutogradNestedTensor, // registered at + // build/aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp + CompositeExplicitAutograd, // registered at + // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp + // See Note [CompositeExplicitAutogradNonFunctional Key] + CompositeExplicitAutogradNonFunctional, // registered at + // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp + + // Define an alias key to represent end of alias dispatch keys. + // If you add new alias keys after Autograd, please also update it here. + StartOfAliasKeys = Autograd, + EndOfAliasKeys = CompositeExplicitAutogradNonFunctional, // + + // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // The aliases exist for backwards compatibility reasons, they shouldn't + // be used + CPUTensorId = CPU, + CUDATensorId = CUDA, + DefaultBackend = CompositeExplicitAutograd, + PrivateUse1_PreAutograd = AutogradPrivateUse1, + PrivateUse2_PreAutograd = AutogradPrivateUse2, + PrivateUse3_PreAutograd = AutogradPrivateUse3, + Autocast = AutocastCUDA, +}; + +// Note [Private use DispatchKey] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Private use tensor IDs are preallocated tensor type IDs for use in user +// applications. 
Similar to private use fields in HTTP, they can be used +// by end users for experimental or private applications, without needing +// to "standardize" the tensor ID (which would be done by submitting a PR +// to PyTorch to add your type ID). +// +// Private use tensor IDs are appropriate to use if you want to experiment +// with adding a new tensor type (without having to patch PyTorch first) or +// have a private, non-distributed application that needs to make use of a +// new tensor type. Private use tensor IDs are NOT appropriate to use for +// libraries intended to be distributed to further users: please contact +// the PyTorch developers to get a type ID registered in this case. +// +// We provide two classes of private user tensor id: regular DispatchKeys +// and Autograd DispatchKeys. DispatchKeys serve the role of ordinary "backend" +// DispatchKeys; if you were adding support for a new type of accelerator, you +// would use a backend DispatchKey, and ideally automatically reuse +// AutogradOther definitions already defined in PyTorch. AutogradPrivateUse +// DispatchKeys serve as "wrapper" DispatchKeys: they are only necessary for +// tensors that compose multiple internal tensors, and for cases when the +// built-in autograd formulas for operators are not appropriate. + +static_assert( + (static_cast(BackendComponent::EndOfBackendKeys) + + static_cast(DispatchKey::EndOfFunctionalityKeys)) <= 64, + "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)" + " both map to backend and functionality bits" + " into a 64-bit bitmask; you must have less than 64 total entries between them"); + +// Check if a DispatchKey is an alias mapping to other runtime keys. 
+constexpr bool isAliasDispatchKey(DispatchKey k) { + return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; +} + +// [Note: Per-Backend Functionality Dispatch Keys] +// Check if a DispatchKey is a per-backend functionality key +// Any functionalities that can be customized per-backend should be added here. +// These keys correspond to functionalities that can be customized individually +// per backend. While they only take up one bit in the `DispatchKeySet` bitset, +// they map to (# backends) slots in the operator table. +// Each of these keys also has a separate set of "runtime keys" in the dispatch +// key enum, per backend, which *do* map to the individual operator table slots. +// For example, the "Sparse" key maps to an individual bit in the +// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual +// slots in the runtime operator table. + +constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { + if (k == DispatchKey::Dense || k == DispatchKey::Quantized || + k == DispatchKey::Sparse || k == DispatchKey::SparseCsr || + k == DispatchKey::AutogradFunctionality || + k == DispatchKey::NestedTensor) { + return true; + } else { + return false; + } +} + +// Note that this includes Undefined in the total count. +// BUT EndOfFunctionalityKeys is its own (placeholder) key. +// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. +// In the above example, there are 3 total functionality keys. +constexpr uint8_t num_functionality_keys = + static_cast(DispatchKey::EndOfFunctionalityKeys); + +constexpr uint8_t num_backends = + static_cast(BackendComponent::EndOfBackendKeys); + +// Note [No More Than 16 Backends] +// Search for this note to find places in the code where the "no more than 16 +// backends" invariant is baked in. +static_assert( + static_cast(BackendComponent::EndOfBackendKeys) <= 16, + "BackendComponent currently only supports <= 16 backends. 
If we really need to extend this, \ +there are a few places where this invariant is baked in"); + +constexpr uint8_t numPerBackendFunctionalityKeys() { + uint8_t count = 0; + for (uint8_t k = 0; k <= num_functionality_keys; ++k) { + if (isPerBackendFunctionalityKey(static_cast(k))) + ++count; + } + return count; +} + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) +// See [Note: Trimmed Mobile Dispatch Keys] +constexpr uint16_t num_runtime_entries = 8; +#else +constexpr uint16_t num_runtime_entries = num_functionality_keys + + (numPerBackendFunctionalityKeys() * (num_backends - 1)); +#endif + +// See Note [No More Than 16 Backends] +constexpr uint16_t full_backend_mask = + (static_cast(1) << num_backends) - 1; + +C10_API const char* toString(DispatchKey /*t*/); +C10_API const char* toString(BackendComponent /*t*/); +C10_API std::ostream& operator<<(std::ostream& /*str*/, DispatchKey /*rhs*/); +C10_API std::ostream& operator<<( + std::ostream& /*str*/, + BackendComponent /*rhs*/); + +C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); + +// Parses a string into a dispatch key. +// If the string cannot be correctly parsed, throws an exception. +C10_API c10::DispatchKey parseDispatchKey(const std::string& k); + +// These are some convenience identifiers for dispatch keys which are +// shorter to type than their long counterparts. Note that some of these +// dispatch keys directly correspond to DeviceType; and most APIs that +// accept DispatchKey also accept DeviceType; e.g., +// torch::dispatch(torch::kCPU, ...) is also valid. +constexpr DispatchKey kAutograd = DispatchKey::Autograd; + +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. 
+constexpr BackendComponent toBackendComponent(DispatchKey k) { + if (k >= DispatchKey::StartOfDenseBackends && + k <= DispatchKey::EndOfDenseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfDenseBackends)); + } else if ( + k >= DispatchKey::StartOfQuantizedBackends && + k <= DispatchKey::EndOfQuantizedBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfQuantizedBackends)); + } else if ( + k >= DispatchKey::StartOfSparseBackends && + k <= DispatchKey::EndOfSparseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseBackends)); + } else if ( + k >= DispatchKey::StartOfSparseCsrBackends && + k <= DispatchKey::EndOfSparseCsrBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseCsrBackends)); + } else if ( + k >= DispatchKey::StartOfNestedTensorBackends && + k <= DispatchKey::EndOfNestedTensorBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfNestedTensorBackends)); + } else if ( + k >= DispatchKey::StartOfAutogradFunctionalityBackends && + k <= DispatchKey::EndOfAutogradFunctionalityBackends) { + return static_cast( + static_cast(k) - + static_cast( + DispatchKey::StartOfAutogradFunctionalityBackends)); + } else { + return BackendComponent::InvalidBit; + } +} + +constexpr DispatchKey toFunctionalityKey(DispatchKey k) { + if (k <= DispatchKey::EndOfFunctionalityKeys) { + return k; + } else if (k <= DispatchKey::EndOfDenseBackends) { + return DispatchKey::Dense; + } else if (k <= DispatchKey::EndOfQuantizedBackends) { + return DispatchKey::Quantized; + } else if (k <= DispatchKey::EndOfSparseBackends) { + return DispatchKey::Sparse; + } else if (k <= DispatchKey::EndOfSparseCsrBackends) { + return DispatchKey::SparseCsr; + } else if (k <= DispatchKey::EndOfNestedTensorBackends) { + return DispatchKey::NestedTensor; + } else if (k <= DispatchKey::EndOfAutogradFunctionalityBackends) { + 
return DispatchKey::AutogradFunctionality; + } else { + return DispatchKey::Undefined; + } +} + +BackendComponent toBackendComponent(DeviceType device_type); + +// Given (DispatchKey::Dense, BackendComponent::CUDABit), returns +// DispatchKey::CUDA. +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. +constexpr DispatchKey toRuntimePerBackendFunctionalityKey( + DispatchKey functionality_k, + BackendComponent backend_k) { + if (functionality_k == DispatchKey::Dense) { + return static_cast( + static_cast(DispatchKey::StartOfDenseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Sparse) { + return static_cast( + static_cast(DispatchKey::StartOfSparseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::SparseCsr) { + return static_cast( + static_cast(DispatchKey::StartOfSparseCsrBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Quantized) { + return static_cast( + static_cast(DispatchKey::StartOfQuantizedBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::NestedTensor) { + return static_cast( + static_cast(DispatchKey::StartOfNestedTensorBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::AutogradFunctionality) { + return static_cast( + static_cast( + DispatchKey::StartOfAutogradFunctionalityBackends) + + static_cast(backend_k)); + } + return DispatchKey::Undefined; +} + +} // namespace c10 + +namespace torch { +// Expose the constant, but not the TYPE (DispatchKey is an implementation +// detail!) 
+// NOLINTNEXTLINE(misc-unused-using-decls) +using c10::kAutograd; +} // namespace torch + +// NB: You really shouldn't use this instance; this enum is guaranteed +// to be pretty small so a regular array should be acceptable. +namespace std { +template <> +struct hash { + typedef size_t result_type; + typedef c10::DispatchKey argument_type; + + size_t operator()(c10::DispatchKey x) const { + return static_cast(x); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKeySet.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKeySet.h new file mode 100644 index 0000000000000000000000000000000000000000..ec3aff4e0c2295b2490cd29d30aa1117e6bb0441 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKeySet.h @@ -0,0 +1,977 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + +namespace c10 { + +struct FunctionalityOffsetAndMask { + // empty constructor shouldn't be used; only needed to initialize + // the array before populating it. + FunctionalityOffsetAndMask() = default; + FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask) + : offset(offset), mask(mask) {} + // This needs to big enough to cover the size of the operator table. + uint16_t offset{}; + // See Note [No More Than 16 Backends] + // This mask needs to be big enough to mask all of the backend bits. + // We probably don't ever want to have more than 16 backend bits, so uint16_t + // should be enough. 
+ uint16_t mask{}; +}; +static_assert( + c10::num_runtime_entries < 65536, + "The dispatcher currently only supports up to 2^16 runtime entries"); + +C10_API std::array +initializeFunctionalityOffsetsAndMasks(); + +C10_ALWAYS_INLINE static const std:: + array& + offsetsAndMasks() { + static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks(); + return offsets_and_masks_; +} + +// A representation of a set of DispatchKeys. A DispatchKeySet contains both +// "functionality" bits and "backend bits", and every tensor holds its own +// DispatchKeySet. The Dispatcher implements multiple dispatch by grabbing the +// keyset on every input tensor, or’ing them together, and dispatching to a +// specific piece of functionality. The functionality bits are *ordered*. When +// multiple functionality bits are set, we use the highest priority +// functionality. Similarly, multiple backend bits can theoretically be set if +// you call an operator with multiple tensors from difference devices (e.g. CPU +// and CUDA), although support for mixed device dispatch is limited (the only +// kernels that gracefully handle mixed device inputs for now are cuda kernels +// that take in a scalar cpu tensor). + +// A representation of a set of DispatchKeys. A tensor may have multiple +// tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the +// DispatchKeySet specifies what type ids apply. The internal representation is +// as a 64-bit bit set (this means only 64 tensor type ids are supported). +// +// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like +// "what is the highest priority DispatchKey in the set"? (The set itself is +// not ordered; two sets with the same ids will always have the ids ordered in +// the same way.) +// +// Note [DispatchKeySet Internal Representation] +// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects +// that get passed around at runtime. 
+// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset +// and individual dispatch keys. +// +// First: why do we have this distinction, and why not map every dispatch key +// directly to a bit? This is mostly because we have several types of +// functionalities that different backends would like to customize. For example, +// we have: +// - "Dense": CPU, CUDA, XLA, ... (~12 keys) +// - "Sparse": SparseCPU, SparseCUDA, ... +// - "SparseCsr": SparseCsrCPU, SparseCsrCUDA, ... +// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ... +// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ... +// The problem is that total number of keys grows quadratically with [# +// backends] x [# functionalities], making it very difficult to map each key +// directly to a bit in a bitset without dramatically increasing the size of the +// bitset over time. +// +// The two enums (BackendComponent and DispatchKey) can be divided roughly into +// 5 categories. +// +// (1) "Building block" keys +// (a) backends: Everything in the BackendComponent enum (e.g. CPUBit, +// CUDABit) (b) functionalities: (per-backend) functionality-bit DispatchKeys +// (e.g. AutogradFunctionality, SparseCsr, Sparse, Dense) +// (2) "Runtime" keys +// (a) "non-customizable backends" (e.g. FPGA) +// (b) "non-customizable functionalities" (e.g. Functionalize) +// (c) "per-backend instances of customizable functionalities" (e.g. CPU, +// SparseCPU, AutogradCPU) +// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys]) +// +// (1) Building block keys always correspond to individual bits in a +// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual +// runtime keys. e.g. +// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit, +// DispatchKey::Dense}); +// // The keyset has the runtime dense-cpu key. +// dense_cpu_ks.has(DispatchKey::CPU); +// // And it contains the building block keys too. 
+// dense_cpu_ks.has(DispatchKey::CPUBit); +// dense_cpu_ks.has(DispatchKey::Dense); +// +// Not every backend and not every functionality counts as a "building block +// key". This is mostly to give us more levers to pull in the design space. +// Backend keys and functionality keys that count as "building blocks" will +// contribute to a full cross product of functionality that can be overridden. +// +// For example, right now we have at least 12 "backend" building +// blocks (CPU, CUDA, XLA, ...) and at least 5 "functionality" +// building blocks (Dense, Sparse, SparseCsr, Quantized, +// AutogradFunctionality, ...). These keys together allow every +// dispatcher operator to be customized in up to 12*4 different +// ways. Each of those requires a slot in the operator table of every +// dispatcher operator. Not every piece of functionality necessarily +// needs to be customizable per-backend, and not every backend +// necessarily needs to be able to customize every type of +// functionality. +// +// +// (2) Every runtime key corresponds directly to a slot in an operator's runtime +// dispatch table, and you can directly register kernels to a runtime dispatch +// key. +// +// For per-backend functionalities like "Dense" or "AutogradFunctionality", +// you can think of the corresponding runtime dispatch keys as "instances" of +// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all +// runtime instances of the "Dense" building block key. + +// (2a) and (2b) are represented identically in the DispatchKeySet logic: +// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT +// customizable per backend. +// In order to do so, we'd need to promote it to a per-backend functionality +// "building block" key. +// - non-customizable backends (e.g. FPGA) can NOT customize existing +// functionality like Sparse, Autograd, etc. +// In order to do so, we'd need to promote it to a backend "building block" +// key. 
+// +// In both cases, these keys directly correspond to runtime slots in the +// operator table. +// +// +// (3) "Alias" keys +// See Note [Alias Dispatch Keys] +// +// Final note: for anyone making future changes to the Dispatcher + +// DispatchKeySet internals, there's a closed PR with a basic +// python-implementation of the Dispatcher that might be useful in quickly +// testing out and validating changes. See it at +// https://github.com/pytorch/pytorch/pull/68743 + +// An undefined tensor is one with an empty tensor type set. +class DispatchKeySet final { + public: + enum Full { FULL }; + enum FullAfter { FULL_AFTER }; + enum Raw { RAW }; + + // NB: default constructor representation as zero is MANDATORY as + // use of DispatchKeySet in TLS requires this. + constexpr DispatchKeySet() = default; + + constexpr DispatchKeySet(Full /*unused*/) + : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {} + + constexpr DispatchKeySet(FullAfter /*unused*/, DispatchKey t) + // LSB after t are OK, but not t itself. + // "functionalities" have a notion of ordering (e.g. Autograd > Sparse > + // Quantized > Dense). But backends don't really have an ordering. + // Therefore, we're enforcing that FullAfter can only be used on + // "functionality" keys. + : repr_( + (1ULL + << (num_backends + static_cast(toFunctionalityKey(t)) - + 1)) - + 1) { + *this = add(DispatchKey::PythonDispatcher); + } + + // Public version of DispatchKeySet(uint64_t) API; external users + // must be explicit when they do this! 
+ constexpr DispatchKeySet(Raw /*unused*/, uint64_t x) : repr_(x) {} + + constexpr explicit DispatchKeySet(BackendComponent k) { + if (k == BackendComponent::InvalidBit) { + repr_ = 0; + } else { + repr_ = 1ULL << (static_cast(k) - 1); + } + } + + constexpr explicit DispatchKeySet(DispatchKey k) { + // NOLINTNEXTLINE(bugprone-branch-clone) + if (k == DispatchKey::Undefined) { + // Case 1: handle Undefined specifically + repr_ = 0; + } else if (k <= DispatchKey::EndOfFunctionalityKeys) { + // Case 2: handle "functionality-only" keys + // These keys have a functionality bit set, but no backend bits + // These can technically be either: + // - valid runtime keys (e.g. DispatchKey::AutogradOther, + // DispatchKey::FuncTorchBatched, etc) + // - "building block" keys that aren't actual runtime keys (e.g. + // DispatchKey::Dense or Sparse) + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(k) - 1); + repr_ = functionality_val; + } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) { + // Case 3: "runtime" keys that have a functionality bit AND a backend bit. + // First compute which bit to flip for the functionality. + auto functionality_k = toFunctionalityKey(k); + // The - 1 is because Undefined is technically a "functionality" that + // doesn't show up in the bitset. So e.g. Dense is technically the second + // functionality, but the lowest functionality bit. + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(functionality_k) - 1); + + // then compute which bit to flip for the backend + // Case 4a: handle the runtime instances of "per-backend functionality" + // keys For example, given DispatchKey::CPU, we should set: + // - the Dense functionality bit + // - the CPUBit backend bit + // first compute which bit to flip for the backend + auto backend_k = toBackendComponent(k); + uint64_t backend_val = backend_k == BackendComponent::InvalidBit + ? 
0 + : 1ULL << (static_cast(backend_k) - 1); + repr_ = functionality_val + backend_val; + } else { + // At this point, we should have covered every case except for alias keys. + // Technically it would be possible to add alias dispatch keys to a + // DispatchKeySet, but the semantics are a little confusing and this + // currently isn't needed anywhere. + repr_ = 0; + } + } + + constexpr uint64_t keys_to_repr(std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + constexpr uint64_t backend_bits_to_repr( + std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + explicit constexpr DispatchKeySet(std::initializer_list ks) + : repr_(keys_to_repr(ks)) {} + + explicit constexpr DispatchKeySet(std::initializer_list ks) + // Note: for some reason, putting this logic directly in the constructor + // appears to fail to compile on CUDA 10.1. + // See an example internal failure at + // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr + : repr_(backend_bits_to_repr(ks)) {} + + // Test if a DispatchKey is in the set + inline bool has(DispatchKey t) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); + return has_all(DispatchKeySet(t)); + } + constexpr bool has_backend(BackendComponent t) const { + return has_all(DispatchKeySet(t)); + } + + // Test if a DispatchKey is in the set + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if all of them are in the current set. + constexpr bool has_all(DispatchKeySet ks) const { + return static_cast((repr_ & ks.repr_) == ks.repr_); + } + + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if any of them are in the current set. This could technically + // be pretty easily implemented using has(). 
It is strictly a perf + // optimization though. There are many places in the code base where we want + // to test for multiple functionality keys together. HOWEVER, runtime + // per-backend functionality keys aren't allowed to be used with this + // function, because you can end up with weird results. e.g. + // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU)) + // would return true. + inline bool has_any(DispatchKeySet ks) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // Either there are no backend bits in the input keyset + ((ks.repr_ & full_backend_mask) == 0) || + // or there are no per-backend-functionality bits + // See [Note: Per-Backend Functionality Dispatch Keys] + ((ks & + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::SparseCsr, + DispatchKey::AutogradFunctionality, + }) + .repr_) == 0)); + return static_cast((repr_ & ks.repr_) != 0); + } + // Test if DispatchKeySet is a superset of ks. + bool isSupersetOf(DispatchKeySet ks) const { + return (repr_ & ks.repr_) == ks.repr_; + } + // Perform set union + constexpr DispatchKeySet operator|(DispatchKeySet other) const { + return DispatchKeySet(repr_ | other.repr_); + } + // Perform set intersection + constexpr DispatchKeySet operator&(DispatchKeySet other) const { + return DispatchKeySet(repr_ & other.repr_); + } + // Compute the set difference self - other, + // but ONLY for the functionality keys. + // Any backend bits set on self will remain unchanged. 
+ // See Note [Removing keys from DispatchKeySet Only Affects Functionality + // Keys] + constexpr DispatchKeySet operator-(DispatchKeySet other) const { + return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_)); + } + + // Compute self ^ other + constexpr DispatchKeySet operator^(DispatchKeySet other) const { + return DispatchKeySet(repr_ ^ other.repr_); + } + bool operator==(DispatchKeySet other) const { + return repr_ == other.repr_; + } + bool operator!=(DispatchKeySet other) const { + return repr_ != other.repr_; + } + // Add a DispatchKey to the DispatchKey set. Does NOT mutate, + // returns the extended DispatchKeySet! + [[nodiscard]] constexpr DispatchKeySet add(DispatchKey t) const { + return *this | DispatchKeySet(t); + } + [[nodiscard]] constexpr DispatchKeySet add(DispatchKeySet ks) const { + return *this | ks; + } + + // Remove a DispatchKey from the DispatchKey set. + // This is generally not an operation you should be doing + // (it's used to implement the printing overload, operator<<) + // + // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys] + // Only functionality bits are allowed to be removed from a keyset. + // For now, we're only allowing removal of "functionality bits" from the + // keyset, which is specifically needed by the fallthrough key calculation + // logic. Why is removing backend bits problematic? Consider this example: + // + // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA, + // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA) + // DispatchKeySet([DispatchKey.CPU, + // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA) + // + // What do we want to happen? + // Technically, we'd like it to be true that after removal, + // the first keyset still has the CUDA dispatch key while the second doesn't. 
+ // Unfortunately there's no way to represent that, because the two keysets are + // represented the same way internally: functionality bits: Autograd, Dense + // backend bits: CPU, CUDA + // + // Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd" + // bit from the bitset. + [[nodiscard]] constexpr DispatchKeySet remove(DispatchKey t) const { + return DispatchKeySet( + repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask)); + } + // You're allowed to remove a backend bit from a DispatchKeySet, + // but you have to be explicit about it (remove_backend() instead of + // remove()). + constexpr DispatchKeySet remove_backend(BackendComponent b) const { + return DispatchKeySet(repr_ & ~(DispatchKeySet(b).repr_)); + } + // Is the set empty? (AKA undefined tensor) + bool empty() const { + return repr_ == 0; + } + uint64_t raw_repr() const { + return repr_; + } + + static DispatchKeySet from_raw_repr(uint64_t x) { + return DispatchKeySet(RAW, x); + } + + DispatchKey highestFunctionalityKey() const { + auto functionality_idx = indexOfHighestBit(); + // This means that none of the functionality bits were set. + if (functionality_idx < num_backends) + return DispatchKey::Undefined; + // The first num_backend bits in the keyset don't correspond to real + // dispatch keys. + return static_cast(functionality_idx - num_backends); + } + + // This is similar like toBackendComponent(DispatchKey), but less restrictive. + // toBackendComponent() errors out if the key that it was passed has no + // backend bits, which is useful for error checking. We need a version of that + // here that can also handle "fake" backends like FPGA, because they need to + // map to the AutogradOther key. For those backends, we return + // BackendComponent::InvalidBit. 
+ BackendComponent highestBackendKey() const { + // mask to mask out functionality bits + auto backend_idx = + DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit(); + // all zeros across the backend bits means that no backend bits are set. + if (backend_idx == 0) + return BackendComponent::InvalidBit; + return static_cast(backend_idx); + } + + // returns the DispatchKey of highest priority in the set. + DispatchKey highestPriorityTypeId() const { + auto functionality_k = highestFunctionalityKey(); + if (isPerBackendFunctionalityKey(functionality_k)) { + return toRuntimePerBackendFunctionalityKey( + functionality_k, highestBackendKey()); + } + return functionality_k; + } + + // Returns the index of the most-significant bit in the keyset. + // This is used to as part of the calculation into the operator table to get: + // - the highest "functionality" bit in the keyset. + // - the highest "backend" bit in the keyset. + uint8_t indexOfHighestBit() const { + return 64 - llvm::countLeadingZeros(repr_); + } + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) + // [Note: Trimmed Mobile Dispatch Keys] + /** + * The method below maps the dispatch key in the enum DispatchKey to an + * integer index in the dispatchTable_ array in OperatorEntry. The array + * is trimmed for mobile to reduce peak memory usage since it's + * unnecessary to reserve additional space for dispatch keys that will + * never be used on mobile. 
+ */ + int getDispatchTableIndexForDispatchKeySet() const { + auto dk = highestPriorityTypeId(); + switch (dk) { + case DispatchKey::Undefined: + return 0; + case DispatchKey::CPU: + return 1; + case DispatchKey::QuantizedCPU: + return 2; + case DispatchKey::SparseCPU: + return 3; + case DispatchKey::BackendSelect: + return 4; + case DispatchKey::ADInplaceOrView: + return 5; + case DispatchKey::AutogradOther: + return 6; + case DispatchKey::AutogradCPU: + return 7; + default: + return -1; + } + } +#else + // returns the index in the operator table of highest priority key in the the + // keyset Note that we could in theory implement this using + // highestPriorityTypeId(), but this code is very hotpath and we can do it + // faster without it. + int getDispatchTableIndexForDispatchKeySet() const { + auto functionality_idx = + DispatchKeySet(repr_ >> num_backends).indexOfHighestBit(); + auto offset_and_mask = offsetsAndMasks()[functionality_idx]; + // Mask the functionality bits out first, then right-shift by 1. + // right-shifting by 1 because everything is zero-indexed. + // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should + // give us an offset of 1, etc. + auto backend_idx = + DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit(); + return offset_and_mask.offset + backend_idx; + } +#endif + + // returns the "index" of the highest priority backend in the keyset. + // This is pretty similar to getBackendKey(), but: + // - It's hotpath code (part of the runtime bitset calculation) + // - I's returns an integer index, not an enum value + // - Everything is shifted to the right by 1. + // BackendComponent::InvalidBit is technically the lowest enum value, + // but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2, + // etc. 
+ uint64_t getBackendIndex() const { + return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit(); + } + + private: + constexpr DispatchKeySet(uint64_t repr) : repr_(repr) {} + uint64_t repr_ = 0; + + public: + // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys + // in the set. The iterator is only invalidated by the destruction of the + // underlying DispatchKeySet as the iterator stores a pointer to the raw + // representation of the DispatchKeySet. Note: When we encounter a per-backend + // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend + // in the keyset, for that functionality. For example, if the next + // functionality key to iterate over is Autograd, and the backend bits in the + // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit], + // then the next two keys we return will be DispatchKey::AutogradCPU, + // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than + // CUDA in DispatchKey.h). + class iterator { + public: + using self_type = iterator; + using iterator_category = std::input_iterator_tag; + using value_type = DispatchKey; + using difference_type = ptrdiff_t; + using reference = value_type&; + using pointer = value_type*; + // final mask value should mask out the entire keyset + static const uint8_t end_iter_mask_val = + num_backends + num_functionality_keys; + // final key value should be the last DispatchKey + static const uint8_t end_iter_key_val = num_functionality_keys; + + // current_dispatchkey_idx_ will iterate through all functionality bits. + // current_backendcomponent_idx_ will iterate through all backend bits. 
+ explicit iterator( + const uint64_t* data_ptr, + uint8_t next_functionality = num_backends, + uint8_t next_backend = 0) + : data_ptr_(data_ptr), + next_functionality_(next_functionality), + next_backend_(next_backend), + // These are in an invalid state at construction time, and set by the + // first increment call + current_dispatchkey_idx_(end_iter_key_val), + current_backendcomponent_idx_(end_iter_key_val) { + // Go to the first key in the set + TORCH_INTERNAL_ASSERT( + next_functionality_ >= num_backends, + "num_backends=", + static_cast(num_backends), + "next_functionality_=", + static_cast(next_functionality_)); + ++(*this); + } + + C10_API self_type& operator++(); + + self_type operator++(int) { + self_type previous_iterator = *this; + ++(*this); + return previous_iterator; + } + + bool operator==(const self_type& rhs) const { + return next_functionality_ == rhs.next_functionality_ && + current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ && + next_backend_ == rhs.next_backend_ && + current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_; + } + bool operator!=(const self_type& rhs) const { + return next_functionality_ != rhs.next_functionality_ || + current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ || + next_backend_ != rhs.next_backend_ || + current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_; + } + DispatchKey operator*() const { + auto functionality_key = + static_cast(current_dispatchkey_idx_); + if (isPerBackendFunctionalityKey(functionality_key)) { + auto next_key = toRuntimePerBackendFunctionalityKey( + functionality_key, + static_cast(current_backendcomponent_idx_)); + // We expect all of the Dense, Sparse, Quantized, and Autograd keys to + // be ordered the same way with respect to their backends + TORCH_INTERNAL_ASSERT( + toBackendComponent(next_key) == + static_cast(current_backendcomponent_idx_), + "Tried to map functionality key ", + toString(functionality_key), + " and backend bit ", + toString( + 
static_cast(current_backendcomponent_idx_)), + " to a runtime key, but ended up with ", + toString(next_key), + ". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.", + " Please double check that enum for inconsistencies."); + return next_key; + } else { + return functionality_key; + } + } + + private: + const uint64_t* data_ptr_; + uint8_t next_functionality_; + uint8_t next_backend_; + uint8_t current_dispatchkey_idx_; + uint8_t current_backendcomponent_idx_; + }; + + public: + // Returns iterator to the first key in the set. If no keys are in the + // set, then will return the end iterator. + iterator begin() const { + return iterator(&repr_); + } + + // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat + // this as the end iterator. + iterator end() const { + return iterator(&repr_, iterator::end_iter_mask_val); + } +}; + +C10_API std::string toString(DispatchKeySet /*ts*/); +C10_API std::ostream& operator<<(std::ostream& /*os*/, DispatchKeySet /*ts*/); + +inline int getDispatchTableIndexForDispatchKey(DispatchKey k) { + return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet(); +} + +// Alias key DispatchKey::Autograd maps to +// (autograd_dispatch_keyset x full_backend_mask) +// NB: keys in this set also get associated with CompositeImplicitAutograd +// +// Note [autograd_dispatch_keyset Does Not Include Backend Bits] +// We don't want to include any backend bits (BackendComponent::CPUBit, etc) +// directly in autograd_dispatch_keyset. +// Why? keysets like autograd_dispatch_keyset are commonly used to remove +// autograd keys from a DispatchKeySet throughout the code base. However, you +// are only allowed to remove functionality bits from a keyset, not backend +// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality +// Keys] for details. To be consistent and avoid confusion, we're explicitly +// setting up autograd_dispatch_keyset to not have any backend bits. 
+constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({ + DispatchKey::AutogradFunctionality, + DispatchKey::AutogradOther, + DispatchKey::AutogradNestedTensor, +}); + +constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({ + DispatchKey::AutocastCPU, + DispatchKey::AutocastMPS, + DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, + DispatchKey::AutocastIPU, + DispatchKey::AutocastHPU, + DispatchKey::AutocastXLA, + DispatchKey::AutocastPrivateUse1, + DispatchKey::AutocastMTIA, + DispatchKey::AutocastMAIA, +}); + +// See Note [TLS Initialization] +constexpr DispatchKeySet default_included_set = DispatchKeySet({ + DispatchKey::BackendSelect, + DispatchKey::ADInplaceOrView, +}); + +constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ + DispatchKey::AutocastCPU, + DispatchKey::AutocastMPS, + DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, + DispatchKey::AutocastIPU, + DispatchKey::AutocastHPU, + DispatchKey::AutocastXLA, + DispatchKey::AutocastPrivateUse1, + DispatchKey::AutocastMTIA, + DispatchKey::AutocastMAIA, +}); + +constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = + autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); + +constexpr DispatchKeySet python_ks = DispatchKeySet({ + DispatchKey::Python, + DispatchKey::PythonTLSSnapshot, +}); + +constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); + +constexpr DispatchKeySet sparse_csr_ks = DispatchKeySet(DispatchKey::SparseCsr); + +constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU); + +// backend dispatch keys that map to DispatchKey::AutogradOther +// NB: keys in this set also get associated with CompositeImplicitAutograd +constexpr DispatchKeySet autogradother_backends = + DispatchKeySet( + // HIP and VE aren't in this list: they now have their own backend bits + // which means that they can now have their own Autograd keys. 
+ // Technically, HIP will now redispatch to its own custom AutogradHIP + // slot in the runtime table. + {DispatchKey::FPGA, + DispatchKey::Vulkan, + DispatchKey::Metal, + DispatchKey::CustomRNGKeyId, + DispatchKey::MkldnnCPU, + // Sparse and Quantized backends also live here. + DispatchKey::Sparse, + DispatchKey::SparseCsr, + DispatchKey::Quantized}) + // Including the backend bits because this keyset is used during op + // registration, which requires looping over all runtime autogradother + // backend keys. + | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + +// The set of dispatch keys that come after autograd +// n.b. this relies on the fact that AutogradOther is currently the lowest +// Autograd key +constexpr DispatchKeySet after_autograd_keyset = + DispatchKeySet(DispatchKeySet::FULL_AFTER, c10::DispatchKey::AutogradOther); + +// The set of dispatch keys that come after ADInplaceOrView +constexpr DispatchKeySet after_ADInplaceOrView_keyset = DispatchKeySet( + DispatchKeySet::FULL_AFTER, + c10::DispatchKey::ADInplaceOrView); + +// The set of dispatch keys that come after Functionalize +constexpr DispatchKeySet after_func_keyset = + DispatchKeySet(DispatchKeySet::FULL_AFTER, c10::DispatchKey::Functionalize) + .remove( + // NOTE: we also need to remove ADInplaceOrView from the keyset when + // redispatching after the func kernels. This is because we're not + // calling the same op; we originally called an inplace op, and now + // we aren't. The original key calculation figured out which keys + // were Fallthrough based on the inplace op. That means that it did + // not include the ADInPlaceOrView kernel as a fallthrough key. + // However, we WANT the ADInPlaceOrView kernel to be ignored now + // that we're calling an out-of-place op. Re-invoking + // Dispatcher::call would re-run the Fallthrough key calculation and + // get us that, But at::redispatch is more performant. We can get + // away with it by explicitly removing the key here. 
+ c10::DispatchKey::ADInplaceOrView); + +constexpr DispatchKeySet backend_bitset_mask = + DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1); + +constexpr auto inplace_or_view_ks = + DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU); +constexpr auto autograd_ipu_ks = DispatchKeySet(DispatchKey::AutogradIPU); +constexpr auto autograd_mtia_ks = DispatchKeySet(DispatchKey::AutogradMTIA); +constexpr auto autograd_maia_ks = DispatchKeySet(DispatchKey::AutogradMAIA); +constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU); +constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA); +constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA); +constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy); +constexpr auto autograd_meta_ks = DispatchKeySet(DispatchKey::AutogradMeta); +constexpr auto autograd_mps_ks = DispatchKeySet(DispatchKey::AutogradMPS); +constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU); +constexpr auto autograd_privateuse1_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse1); +constexpr auto autograd_privateuse2_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse2); +constexpr auto autograd_privateuse3_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse3); +constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther); +constexpr auto autograd_nested = + DispatchKeySet(DispatchKey::AutogradNestedTensor); +// keyset corresponding to functorch keys that have their own dedicated +// TensorImpl subclass. 
+constexpr auto functorch_transforms_ks = DispatchKeySet( + {DispatchKey::FuncTorchBatched, + DispatchKey::FuncTorchVmapMode, + DispatchKey::Batched, + DispatchKey::VmapMode, + DispatchKey::FuncTorchGradWrapper}); + +constexpr auto functorch_batched_ks = + DispatchKeySet({DispatchKey::FuncTorchBatched}); + +// This keyset has: +// (1) the functionality bits corresponding to backends (dense, sparse, +// quantized) (2) all of the backend bits set +constexpr DispatchKeySet backend_functionality_keys = + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::SparseCsr, + }) | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + +struct OpTableOffsetAndMask { + uint16_t offset; + uint16_t backend_mask; +}; + +static_assert( + num_backends <= 16, + "Right now we expect the number of backends not to exceed 16. In the (unlikely) event" + " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too."); + +// true if t is a backend dispatch key +C10_API bool isBackendDispatchKey(DispatchKey t); + +// Resolve alias dispatch key to DispatchKeySet if applicable +C10_API DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t); + +// Resolve alias dispatch key to DispatchKeySet if applicable, +// and check if k is a part of that set +C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k); + +// Returns a DispatchKeySet of all backend keys mapped to Autograd dispatch key +// t, DispatchKeySet is empty if t is not alias of DispatchKey::Autograd. +C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); + +// Returns a DispatchKeySet of autograd related keys mapped to backend. +// for a given backend key, use the associated autograd key. +// for non-backend keys, use AutogradOther as a default. +// Note: it's convenient and fast to return a default here rather than (say) +// returning an std::optional, or throwing. 
But it makes callers +// responsible for either a) enforcing the invariant that only backend keys +// be passed as arguments, or b) interpreting our return value carefully. +inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return inplace_or_view_ks | autograd_cpu_ks; + case BackendComponent::IPUBit: + return inplace_or_view_ks | autograd_ipu_ks; + case BackendComponent::MTIABit: + return inplace_or_view_ks | autograd_mtia_ks; + case BackendComponent::MAIABit: + return inplace_or_view_ks | autograd_maia_ks; + case BackendComponent::XPUBit: + return inplace_or_view_ks | autograd_xpu_ks; + case BackendComponent::CUDABit: + return inplace_or_view_ks | autograd_cuda_ks; + case BackendComponent::XLABit: + return inplace_or_view_ks | autograd_xla_ks; + case BackendComponent::LazyBit: + return inplace_or_view_ks | autograd_lazy_ks; + case BackendComponent::MetaBit: + return inplace_or_view_ks | autograd_meta_ks; + case BackendComponent::MPSBit: + return inplace_or_view_ks | autograd_mps_ks; + case BackendComponent::HPUBit: + return inplace_or_view_ks | autograd_hpu_ks; + case BackendComponent::PrivateUse1Bit: + return inplace_or_view_ks | autograd_privateuse1_ks; + case BackendComponent::PrivateUse2Bit: + return inplace_or_view_ks | autograd_privateuse2_ks; + case BackendComponent::PrivateUse3Bit: + return inplace_or_view_ks | autograd_privateuse3_ks; + default: + return inplace_or_view_ks | autograd_other_ks; + } +} + +// Returns a DispatchKeySet of autocast related keys mapped to backend. 
+inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) { + constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU); + constexpr auto autocast_mtia_ks = DispatchKeySet(DispatchKey::AutocastMTIA); + constexpr auto autocast_maia_ks = DispatchKeySet(DispatchKey::AutocastMAIA); + constexpr auto autocast_xpu_ks = DispatchKeySet(DispatchKey::AutocastXPU); + constexpr auto autocast_ipu_ks = DispatchKeySet(DispatchKey::AutocastIPU); + constexpr auto autocast_hpu_ks = DispatchKeySet(DispatchKey::AutocastHPU); + constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA); + constexpr auto autocast_xla_ks = DispatchKeySet(DispatchKey::AutocastXLA); + constexpr auto autocast_privateuse1_ks = + DispatchKeySet(DispatchKey::AutocastPrivateUse1); + constexpr auto autocast_mps_ks = DispatchKeySet(DispatchKey::AutocastMPS); + switch (t) { + case BackendComponent::CPUBit: + return autocast_cpu_ks; + case BackendComponent::MTIABit: + return autocast_mtia_ks; + case BackendComponent::MAIABit: + return autocast_maia_ks; + case BackendComponent::XPUBit: + return autocast_xpu_ks; + case BackendComponent::IPUBit: + return autocast_ipu_ks; + case BackendComponent::HPUBit: + return autocast_hpu_ks; + case BackendComponent::CUDABit: + return autocast_cuda_ks; + case BackendComponent::XLABit: + return autocast_xla_ks; + case BackendComponent::PrivateUse1Bit: + return autocast_privateuse1_ks; + case BackendComponent::MPSBit: + return autocast_mps_ks; + default: + return DispatchKeySet(); + } +} + +// returns the "backend" DispatchKey of highest priority in the set. 
+// This is basically like highestBackendKey(), except that we have some +// "functionality" bits that correspond to backends (Sparse, Quantized) +inline DispatchKey highestPriorityBackendTypeId(DispatchKeySet ks) { + return (ks & backend_functionality_keys).highestPriorityTypeId(); +} + +// This API exists because we have a use case for checking +// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) +// in OperatorEntry.cpp but we disallow it in has() API. +C10_API bool isIncludedInAlias(DispatchKey k, DispatchKey alias); + +// Historically, every tensor only had a single DispatchKey, and it was always +// something like CPU, and there wasn't any of this business where TLS +// could cause the DispatchKey of a tensor to change. But we still have some +// legacy code that is still using DispatchKey for things like instanceof +// checks; if at all possible, refactor the code to stop using DispatchKey in +// those cases. +inline DispatchKey legacyExtractDispatchKey(DispatchKeySet s) { + // NB: If you add any extra keys that can be stored in TensorImpl on + // top of existing "backend" keys like CPU/CUDA, you need to add it + // here. At the moment, autograd keys and ADInplaceOrView key need this + // treatment; + return (s - autograd_dispatch_keyset_with_ADInplaceOrView - + autocast_dispatch_keyset - + DispatchKeySet( + {DispatchKey::Functionalize, + DispatchKey::PythonTLSSnapshot, + DispatchKey::FuncTorchGradWrapper, + DispatchKey::FuncTorchVmapMode, + DispatchKey::FuncTorchBatched, + DispatchKey::Python})) + .highestPriorityTypeId(); +} + +template +using is_not_DispatchKeySet = std::negation>; + +// Given a function type, constructs a function_traits type that drops the first +// parameter type if the first parameter is of type DispatchKeySet. NB: +// DispatchKeySet is currently explicitly hidden from JIT (mainly to avoid +// pushing unnecessary arguments on the stack - see Note [ Plumbing Keys Through +// the Dispatcher] for details). 
If at any point in the future we need to expose +// this type to JIT, revisit the usage of this type alias. +template +using remove_DispatchKeySet_arg_from_func = guts::make_function_traits_t< + typename guts::infer_function_traits_t::return_type, + typename std::conditional_t< + std::is_same_v< + DispatchKeySet, + typename guts::typelist::head_with_default_t< + void, + typename guts::infer_function_traits_t< + FuncType>::parameter_types>>, + guts::typelist::drop_if_nonempty_t< + typename guts::infer_function_traits_t::parameter_types, + 1>, + typename guts::infer_function_traits_t::parameter_types>>; +} // namespace c10 + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DynamicCast.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DynamicCast.h new file mode 100644 index 0000000000000000000000000000000000000000..d0f0f0b27c97bf7521a09fae5c6d7c04d9e0b46e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DynamicCast.h @@ -0,0 +1,134 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace c10 { + +// Dynamic type casting utils: +// - fetch_and_cast +// - cast_and_store +// +// fetch_and_cast fetch a value with dynamic type specified by a ScalarType +// from a void pointer and cast it to a static type. +// +// cast_and_store casts a static typed value into dynamic type specified +// by a ScalarType, and store it into a void pointer. 
+// +// NOTE: +// +// Dynamic casting allows us to support type promotion without blowing up +// the combination space: For example, without dynamic cast, in order to +// implement `add_` with type promotion, we would need something like +// +// AT_DISPATCH_ALL_TYPES(output.dtype(), +// AT_DISPATCH_ALL_TYPES(input1.dtype(), +// AT_DISPATCH_ALL_TYPES(input2.dtype(), +// [](arg0_t a, arg1_t b) -> out_t { return a + b; } +// ) +// ) +// ) +// +// If we support N dtypes, the above code would generate the a+b kernel for +// all the N * N * N different supported types, the compilation time and +// binary size would become horrible. +// +// Dynamic casting might sounds like a bad idea in terms of performance. +// Especially if you ever do it in a loop, you are going to do a billion tests. +// But in practice it is not as bad as it might look: +// +// - on CPU, this is a branch that always has the same outcome, therefore +// hopefully the branch predictor could do the job pretty well +// - on GPU, these branches will not diverge, so we could still have the same +// warp executing the same line of code +// - Most kernels, like `add`, are bandwidth bound, adding a few clock cycles to +// check an integer does not hurt the performance much because the ALUs would +// wait for load instructions anyway. +// +// For the discussion and benchmark, refer to: +// - https://github.com/pytorch/pytorch/pull/28343 +// - https://github.com/pytorch/pytorch/pull/28344 +// - https://github.com/pytorch/pytorch/pull/28345 +// + +#ifdef C10_HOST_DEVICE +#define ERROR_UNSUPPORTED_CAST CUDA_KERNEL_ASSERT(false); +#else +#define ERROR_UNSUPPORTED_CAST TORCH_CHECK(false, "Unexpected scalar type"); +#endif + +// Fetch a value with dynamic type src_type from ptr, and cast it to static type +// dest_t. 
+#define FETCH_AND_CAST_CASE(type, scalartype) \ + case ScalarType::scalartype: \ + return c10::convert(c10::load(ptr)); + +template +C10_HOST_DEVICE inline dest_t fetch_and_cast( + const ScalarType src_type, + const void* ptr) { + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + switch (src_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(FETCH_AND_CAST_CASE) + FETCH_AND_CAST_CASE(uint16_t, UInt16) + FETCH_AND_CAST_CASE(uint32_t, UInt32) + FETCH_AND_CAST_CASE(uint64_t, UInt64) + default: + ERROR_UNSUPPORTED_CAST + } + C10_DIAGNOSTIC_POP() + return dest_t(0); // just to avoid compiler warning +} + +// Cast a value with static type src_t into dynamic dest_type, and store it to +// ptr. +#define CAST_AND_STORE_CASE(type, scalartype) \ + case ScalarType::scalartype: \ + *(type*)ptr = c10::convert(value); \ + return; +template +C10_HOST_DEVICE inline void cast_and_store( + const ScalarType dest_type, + void* ptr, + src_t value) { + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + switch (dest_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(CAST_AND_STORE_CASE) + CAST_AND_STORE_CASE(uint16_t, UInt16) + CAST_AND_STORE_CASE(uint32_t, UInt32) + CAST_AND_STORE_CASE(uint64_t, UInt64) + default:; + } + C10_DIAGNOSTIC_POP() + ERROR_UNSUPPORTED_CAST +} + +#define DEFINE_UNCASTABLE(T, scalartype_) \ + template <> \ + C10_HOST_DEVICE inline T fetch_and_cast( \ + const ScalarType src_type, const void* ptr) { \ + CUDA_KERNEL_ASSERT(ScalarType::scalartype_ == src_type); \ + return c10::load(ptr); \ + } \ + template <> \ + C10_HOST_DEVICE inline void cast_and_store( \ + const ScalarType dest_type, void* ptr, T value) { \ + CUDA_KERNEL_ASSERT(ScalarType::scalartype_ == dest_type); \ + *(T*)ptr = value; \ + } + +AT_FORALL_QINT_TYPES(DEFINE_UNCASTABLE) + +#undef FETCH_AND_CAST_CASE +#undef CAST_AND_STORE_CASE +#undef DEFINE_UNCASTABLE +#undef ERROR_UNSUPPORTED_CAST + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY 
or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Event.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Event.h new file mode 100644 index 0000000000000000000000000000000000000000..aed1a213bfb4724b5019909adafc237297262f9e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Event.h @@ -0,0 +1,142 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/** + * A backend-generic movable, not copyable, not thread-safe event. + * + * The design of this event follows that of CUDA and HIP events. These events + * are recorded and waited on by streams and can be rerecorded to, + * each rerecording essentially creating a new version of the event. + * For example, if (in CPU time), stream X is asked to record E, + * stream Y waits on E, and stream X is asked to record E again, then Y will + * wait for X to finish the first call to record and not the second, because + * it's waiting on the first version of event E, not the second. + * Querying an event only returns the status of its most recent version. + * + * Backend-generic events are implemented by this class and + * impl::InlineEvent. In addition to these events there are also + * some backend-specific events, like ATen's CUDAEvent. Each of these + * classes has its own use. + * + * impl::InlineEvent<...> or a backend-specific event should be + * preferred when the backend is known at compile time and known to + * be compiled. Backend-specific events may have additional functionality. + * + * This Event should be used if a particular backend may not be available, + * or the backend required is not known at compile time. 
+ * + * These generic events are built on top of DeviceGuardImpls, analogous + * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls," + * is no longer entirely accurate, as these classes implement the + * backend-specific logic for a generic backend interface. + * + * See DeviceGuardImplInterface.h for a list of all supported flags. + */ + +struct Event final { + // Constructors + Event() = delete; + Event( + const DeviceType _device_type, + const EventFlag _flag = EventFlag::PYTORCH_DEFAULT) + : impl_{_device_type, _flag} {} + + // Copy constructor and copy assignment operator (deleted) + Event(const Event&) = delete; + Event& operator=(const Event&) = delete; + + // Move constructor and move assignment operator + Event(Event&&) noexcept = default; + Event& operator=(Event&&) noexcept = default; + + // Destructor + ~Event() = default; + + // Getters + Device device() const noexcept { + return Device(device_type(), device_index()); + } + DeviceType device_type() const noexcept { + return impl_.device_type(); + } + DeviceIndex device_index() const noexcept { + return impl_.device_index(); + } + EventFlag flag() const noexcept { + return impl_.flag(); + } + bool was_marked_for_recording() const noexcept { + return impl_.was_marked_for_recording(); + } + + /** + * Calls record() if and only if record() has never been called for this + * event. Note: because Event is not thread-safe recordOnce() may call + * record() multiple times if called from multiple threads. + */ + void recordOnce(const Stream& stream) { + impl_.recordOnce(stream); + } + + /** + * Increments the event's version and enqueues a job with this version + * in the stream's work queue. When the stream process that job + * it notifies all streams waiting on / blocked by that version of the + * event to continue and marks that version as recorded. 
+ * */ + void record(const Stream& stream) { + impl_.record(stream); + } + + /** + * Does nothing if the event has not been scheduled to be recorded. + * If the event was previously enqueued to be recorded, a command + * to wait for the version of the event that exists at the time of this call + * is inserted in the stream's work queue. + * When the stream reaches this command it will stop processing + * additional commands until that version of the event is marked as recorded. + */ + void block(const Stream& stream) const { + impl_.block(stream); + } + + /** + * Returns true if (and only if) + * (1) the event has never been scheduled to be recorded + * (2) the current version is marked as recorded. + * Returns false otherwise. + */ + bool query() const { + return impl_.query(); + } + + double elapsedTime(const Event& event) const { + return impl_.elapsedTime(event.impl_); + } + + void* eventId() const { + return impl_.eventId(); + } + + void synchronize() const { + impl_.synchronize(); + } + + private: + impl::InlineEvent impl_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GeneratorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GeneratorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..7d7aac9243ffbbfc4f79471ebceee04ced485219 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GeneratorImpl.h @@ -0,0 +1,116 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +/** + * Note [Generator] + * ~~~~~~~~~~~~~~~~ + * A Pseudo Random Number Generator (PRNG) is an engine that uses an algorithm + * to generate a seemingly random sequence of numbers, that may be later be used + * in creating a random distribution. Such an engine almost always maintains a + * state and requires a seed to start off the creation of random numbers. Often + * times, users have found it beneficial to be able to explicitly create, + * retain, and destroy PRNG states and also be able to have control over the + * seed value. + * + * A Generator in ATen gives users the ability to read, write and modify a PRNG + * engine. For instance, it does so by letting users seed a PRNG engine, fork + * the state of the engine, etc. + * + * By default, there is one generator per device, and a device's generator is + * lazily created. A user can use the torch.Generator() api to create their own + * generator. Currently torch.Generator() can only create a CPUGeneratorImpl. + */ + +/** + * Note [Acquire lock when using random generators] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Generator and its derived classes are NOT thread-safe. 
Please note that most + * of the places where we have inserted locking for generators are historically + * based, and we haven't actually checked that everything is truly thread safe + * (and it probably isn't). Please use the public mutex_ when using any methods + * from these classes, except for the read-only methods. You can learn about the + * usage by looking into the unittests (aten/src/ATen/cpu_generator_test.cpp) + * and other places where we have used lock_guard. + * + * TODO: Look into changing the threading semantics of Generators in ATen (e.g., + * making them non-thread safe and instead making the generator state + * splittable, to accommodate forks into other threads). + */ + +namespace c10 { + +// The default seed is selected to be a large number +// with good distribution of 0s and 1s in bit representation +constexpr uint64_t default_rng_seed_val = 67280421310721; + +struct C10_API GeneratorImpl : public c10::intrusive_ptr_target { + // Constructors + GeneratorImpl(Device device_in, DispatchKeySet key_set); + + // Delete all copy and move assignment in favor of clone() + // method + GeneratorImpl(const GeneratorImpl& other) = delete; + GeneratorImpl(GeneratorImpl&& other) = delete; + GeneratorImpl& operator=(const GeneratorImpl& other) = delete; + GeneratorImpl& operator=(GeneratorImpl&& other) = delete; + + ~GeneratorImpl() override = default; + c10::intrusive_ptr clone() const; + + // Common methods for all generators + virtual void set_current_seed(uint64_t seed) = 0; + virtual void set_offset(uint64_t offset) = 0; + virtual uint64_t get_offset() const = 0; + virtual uint64_t current_seed() const = 0; + virtual uint64_t seed() = 0; + virtual void set_state(const c10::TensorImpl& new_state) = 0; + virtual c10::intrusive_ptr get_state() const = 0; + virtual void graphsafe_set_state( + const c10::intrusive_ptr& new_state); + virtual c10::intrusive_ptr graphsafe_get_state() const; + Device device() const; + + // See Note [Acquire lock when using 
random generators] + std::mutex mutex_; + + DispatchKeySet key_set() const { + return key_set_; + } + + inline void set_pyobj(PyObject* pyobj) noexcept { + pyobj_ = pyobj; + } + + inline PyObject* pyobj() const noexcept { + return pyobj_; + } + + protected: + Device device_; + DispatchKeySet key_set_; + PyObject* pyobj_ = nullptr; + + virtual GeneratorImpl* clone_impl() const = 0; +}; + +namespace detail { + +C10_API uint64_t getNonDeterministicRandom(bool is_cuda = false); + +} // namespace detail + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GradMode.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GradMode.h new file mode 100644 index 0000000000000000000000000000000000000000..391b293f9f005af1035dbf9e43be91bf5b353bed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GradMode.h @@ -0,0 +1,57 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { + +struct C10_API GradMode { + static bool is_enabled(); + static void set_enabled(bool enabled); +}; + +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct C10_API AutoGradMode { + AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) { + GradMode::set_enabled(enabled); + } + AutoGradMode(const AutoGradMode&) = delete; + AutoGradMode(AutoGradMode&&) = delete; + AutoGradMode& operator=(const AutoGradMode&) = delete; + AutoGradMode& operator=(AutoGradMode&&) = delete; + ~AutoGradMode() { + GradMode::set_enabled(prev_mode); + } + bool prev_mode; +}; + +// A RAII, thread local (!) 
guard that stops future operations from building +// gradients. +struct C10_API NoGradGuard : public AutoGradMode { + NoGradGuard() : AutoGradMode(/*enabled=*/false) {} +}; + +// A RAII, thread local (!) guard that enables or disables forward grad mode +// upon construction, and sets it back to the original value upon destruction. +struct C10_API AutoFwGradMode { + AutoFwGradMode(bool enabled) + : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) { + AutogradState::get_tls_state().set_fw_grad_mode(enabled); + } + AutoFwGradMode(const AutoFwGradMode&) = delete; + AutoFwGradMode(AutoFwGradMode&&) = delete; + AutoFwGradMode& operator=(const AutoFwGradMode&) = delete; + AutoFwGradMode& operator=(AutoFwGradMode&&) = delete; + ~AutoFwGradMode() { + AutogradState::get_tls_state().set_fw_grad_mode(prev_mode); + } + bool prev_mode; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/InferenceMode.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/InferenceMode.h new file mode 100644 index 0000000000000000000000000000000000000000..8da25b5427e61d250268a352f11757a4e1d7ab24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/InferenceMode.h @@ -0,0 +1,96 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +// A RAII, thread local (!) guard that enables or disables inference mode upon +// construction, and sets it back to the original value upon destruction. 
+struct C10_API InferenceMode { + // Note [Expected TLS state in InferenceMode]: + // InferenceMode: ADInplaceOrView not in + // raw_local_dispatch_key_set.included(), + // Autograd in raw_local_dispatch_key_set.excluded() + // GradMode is disabled. + // NormalMode: ADInplaceOrView in raw_local_dispatch_key_set.included(), + // Autograd not in raw_local_dispatch_key_set.excluded() + // GradMode is enabled by default unless toggled manually + // through other APIs, e.g. NoGradGuard. + // + // Invariant: + // - ADInplaceOrView is never in the excluded set + // - Autograd is never in the included set + // - Setting InferenceMode will set GradMode accordingly, but not vice versa. + // + // 1. Why do we put ADInplaceOrView in included set outside InferenceMode? + // + // Inplace update to inference tensor outside InferenceMode is not + // allowed. See Note [Inplace update inference tensor] for more details. + // Without going through ADInplaceOrView kernel, we cannot throw error + // for `inference_tensor.add_(1)` case. + // + // 2. Why not put ADInplaceOrView in the excluded set inside InferenceMode? + // + // For example: + // torch::Tensor a = torch::ones({1, 2, 3}).set_requires_grad(true); + // torch::Tensor k = a + 2; + // { + // c10::InferenceMode guard(true); + // k.add_(2); + // } + // `k.add_(2)` still need to go through ADInplaceOrView kernel so that it's + // prepared for future autograd. + // + // 3. Why does setting InferenceMode also set GradMode? + // + // This is required since InferenceMode is a faster and more restrictive + // version of NoGradGuard. All runtime checks using GradMode::is_enabled() + // are applicable to InferenceMode as well, e.g. + // `tensorTypeInCurrentExecutionContext` in interpreter.cpp. 
+ InferenceMode(bool enabled = true) + : prev_mode(AutogradState::get_tls_state()), + prev_keyset(c10::impl::tls_local_dispatch_key_set()) { + // Enabling inference mode means disabling grad modes + // And disabling inference mode means enabling grad modes + AutogradState::set_tls_state(AutogradState( + /* grad_mode */ !enabled, + /* inference_mode */ enabled, + /* fw_grad_mode */ !enabled, + /* multithreading_enabled*/ !enabled)); + DispatchKeySet included = enabled + ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView) + : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView); + DispatchKeySet excluded = enabled + ? (prev_keyset.excluded_ | c10::autograd_dispatch_keyset) + : (prev_keyset.excluded_ - c10::autograd_dispatch_keyset); + c10::impl::PODLocalDispatchKeySet cur_keyset{}; + cur_keyset.set_included(included); + cur_keyset.set_excluded(excluded); + c10::impl::_force_tls_local_dispatch_key_set(cur_keyset); + } + + InferenceMode(const InferenceMode&) = delete; + InferenceMode(InferenceMode&&) = delete; + InferenceMode& operator=(const InferenceMode&) = delete; + InferenceMode& operator=(InferenceMode&&) = delete; + + ~InferenceMode() { + AutogradState::set_tls_state(prev_mode); + c10::impl::_force_tls_local_dispatch_key_set(prev_keyset); + } + static bool is_enabled(); + + private: + AutogradState prev_mode; + c10::impl::LocalDispatchKeySet prev_keyset; +}; +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Layout.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Layout.h new file mode 100644 index 0000000000000000000000000000000000000000..194e1863cb18cf2759f2c4e3e1ace298efd76150 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Layout.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include + +namespace c10 { + +inline Layout layout_from_backend(Backend backend) { + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + switch (backend) { + case Backend::SparseCPU: + case Backend::SparseCUDA: + case Backend::SparseMPS: + case Backend::SparseHIP: + case Backend::SparseVE: + case Backend::SparseXPU: + case Backend::SparsePrivateUse1: + return Layout::Sparse; + case Backend::MkldnnCPU: + return Layout::Mkldnn; + case Backend::SparseCsrCPU: + case Backend::SparseCsrCUDA: + case Backend::SparseCsrMPS: + case Backend::SparseCsrHIP: + case Backend::SparseCsrVE: + case Backend::SparseCsrXPU: + TORCH_CHECK( + false, + "Cannot map Backend SparseCsr(CPU|CUDA|HIP|VE|XPU|MPS) to a unique layout."); + default: + return Layout::Strided; + } + C10_DIAGNOSTIC_POP() +} + +inline std::ostream& operator<<(std::ostream& stream, at::Layout layout) { + switch (layout) { + case at::kStrided: + return stream << "Strided"; + case at::kSparse: + return stream << "Sparse"; + case at::kSparseCsr: + return stream << "SparseCsr"; + case at::kSparseCsc: + return stream << "SparseCsc"; + case at::kSparseBsr: + return stream << "SparseBsr"; + case at::kSparseBsc: + return stream << "SparseBsc"; + case at::kMkldnn: + return stream << "Mkldnn"; + case at::kJagged: + return stream << "Jagged"; + case Layout::NumOptions: + default: + TORCH_CHECK(false, "Unknown layout"); 
+ } +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/MemoryFormat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/MemoryFormat.h new file mode 100644 index 0000000000000000000000000000000000000000..63cdb757952b073d957fc91c33357136c1287679 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/MemoryFormat.h @@ -0,0 +1,268 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include + +#include +#include + +namespace c10 { + +// If you are seeing this, it means that this call site was not checked if +// the memory format could be preserved, and it was switched to old default +// behaviour of contiguous +#define LEGACY_CONTIGUOUS_MEMORY_FORMAT c10::get_contiguous_memory_format() + +inline std::ostream& operator<<( + std::ostream& stream, + at::MemoryFormat memory_format) { + switch (memory_format) { + case MemoryFormat::Preserve: + return stream << "Preserve"; + case MemoryFormat::Contiguous: + return stream << "Contiguous"; + case MemoryFormat::ChannelsLast: + return stream << "ChannelsLast"; + case MemoryFormat::ChannelsLast3d: + return stream << "ChannelsLast3d"; + case MemoryFormat::NumOptions: + default: + TORCH_CHECK(false, "Unknown memory format ", memory_format); + } +} + +// Note: Hardcoded the channel last stride indices here to get better +// performance +template +inline std::vector get_channels_last_strides_2d(ArrayRef sizes) { + std::vector strides(sizes.size()); + switch (sizes.size()) { + case 4: + strides[1] = 1; + strides[3] = sizes[1]; + strides[2] = strides[3] * sizes[3]; + strides[0] = strides[2] * sizes[2]; + return strides; + case 3: + strides[0] = 
1; + strides[2] = sizes[0]; + strides[1] = strides[2] * sizes[2]; + return strides; + default: + TORCH_INTERNAL_ASSERT( + false, "ChannelsLast2d doesn't support size ", sizes.size()); + } +} + +inline std::vector get_channels_last_strides_2d(IntArrayRef sizes) { + return get_channels_last_strides_2d(sizes); +} + +template +std::vector get_channels_last_strides_3d(ArrayRef sizes) { + std::vector strides(sizes.size()); + switch (sizes.size()) { + case 5: + strides[1] = 1; + strides[4] = sizes[1]; + strides[3] = strides[4] * sizes[4]; + strides[2] = strides[3] * sizes[3]; + strides[0] = strides[2] * sizes[2]; + return strides; + case 4: + strides[0] = 1; + strides[3] = sizes[0]; + strides[2] = strides[3] * sizes[3]; + strides[1] = strides[2] * sizes[2]; + return strides; + default: + TORCH_INTERNAL_ASSERT( + false, "ChannelsLast3d doesn't support size ", sizes.size()); + } +} + +inline std::vector get_channels_last_strides_3d(IntArrayRef sizes) { + return get_channels_last_strides_3d(sizes); +} + +// NOTE: +// Below are Helper functions for is_channels_last_strides_xd. +// 1. Please do not combine these helper functions, each helper function handles +// exactly one case of sizes + memory_format, by doing this, the strides indices +// will be a constant array and we can access it using constant index number, +// the compiler will fully unroll the loop on strides indices to gain a better +// performance. +// 2. No error check in helper function, caller ensures the correctness of the +// input +// 3. All helper functions have similar comments, only 1st helper function is +// commented here. +template +inline bool is_channels_last_strides_2d_s4( + const ArrayRef sizes, + const ArrayRef strides) { + T min = 0; + // special case for trivial C dimension. 
default to NCHW + if (strides[1] == 0) { + return false; + } + // loop strides indices + for (auto& d : {1, 3, 2, 0}) { + if (sizes[d] == 0) { + return false; + } + if (strides[d] < min) { + return false; + } + // Fallback to NCHW as default layout for ambiguous cases + // This is the flaw of implicit memory_format from strides. + // N111 tensor with identical strides for size 1 dimension; + // Two cases could lead us here: + // a. N111 contiguous Tensor ([N,1,1,1]@[1,1,1,1]) + // b. N11W contiguous Tensor sliced on the W-dimension. + // ([N,1,1,1]@[W,W,W,W]) + if (d == 0 && min == strides[1]) { + return false; + } + // This is necessary to: + // 1. distinguish the memory_format of N1H1; + // [H, 1, 1, 1] channels_last stride + // [H, H, 1, 1] contiguous stride + // 2. permutation of 1C1W: + // [1, C, 1, H]@[HC, H, H, 1] transpose(1, 3) + // [1, H, 1, C]@[HC, 1, H, H] shouldn't be identified as channels_last + min = strides[d]; + if (sizes[d] > 1) { + min *= sizes[d]; + } + } + return true; +} + +template +inline bool is_channels_last_strides_3d_s5( + const ArrayRef sizes, + const ArrayRef strides) { + T min = 0; + if (strides[1] == 0) { + return false; + } + for (auto& d : {1, 4, 3, 2, 0}) { + if (sizes[d] == 0) { + return false; + } + if (strides[d] < min) { + return false; + } + if (d == 0 && min == strides[1]) { + return false; + } + min = strides[d]; + if (sizes[d] > 1) { + min *= sizes[d]; + } + } + return true; +} + +// Note [Ambiguous is_channels_last_strides_xd] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// The flaw of carrying memory_format implicitly through strides is very hard +// to WAR properly. issue #24090 +// Without the history of permutation, we can't infer the memory_format of a +// tensor from the snapshot of its size & stride +// e.g. +// +// 1. We can NOT specify the memory_format of N111 tensor through strides in a +// meaningful way; +// +// 2. 
Two path that ended up with identical size/stride +// N11W contiguous tensor sliced at w-dimension becomes [N,1,1,1]@[W,W,W,W] +// NC11 channels_last tensor sliced at c-dimension becomes [N,1,1,1]@[C,C,C,C] +// So if we see a tensor [N,1,1,1]@[X,X,X,X], there's no way for us to infer +// the memory_format of the original tensor. +// +// Due to the limitations, our temporary WAR `is_channels_last_strides` does the +// best effort to infer whether the original memory_format of a tensor is +// at::MemoryFormat::ChannelsLast. The two objectives of this function (ordered +// by their importance): +// 1. Ensure that normal shape manipulation does not accidentally change the +// MemoryFormat of an existing tensor. +// 2. Allows user to mark MemoryFormat::ChannelsLast to tensors; +// +// The function does so via checking strides of the tensor, including strides of +// size-1 dimensions. Although conventionally PyTorch implies no restriction on +// trivial stride (stride for size-1 dimension). +// +// Note that this approach is a compromise. We did not solve the problem +// completely. Many cases we will not be able to infer the correct memory +// format. +// The implementation of `is_channels_last_strides` is to serve the objectives: +// MemoryFormat::ChannelsLast has to be explicitly opted-in (no accidental +// conversion); Best effort to maintain the ChannelsLast flag. +// +// Due to the fact that this is not a bulletproof solution, through testing +// (aten/src/ATen/test/memory_format_test.cpp) +// a. we ensure that the common tasks are supported; +// a. we identify corner cases where the implementation compromises on. +// +// By the time accumulated permutation is enabled to replace implicit +// memory_format through strides, we should be updating our tests and fix the +// issues in our tests. +// +// We use Channels Last 2d as an example above. +// This is a general problem for all the is_channels_last_strides_xd +// implementation. 
Please check the helper functions +// (is_channels_last_strides_*d_s*) for more details. + +template +inline bool is_channels_last_strides_2d( + const ArrayRef sizes, + const ArrayRef strides) { + switch (sizes.size()) { + case 4: + return is_channels_last_strides_2d_s4(sizes, strides); + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +template +inline bool is_channels_last_strides_3d( + const ArrayRef sizes, + const ArrayRef strides) { + switch (sizes.size()) { + case 5: + return is_channels_last_strides_3d_s5(sizes, strides); + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +inline bool is_channels_last_strides_2d( + const IntArrayRef sizes, + const IntArrayRef strides) { + return is_channels_last_strides_2d(sizes, strides); +} + +inline bool is_channels_last_strides_3d( + const IntArrayRef sizes, + const IntArrayRef strides) { + return is_channels_last_strides_3d(sizes, strides); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/OptionalRef.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/OptionalRef.h new file mode 100644 index 0000000000000000000000000000000000000000..f1199e1945a65866cfd17c5301e20454721dc117 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/OptionalRef.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace c10 { + +template +class OptionalRef { + public: + OptionalRef() : data_(nullptr) {} + OptionalRef(const T* data) : data_(data) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_); + } + OptionalRef(const T& data) : data_(&data) {} + + bool has_value() const { + return data_ != nullptr; + } + + const T& get() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_); + return *data_; + } + + operator bool() const { + return has_value(); + } + + private: + const T* data_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/PyHandleCache.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/PyHandleCache.h new file mode 100644 index 0000000000000000000000000000000000000000..1c39510078bc70aa95e205176fd8bebeeb332065 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/PyHandleCache.h @@ -0,0 +1,81 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include + +namespace c10 { + +// A PyHandleCache represents a cached pointer from a C++ object to +// a Python object that represents that object analogously in Python. +// Upon a cache hit, the relevant object can be retrieved after a test +// and then a memory load. Two conditions must hold to be able to use this +// class: +// +// - This must truly be a cache; e.g., the caller must be able to produce +// the object some other way if the cache hit misses. +// +// - This must truly be a handle; e.g., the Python object referenced by +// this class must have static lifetime. This means we don't have to +// maintain strong ownership or deallocate the object when the C++ object +// dies. Static lifetime is a good idea in conjunction with the cache, +// since if you are producing a fresh object on miss you won't be +// maintaining object identity. If you need bidirectional ownership, +// you will want to factor out the pattern in TensorImpl with +// resurrection. +// +// This cache is expected to not improve perf under torchdeploy, as one +// interpreter will fill up the cache, and all the interpreters will be +// unable to use the slot. A potential improvement is to have multiple +// slots (one per interpreter), which will work in deployment scenarios +// where there a stable, fixed number of interpreters. 
You can also store +// the relevant state in the Python library, rather than in the non-Python +// library (although in many cases, this is not convenient, as there may +// not be a way to conveniently index based on the object.) +class PyHandleCache { + public: + PyHandleCache() : pyinterpreter_(nullptr) {} + + // Attempt to fetch the pointer from the cache, if the PyInterpreter + // matches. If it doesn't exist, or the cache entry is not valid, + // use slow_accessor to get the real pointer value and return that + // (possibly writing it to the cache, if the cache entry is + // available.) + template + PyObject* ptr_or(impl::PyInterpreter* self_interpreter, F slow_accessor) + const { + // Note [Memory ordering on Python interpreter tag] + impl::PyInterpreter* interpreter = + pyinterpreter_.load(std::memory_order_acquire); + if (C10_LIKELY(interpreter == self_interpreter)) { + return data_; + } else if (interpreter == nullptr) { + auto* r = slow_accessor(); + impl::PyInterpreter* expected = nullptr; + // attempt to claim this cache entry with the specified interpreter tag + if (pyinterpreter_.compare_exchange_strong( + expected, self_interpreter, std::memory_order_acq_rel)) { + data_ = r; + } + // This shouldn't be possible, as you should be GIL protected + TORCH_INTERNAL_ASSERT(expected != self_interpreter); + return r; + } else { + return slow_accessor(); + } + } + + private: + mutable std::atomic pyinterpreter_; + mutable PyObject* data_{nullptr}; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QEngine.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QEngine.h new file mode 100644 index 0000000000000000000000000000000000000000..b0bb6a245643a3e093c02ae80756403b931245ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QEngine.h @@ -0,0 +1,51 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +/** + * QEngine is an enum that is used to select the engine to run quantized ops. + * Keep this enum in sync with get_qengine_id() in + * torch/backends/quantized/__init__.py + */ +enum class QEngine : uint8_t { + NoQEngine = 0, + FBGEMM = 1, + QNNPACK = 2, + ONEDNN = 3, + X86 = 4, +}; + +constexpr auto kNoQEngine = QEngine::NoQEngine; +constexpr auto kFBGEMM = QEngine::FBGEMM; +constexpr auto kQNNPACK = QEngine::QNNPACK; +constexpr auto kONEDNN = QEngine::ONEDNN; +constexpr auto kX86 = QEngine::X86; + +inline std::string toString(QEngine qengine) { + switch (qengine) { + case kNoQEngine: + return "NoQEngine"; + case kFBGEMM: + return "FBGEMM"; + case kQNNPACK: + return "QNNPACK"; + case kONEDNN: + return "ONEDNN"; + case kX86: + return "X86"; + default: + TORCH_CHECK( + false, "Unrecognized Quantized Engine: ", static_cast(qengine)); + } +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QScheme.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QScheme.h new file mode 100644 index 0000000000000000000000000000000000000000..f557affb1de8ff54fc961159d3cc67e2f11ef3b7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QScheme.h @@ -0,0 +1,60 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + +namespace c10 { + +/** + * QScheme is an enum that specifies the type of quantization. This has a one + * to one correspondence with Quantizer + * Please refer to ATen/quantized/Quantizer.h to see the Quantizers classes. + * Keep this file in sync with torch/nn/_qscheme.py + */ +enum class QScheme : uint8_t { + PER_TENSOR_AFFINE = 0, + PER_CHANNEL_AFFINE = 1, + PER_TENSOR_SYMMETRIC = 2, + PER_CHANNEL_SYMMETRIC = 3, + PER_CHANNEL_AFFINE_FLOAT_QPARAMS = 4, + COMPILE_TIME_NUM_QSCHEMES = 5, +}; + +constexpr auto kPerTensorAffine = QScheme::PER_TENSOR_AFFINE; +constexpr auto kPerChannelAffine = QScheme::PER_CHANNEL_AFFINE; +constexpr auto kPerTensorSymmetric = QScheme::PER_TENSOR_SYMMETRIC; +constexpr auto kPerChannelSymmetric = QScheme::PER_CHANNEL_SYMMETRIC; +constexpr auto kPerChannelAffineFloatQParams = + QScheme::PER_CHANNEL_AFFINE_FLOAT_QPARAMS; +constexpr int COMPILE_TIME_NUM_QSCHEMES = + static_cast(QScheme::COMPILE_TIME_NUM_QSCHEMES); + +inline std::string toString(QScheme qscheme) { + switch (qscheme) { + case kPerTensorAffine: + return "per_tensor_affine"; + case kPerChannelAffine: + return "per_channel_affine"; + case kPerTensorSymmetric: + return "per_tensor_symmetric"; + case kPerChannelSymmetric: + return "per_channel_symmetric"; + case kPerChannelAffineFloatQParams: + 
return "per_channel_affine_float_qparams"; + default: + TORCH_CHECK(false, "Unrecognized qscheme: ", static_cast(qscheme)); + } +} + +} // namespace c10 + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/RefcountedDeleter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/RefcountedDeleter.h new file mode 100644 index 0000000000000000000000000000000000000000..8b1e9ca7071a032e6a383dc539b8010af535471b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/RefcountedDeleter.h @@ -0,0 +1,57 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include + +namespace c10 { + +// A RefcountedDeleterContext object is used as the `ctx` argument for DataPtr +// to implement a shared DataPtr. Normally, a DataPtr is unique, but we use +// this custom context and the `refcounted_deleter` function below to make the +// DataPtr act like a non-unique DataPtr. This context object holds onto an +// inner context and deleter function which handle the actual deletion of the +// data when the refcount reaches 0. +// +// This shared DataPtr feature is only used when storages are shared between +// multiple Python interpreters in MultiPy. // codespell:ignore multipy +// Before storages had PyObject preservation, interpreters could just share the +// same StorageImpl instance. But now a StorageImpl can only be associated with +// one interpreter in order to properly manage a zombie PyObject. So we share +// storages across Python interpreters by creating a different StorageImpl +// instance for each one, but they all point to the same data. 
+struct C10_API RefcountedDeleterContext { + RefcountedDeleterContext(void* other_ctx, c10::DeleterFnPtr other_deleter) + : other_ctx(other_ctx, other_deleter), refcount(1) {} + + std::unique_ptr other_ctx; + std::atomic_int refcount; +}; + +// `refcounted_deleter` is used as the `ctx_deleter` for DataPtr to implement +// a shared DataPtr. +// +// Warning: This should only be called on a pointer to +// a RefcountedDeleterContext that was allocated on the heap with `new`, +// because when the refcount reaches 0, the context is deleted with `delete` +C10_API void refcounted_deleter(void* ctx_); + +// If the storage's DataPtr does not use `refcounted_deleter`, replace it with +// a DataPtr that does, so it can be shared between multiple StorageImpls +C10_API void maybeApplyRefcountedDeleter(const c10::Storage& storage); + +// Create a new StorageImpl that points to the same data. If the original +// StorageImpl's DataPtr does not use `refcounted_deleter`, it will be replaced +// with one that does +C10_API c10::Storage newStorageImplFromRefcountedDataPtr( + const c10::Storage& storage); + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SafePyObject.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SafePyObject.h new file mode 100644 index 0000000000000000000000000000000000000000..bf8eee0e004b5e49c39d9718736df1099769ef24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SafePyObject.h @@ -0,0 +1,125 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace c10 { + +// This is an safe owning holder for a PyObject, akin to pybind11's +// py::object, with two major differences: +// +// - It is in c10/core; i.e., you can use this type in contexts where +// you do not have a libpython dependency +// +// - It is multi-interpreter safe (ala torchdeploy); when you fetch +// the underlying PyObject* you are required to specify what the current +// interpreter context is and we will check that you match it. +// +// It is INVALID to store a reference to a Tensor object in this way; +// you should just use TensorImpl directly in that case! +struct C10_API SafePyObject { + // Steals a reference to data + SafePyObject(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : data_(data), pyinterpreter_(pyinterpreter) {} + SafePyObject(SafePyObject&& other) noexcept + : data_(std::exchange(other.data_, nullptr)), + pyinterpreter_(other.pyinterpreter_) {} + // For now it's not used, so we just disallow it. 
+ SafePyObject& operator=(SafePyObject&&) = delete; + + SafePyObject(SafePyObject const& other) + : data_(other.data_), pyinterpreter_(other.pyinterpreter_) { + if (data_ != nullptr) { + (*pyinterpreter_)->incref(data_); + } + } + + SafePyObject& operator=(SafePyObject const& other) { + if (this == &other) { + return *this; // Handle self-assignment + } + if (other.data_ != nullptr) { + (*other.pyinterpreter_)->incref(other.data_); + } + if (data_ != nullptr) { + (*pyinterpreter_)->decref(data_); + } + data_ = other.data_; + pyinterpreter_ = other.pyinterpreter_; + return *this; + } + + ~SafePyObject() { + if (data_ != nullptr) { + (*pyinterpreter_)->decref(data_); + } + } + + c10::impl::PyInterpreter& pyinterpreter() const { + return *pyinterpreter_; + } + PyObject* ptr(const c10::impl::PyInterpreter* /*interpreter*/) const; + + // stop tracking the current object, and return it + PyObject* release() { + auto rv = data_; + data_ = nullptr; + return rv; + } + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + +// A newtype wrapper around SafePyObject for type safety when a python object +// represents a specific type. Note that `T` is only used as a tag and isn't +// actually used for any true purpose. +template +struct SafePyObjectT : private SafePyObject { + SafePyObjectT(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : SafePyObject(data, pyinterpreter) {} + ~SafePyObjectT() = default; + SafePyObjectT(SafePyObjectT&& other) noexcept : SafePyObject(other) {} + SafePyObjectT(SafePyObjectT const&) = delete; + SafePyObjectT& operator=(SafePyObjectT const&) = delete; + SafePyObjectT& operator=(SafePyObjectT&&) = delete; + + using SafePyObject::ptr; + using SafePyObject::pyinterpreter; + using SafePyObject::release; +}; + +// Like SafePyObject, but non-owning. Good for references to global PyObjects +// that will be leaked on interpreter exit. You get a copy constructor/assign +// this way. 
+struct C10_API SafePyHandle { + SafePyHandle() : data_(nullptr), pyinterpreter_(nullptr) {} + SafePyHandle(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : data_(data), pyinterpreter_(pyinterpreter) {} + + c10::impl::PyInterpreter& pyinterpreter() const { + return *pyinterpreter_; + } + PyObject* ptr(const c10::impl::PyInterpreter* /*interpreter*/) const; + void reset() { + data_ = nullptr; + pyinterpreter_ = nullptr; + } + operator bool() { + return data_; + } + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Scalar.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Scalar.h new file mode 100644 index 0000000000000000000000000000000000000000..863a993ed08a614ca4526fee426ebd46f5633be0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Scalar.h @@ -0,0 +1,471 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/** + * Scalar represents a 0-dimensional tensor which contains a single element. + * Unlike a tensor, numeric literals (in C++) are implicitly convertible to + * Scalar (which is why, for example, we provide both add(Tensor) and + * add(Scalar) overloads for many operations). It may also be used in + * circumstances where you statically know a tensor is 0-dim and single size, + * but don't know its type. 
+ */ +class C10_API Scalar { + public: + Scalar() : Scalar(int64_t(0)) {} + + void destroy() { + if (Tag::HAS_si == tag || Tag::HAS_sd == tag || Tag::HAS_sb == tag) { + raw::intrusive_ptr::decref(v.p); + v.p = nullptr; + } + } + + ~Scalar() { + destroy(); + } + +#define DEFINE_IMPLICIT_CTOR(type, name) \ + Scalar(type vv) : Scalar(vv, true) {} + + AT_FORALL_SCALAR_TYPES_AND3(Half, BFloat16, ComplexHalf, DEFINE_IMPLICIT_CTOR) + AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR) + AT_FORALL_FLOAT8_TYPES(DEFINE_IMPLICIT_CTOR) + + // Helper constructors to allow Scalar creation from long and long long types + // As std::is_same_v is false(except Android), one needs to + // provide a constructor from either long or long long in addition to one from + // int64_t +#if defined(__APPLE__) || defined(__MACOSX) + static_assert( + std::is_same_v, + "int64_t is the same as long long on MacOS"); + Scalar(long vv) : Scalar(vv, true) {} +#endif +#if defined(_MSC_VER) + static_assert( + std::is_same_v, + "int64_t is the same as long long on Windows"); + Scalar(long vv) : Scalar(vv, true) {} +#endif +#if defined(__linux__) && !defined(__ANDROID__) + static_assert( + sizeof(void*) != 8 || std::is_same_v, + "int64_t is the same as long on 64 bit Linux"); +#if LONG_MAX != INT_MAX + Scalar(long long vv) : Scalar(vv, true) {} +#endif /* not 32-bit system */ +#endif + + Scalar(uint16_t vv) : Scalar(vv, true) {} + Scalar(uint32_t vv) : Scalar(vv, true) {} + Scalar(uint64_t vv) { + if (vv > static_cast(INT64_MAX)) { + tag = Tag::HAS_u; + v.u = vv; + } else { + tag = Tag::HAS_i; + // NB: no need to use convert, we've already tested convertibility + v.i = static_cast(vv); + } + } + +#undef DEFINE_IMPLICIT_CTOR + + // Value* is both implicitly convertible to SymbolicVariable and bool which + // causes ambiguity error. Specialized constructor for bool resolves this + // problem. 
+ template < + typename T, + typename std::enable_if_t, bool>* = nullptr> + Scalar(T vv) : tag(Tag::HAS_b) { + v.i = convert(vv); + } + + template < + typename T, + typename std::enable_if_t, bool>* = + nullptr> + Scalar(T vv) : tag(Tag::HAS_sb) { + v.i = convert(vv); + } + +#define DEFINE_ACCESSOR(type, name) \ + type to##name() const { \ + if (Tag::HAS_d == tag) { \ + return checked_convert(v.d, #type); \ + } else if (Tag::HAS_z == tag) { \ + return checked_convert>(v.z, #type); \ + } else if (Tag::HAS_sd == tag) { \ + return checked_convert( \ + toSymFloat().guard_float(__FILE__, __LINE__), #type); \ + } \ + if (Tag::HAS_b == tag) { \ + return checked_convert(v.i, #type); \ + } else if (Tag::HAS_i == tag) { \ + return checked_convert(v.i, #type); \ + } else if (Tag::HAS_u == tag) { \ + return checked_convert(v.u, #type); \ + } else if (Tag::HAS_si == tag) { \ + return checked_convert( \ + toSymInt().guard_int(__FILE__, __LINE__), #type); \ + } else if (Tag::HAS_sb == tag) { \ + return checked_convert( \ + toSymBool().guard_bool(__FILE__, __LINE__), #type); \ + } \ + TORCH_CHECK(false) \ + } + + // TODO: Support ComplexHalf accessor + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ACCESSOR) + DEFINE_ACCESSOR(uint16_t, UInt16) + DEFINE_ACCESSOR(uint32_t, UInt32) + DEFINE_ACCESSOR(uint64_t, UInt64) + +#undef DEFINE_ACCESSOR + + SymInt toSymInt() const { + if (Tag::HAS_si == tag) { + return c10::SymInt(intrusive_ptr::reclaim_copy( + static_cast(v.p))); + } else { + return toLong(); + } + } + + SymFloat toSymFloat() const { + if (Tag::HAS_sd == tag) { + return c10::SymFloat(intrusive_ptr::reclaim_copy( + static_cast(v.p))); + } else { + return toDouble(); + } + } + + SymBool toSymBool() const { + if (Tag::HAS_sb == tag) { + return c10::SymBool(intrusive_ptr::reclaim_copy( + static_cast(v.p))); + } else { + return toBool(); + } + } + + // also support scalar.to(); + // Deleted for unsupported types, but specialized below for supported types + template + T to() const = 
delete; + + // audit uses of data_ptr + const void* data_ptr() const { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return static_cast(&v); + } + + bool isFloatingPoint() const { + return Tag::HAS_d == tag || Tag::HAS_sd == tag; + } + + [[deprecated( + "isIntegral is deprecated. Please use the overload with 'includeBool' parameter instead.")]] bool + isIntegral() const { + return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag; + } + + bool isIntegral(bool includeBool) const { + return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag || + (includeBool && isBoolean()); + } + + // See Note [Meaning of HAS_u] + bool isUnsigned() const { + return Tag::HAS_u == tag || (Tag::HAS_i == tag && v.i >= 0); + } + + bool isComplex() const { + return Tag::HAS_z == tag; + } + bool isBoolean() const { + return Tag::HAS_b == tag || Tag::HAS_sb == tag; + } + + // you probably don't actually want these; they're mostly for testing + bool isSymInt() const { + return Tag::HAS_si == tag; + } + bool isSymFloat() const { + return Tag::HAS_sd == tag; + } + bool isSymBool() const { + return Tag::HAS_sb == tag; + } + + bool isSymbolic() const { + return Tag::HAS_si == tag || Tag::HAS_sd == tag || Tag::HAS_sb == tag; + } + + C10_ALWAYS_INLINE Scalar& operator=(Scalar&& other) noexcept { + if (&other == this) { + return *this; + } + + destroy(); + moveFrom(std::move(other)); + return *this; + } + + C10_ALWAYS_INLINE Scalar& operator=(const Scalar& other) { + if (&other == this) { + return *this; + } + + *this = Scalar(other); + return *this; + } + + Scalar operator-() const; + Scalar conj() const; + Scalar log() const; + + template < + typename T, + typename std::enable_if_t::value, int> = 0> + bool equal(T num) const { + if (isComplex()) { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + auto val = v.z; + return (val.real() == num) && (val.imag() == T()); + } else if (isFloatingPoint()) { + return toDouble() == num; + } else if (tag == Tag::HAS_i) { + if (overflows(v.i, /* 
strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.i) == num; + } + } else if (tag == Tag::HAS_u) { + if (overflows(v.u, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.u) == num; + } + } else if (tag == Tag::HAS_si) { + TORCH_INTERNAL_ASSERT(false, "NYI SymInt equality"); + } else if (isBoolean()) { + // boolean scalar does not equal to a non boolean value + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return false; + } else { + TORCH_INTERNAL_ASSERT(false); + } + } + + template < + typename T, + typename std::enable_if_t::value, int> = 0> + bool equal(T num) const { + if (isComplex()) { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return v.z == num; + } else if (isFloatingPoint()) { + return (toDouble() == num.real()) && (num.imag() == T()); + } else if (tag == Tag::HAS_i) { + if (overflows(v.i, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.i) == num.real() && num.imag() == T(); + } + } else if (tag == Tag::HAS_u) { + if (overflows(v.u, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.u) == num.real() && num.imag() == T(); + } + } else if (tag == Tag::HAS_si) { + TORCH_INTERNAL_ASSERT(false, "NYI SymInt equality"); + } else if (isBoolean()) { + // boolean scalar does not equal to a non boolean value + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return false; + } else { + TORCH_INTERNAL_ASSERT(false); + } + } + + bool equal(bool num) const { + if (isBoolean()) { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return static_cast(v.i) == num; + } else { + return false; + } + } + + ScalarType type() const { + if (isComplex()) { + return ScalarType::ComplexDouble; + } else if (isFloatingPoint()) { + return ScalarType::Double; + } else if (isIntegral(/*includeBool=*/false)) { + // Represent all integers as long, UNLESS it is unsigned and therefore + // unrepresentable as long + if (Tag::HAS_u == tag) { + return ScalarType::UInt64; + } + return ScalarType::Long; 
+ } else if (isBoolean()) { + return ScalarType::Bool; + } else { + TORCH_CHECK(false, "Unknown scalar type."); + } + } + + Scalar(Scalar&& rhs) noexcept : tag(rhs.tag) { + moveFrom(std::move(rhs)); + } + + Scalar(const Scalar& rhs) : tag(rhs.tag), v(rhs.v) { + if (isSymbolic()) { + c10::raw::intrusive_ptr::incref(v.p); + } + } + + Scalar(c10::SymInt si) { + if (auto m = si.maybe_as_int()) { + tag = Tag::HAS_i; + v.i = *m; + } else { + tag = Tag::HAS_si; + v.p = std::move(si).release(); + } + } + + Scalar(c10::SymFloat sd) { + if (sd.is_symbolic()) { + tag = Tag::HAS_sd; + v.p = std::move(sd).release(); + } else { + tag = Tag::HAS_d; + v.d = sd.as_float_unchecked(); + } + } + + Scalar(c10::SymBool sb) { + if (auto m = sb.maybe_as_bool()) { + tag = Tag::HAS_b; + v.i = *m; + } else { + tag = Tag::HAS_sb; + v.p = std::move(sb).release(); + } + } + + // We can't set v in the initializer list using the + // syntax v{ .member = ... } because it doesn't work on MSVC + private: + enum class Tag { HAS_d, HAS_i, HAS_u, HAS_z, HAS_b, HAS_sd, HAS_si, HAS_sb }; + + // Note [Meaning of HAS_u] + // ~~~~~~~~~~~~~~~~~~~~~~~ + // HAS_u is a bit special. On its face, it just means that we + // are holding an unsigned integer. However, we generally don't + // distinguish between different bit sizes in Scalar (e.g., we represent + // float as double), instead, it represents a mathematical notion + // of some quantity (integral versus floating point). So actually, + // HAS_u is used solely to represent unsigned integers that could + // not be represented as a signed integer. That means only uint64_t + // potentially can get this tag; smaller types like uint8_t fits into a + // regular int and so for BC reasons we keep as an int. 
+ + // NB: assumes that self has already been cleared + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + C10_ALWAYS_INLINE void moveFrom(Scalar&& rhs) noexcept { + v = rhs.v; + tag = rhs.tag; + if (rhs.tag == Tag::HAS_si || rhs.tag == Tag::HAS_sd || + rhs.tag == Tag::HAS_sb) { + // Move out of scalar + rhs.tag = Tag::HAS_i; + rhs.v.i = 0; + } + } + + Tag tag; + + union v_t { + double d{}; + int64_t i; + // See Note [Meaning of HAS_u] + uint64_t u; + c10::complex z; + c10::intrusive_ptr_target* p; + // NOLINTNEXTLINE(modernize-use-equals-default) + v_t() {} // default constructor + } v; + + template < + typename T, + typename std::enable_if_t< + std::is_integral_v && !std::is_same_v, + bool>* = nullptr> + Scalar(T vv, bool /*unused*/) : tag(Tag::HAS_i) { + v.i = convert(vv); + } + + template < + typename T, + typename std::enable_if_t< + !std::is_integral_v && !c10::is_complex::value, + bool>* = nullptr> + Scalar(T vv, bool /*unused*/) : tag(Tag::HAS_d) { + v.d = convert(vv); + } + + template < + typename T, + typename std::enable_if_t::value, bool>* = nullptr> + Scalar(T vv, bool /*unused*/) : tag(Tag::HAS_z) { + v.z = convert(vv); + } +}; + +using OptionalScalarRef = c10::OptionalRef; + +// define the scalar.to() specializations +#define DEFINE_TO(T, name) \ + template <> \ + inline T Scalar::to() const { \ + return to##name(); \ + } +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_TO) +DEFINE_TO(uint16_t, UInt16) +DEFINE_TO(uint32_t, UInt32) +DEFINE_TO(uint64_t, UInt64) +#undef DEFINE_TO + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarType.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarType.h new file mode 100644 index 0000000000000000000000000000000000000000..b678a22630d3d9e625b62149a580b3a0b3bbed9a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarType.h @@ -0,0 +1,285 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + +namespace c10 { + +// See [dtype Macros note] in torch/headeronly/core/ScalarType.h +// regarding macros. 
+ +#define DEFINE_CONSTANT(_, name) \ + constexpr ScalarType k##name = ScalarType::name; + +// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT) +#undef DEFINE_CONSTANT + +inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype, name) \ + case ScalarType::name: \ + return sizeof(ctype); + + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(CASE_ELEMENTSIZE_CASE) + default: + TORCH_CHECK(false, "Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + +inline bool isIntegralType(ScalarType t, bool includeBool) { + bool isIntegral = + (t == ScalarType::Byte || t == ScalarType::Char || t == ScalarType::Int || + t == ScalarType::Long || t == ScalarType::Short || + t == ScalarType::UInt16 || t == ScalarType::UInt32 || + t == ScalarType::UInt64); + + return isIntegral || (includeBool && t == ScalarType::Bool); +} + +[[deprecated( + "isIntegralType is deprecated. Please use the overload with 'includeBool' parameter instead.")]] inline bool +isIntegralType(ScalarType t) { + return isIntegralType(t, /*includeBool=*/false); +} + +inline bool isFloat8Type(ScalarType t) { + return t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz || + t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz || + t == ScalarType::Float8_e8m0fnu; +} + +inline bool isReducedFloatingType(ScalarType t) { + return t == ScalarType::Half || t == ScalarType::BFloat16 || + isFloat8Type(t) || t == ScalarType::Float4_e2m1fn_x2; +} + +inline bool isFloatingType(ScalarType t) { + return t == ScalarType::Double || t == ScalarType::Float || + isReducedFloatingType(t); +} + +inline bool isComplexType(ScalarType t) { + return ( + t == ScalarType::ComplexHalf || t == ScalarType::ComplexFloat || + t == ScalarType::ComplexDouble); +} + +inline bool isBitsType(ScalarType t) { + return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 || + t == ScalarType::Bits4x2 || t == 
ScalarType::Bits8 || + t == ScalarType::Bits16; +} + +inline bool isBarebonesUnsignedType(ScalarType t) { + return t == ScalarType::UInt1 || t == ScalarType::UInt2 || + t == ScalarType::UInt3 || t == ScalarType::UInt4 || + t == ScalarType::UInt5 || t == ScalarType::UInt6 || + t == ScalarType::UInt7 || t == ScalarType::UInt16 || + t == ScalarType::UInt32 || t == ScalarType::UInt64; +} + +inline ScalarType toQIntType(ScalarType t) { + switch (t) { + case ScalarType::Byte: + return ScalarType::QUInt8; + case ScalarType::Char: + return ScalarType::QInt8; + case ScalarType::Int: + return ScalarType::QInt32; + default: + return t; + } +} + +inline bool isSignedType(ScalarType t) { +#define CASE_ISSIGNED(name) \ + case ScalarType::name: \ + return std::numeric_limits< \ + ::c10::impl::ScalarTypeToCPPTypeT>::is_signed; + + // TODO(#146647): If we expect to have numeric_limits for everything, + // let's just have a big macro for the whole thing. + // If we're hardcoding it, let's just use the macro and a "true"/"false" + // below? 
+ switch (t) { + case ScalarType::QInt8: + case ScalarType::QUInt8: + case ScalarType::QInt32: + case ScalarType::QUInt4x2: + case ScalarType::QUInt2x4: + TORCH_CHECK(false, "isSignedType not supported for quantized types"); + case ScalarType::Bits1x8: + case ScalarType::Bits2x4: + case ScalarType::Bits4x2: + case ScalarType::Bits8: + case ScalarType::Bits16: + TORCH_CHECK(false, "Bits types are undefined"); + CASE_ISSIGNED(UInt16); + CASE_ISSIGNED(UInt32); + CASE_ISSIGNED(UInt64); + CASE_ISSIGNED(BFloat16); + CASE_ISSIGNED(Float8_e5m2); + CASE_ISSIGNED(Float8_e5m2fnuz); + CASE_ISSIGNED(Float8_e4m3fn); + CASE_ISSIGNED(Float8_e4m3fnuz); + CASE_ISSIGNED(Float8_e8m0fnu); + CASE_ISSIGNED(Byte); + CASE_ISSIGNED(Char); + CASE_ISSIGNED(Short); + CASE_ISSIGNED(Int); + CASE_ISSIGNED(Long); + CASE_ISSIGNED(Half); + CASE_ISSIGNED(Float); + CASE_ISSIGNED(Double); + CASE_ISSIGNED(ComplexHalf); + CASE_ISSIGNED(ComplexFloat); + CASE_ISSIGNED(ComplexDouble); + CASE_ISSIGNED(Bool); + case ScalarType::Int1: + case ScalarType::Int2: + case ScalarType::Int3: + case ScalarType::Int4: + case ScalarType::Int5: + case ScalarType::Int6: + case ScalarType::Int7: + case ScalarType::Float4_e2m1fn_x2: + return true; + case ScalarType::UInt1: + case ScalarType::UInt2: + case ScalarType::UInt3: + case ScalarType::UInt4: + case ScalarType::UInt5: + case ScalarType::UInt6: + case ScalarType::UInt7: + return false; + case ScalarType::Undefined: + case ScalarType::NumOptions: + break; + // Do not add default here, but rather define behavior of every new entry + // here. `-Wswitch-enum` would raise a warning in those cases. + // TODO: get PyTorch to adopt exhaustive switches by default with a way to + // opt specific switches to being non-exhaustive. 
+ // Exhaustive: + // `-Wswitch-enum`, `-Wswitch-default`, `-Wno-covered-switch-default` + // Non-Exhaustive: + // `-Wno-switch-enum`, `-Wswitch-default`, `-Wcovered-switch-default` + } + TORCH_CHECK(false, "Unknown ScalarType ", t); +#undef CASE_ISSIGNED +} + +inline bool isUnderlying(ScalarType type, ScalarType qtype) { + return type == toUnderlying(qtype); +} + +inline ScalarType toRealValueType(ScalarType t) { + switch (t) { + case ScalarType::ComplexHalf: + return ScalarType::Half; + case ScalarType::ComplexFloat: + return ScalarType::Float; + case ScalarType::ComplexDouble: + return ScalarType::Double; + default: + return t; + } +} + +inline ScalarType toComplexType(ScalarType t) { + switch (t) { + case ScalarType::BFloat16: + // BFloat16 has range equivalent to Float, + // so we map it to ComplexFloat. + return ScalarType::ComplexFloat; + case ScalarType::Half: + return ScalarType::ComplexHalf; + case ScalarType::Float: + return ScalarType::ComplexFloat; + case ScalarType::Double: + return ScalarType::ComplexDouble; + case ScalarType::ComplexHalf: + return ScalarType::ComplexHalf; + case ScalarType::ComplexFloat: + return ScalarType::ComplexFloat; + case ScalarType::ComplexDouble: + return ScalarType::ComplexDouble; + default: + TORCH_CHECK(false, "Unknown Complex ScalarType for ", t); + } +} + +// see tensor_attributes.rst for detailed explanation and examples +// of casting rules. +inline bool canCast(const ScalarType from, const ScalarType to) { + // We disallow complex -> non complex, e.g., float_tensor *= complex is + // disallowed. + if (isComplexType(from) && !isComplexType(to)) { + return false; + } + // We disallow float -> integral, e.g., int_tensor *= float is disallowed. + if (isFloatingType(from) && isIntegralType(to, false)) { + return false; + } + + // Treat bool as a distinct "category," to be consistent with type promotion + // rules (e.g. `bool_tensor + 5 -> int64_tensor`). 
If `5` was in the same + // category as `bool_tensor`, we would not promote. Differing categories + // implies `bool_tensor += 5` is disallowed. + // + // NB: numpy distinguishes "unsigned" as a category to get the desired + // `bool_tensor + 5 -> int64_tensor` behavior. We don't, because: + // * We don't want the performance hit of checking the runtime sign of + // Scalars. + // * `uint8_tensor + 5 -> int64_tensor` would be undesirable. + if (from != ScalarType::Bool && to == ScalarType::Bool) { + return false; + } + return true; +} + +C10_API ScalarType promoteTypes(ScalarType a, ScalarType b); + +// Returns a pair of strings representing the names for each dtype. +// The returned pair is (name, legacy_name_if_applicable) +C10_API std::pair getDtypeNames( + c10::ScalarType scalarType); + +// Returns a map of string name to dtype. +C10_API const std::unordered_map& getStringToDtypeMap(); + +} // namespace c10 + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h new file mode 100644 index 0000000000000000000000000000000000000000..d952b0dd2089207bef2bd3b53d348d6cb667e046 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h @@ -0,0 +1,62 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +// these just expose TypeMeta/ScalarType bridge functions in c10 +// TODO move to typeid.h (or codemod away) when TypeMeta et al +// are moved from caffe2 to c10 (see note at top of typeid.h) + +namespace c10 { + +/** + * convert ScalarType enum values to TypeMeta handles + */ +inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { + return caffe2::TypeMeta::fromScalarType(scalar_type); +} + +/** + * convert TypeMeta handles to ScalarType enum values + */ +inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) { + return dtype.toScalarType(); +} + +/** + * typeMetaToScalarType(), lifted to optional + */ +inline std::optional optTypeMetaToScalarType( + std::optional type_meta) { + if (!type_meta.has_value()) { + return std::nullopt; + } + return type_meta->toScalarType(); +} + +/** + * convenience: equality across TypeMeta/ScalarType conversion + */ +inline bool operator==(ScalarType t, caffe2::TypeMeta m) { + return m.isScalarType(t); +} + +inline bool operator==(caffe2::TypeMeta m, ScalarType t) { + return t == m; +} + +inline bool operator!=(ScalarType t, caffe2::TypeMeta m) { + return !(t == m); +} + +inline bool operator!=(caffe2::TypeMeta m, ScalarType t) { + return !(t == m); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Storage.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Storage.h new file mode 100644 index 0000000000000000000000000000000000000000..203eec24c05e28e413b69dc71fbb0b7be65538a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Storage.h @@ -0,0 +1,293 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +struct Storage; + +C10_API bool isSharedStorageAlias( + const Storage& storage0, + const Storage& storage1); + +struct C10_API Storage { + public: + struct use_byte_size_t {}; + struct unsafe_borrow_t { + explicit unsafe_borrow_t() = default; + }; + + Storage() = default; + Storage(c10::intrusive_ptr ptr) + : storage_impl_(std::move(ptr)) {} + + // Allocates memory buffer using given allocator and creates a storage with it + Storage( + use_byte_size_t /*use_byte_size*/, + const SymInt& size_bytes, + Allocator* allocator = nullptr, + bool resizable = false) + : storage_impl_(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + size_bytes, + allocator, + resizable)) {} + + // Creates storage with pre-allocated memory buffer. Allocator is given for + // potential future reallocations, however it can be nullptr if the storage + // is non-resizable + Storage( + use_byte_size_t /*use_byte_size*/, + size_t size_bytes, + at::DataPtr data_ptr, + at::Allocator* allocator = nullptr, + bool resizable = false) + : storage_impl_(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + size_bytes, + std::move(data_ptr), + allocator, + resizable)) {} + + // Creates storage with pre-allocated memory buffer. 
Allocator is given for + // potential future reallocations, however it can be nullptr if the storage + // is non-resizable + Storage( + use_byte_size_t /*use_byte_size*/, + SymInt size_bytes, + at::DataPtr data_ptr, + at::Allocator* allocator = nullptr, + bool resizable = false) + : storage_impl_(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + std::move(size_bytes), + std::move(data_ptr), + allocator, + resizable)) {} + + protected: + explicit Storage(unsafe_borrow_t /*unused*/, const Storage& rhs) + : storage_impl_(c10::intrusive_ptr::reclaim( + rhs.storage_impl_.get())) {} + + friend MaybeOwnedTraits; + + public: + // Legacy constructor for partially initialized (dtype or memory) storages + // that can be temporarily created with Caffe2 APIs. See the note on top of + // TensorImpl.h for details. + static Storage create_legacy(at::Device device) { + auto allocator = GetAllocator(device.type()); + return Storage(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + 0, + allocator->allocate(0), // materialize a non-default Device. + allocator, + true)); + } + + // Mimic create_legacy, but without requiring a newly-created StorageImpl. 
+ void reset_legacy() { + TORCH_CHECK(resizable() && allocator()); + set_nbytes(0); + set_data_ptr_noswap(allocator()->allocate(0)); + } + + // TODO: remove later + void set_nbytes(size_t size_bytes) const { + storage_impl_->set_nbytes(size_bytes); + } + + void set_nbytes(c10::SymInt size_bytes) const { + storage_impl_->set_nbytes(std::move(size_bytes)); + } + + bool resizable() const { + return storage_impl_->resizable(); + } + + size_t nbytes() const { + return storage_impl_->nbytes(); + } + + SymInt sym_nbytes() const { + return storage_impl_->sym_nbytes(); + } + // get() use here is to get const-correctness + + const void* data() const { + return storage_impl_->data(); + } + + void* mutable_data() const { + return storage_impl_->mutable_data(); + } + + at::DataPtr& mutable_data_ptr() const { + return storage_impl_->mutable_data_ptr(); + } + + const at::DataPtr& data_ptr() const { + return storage_impl_->data_ptr(); + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) const { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + } + + void set_data_ptr_noswap(at::DataPtr&& data_ptr) const { + storage_impl_->set_data_ptr_noswap(std::move(data_ptr)); + } + + DeviceType device_type() const { + return storage_impl_->device_type(); + } + + at::Allocator* allocator() const { + return storage_impl_->allocator(); + } + + at::Device device() const { + return storage_impl_->device(); + } + + StorageImpl* unsafeReleaseStorageImpl() { + return storage_impl_.release(); + } + + StorageImpl* unsafeGetStorageImpl() const noexcept { + return storage_impl_.get(); + } + + c10::weak_intrusive_ptr getWeakStorageImpl() const { + return c10::weak_intrusive_ptr(storage_impl_); + } + + operator bool() const { + return storage_impl_; + } + + size_t use_count() const { + return storage_impl_.use_count(); + } + + inline bool unique() const { + return storage_impl_.unique(); + } + + bool is_alias_of(const Storage& other) const { + return ( + 
storage_impl_ == other.storage_impl_ || + isSharedStorageAlias(*this, other)); + } + + void UniqueStorageShareExternalPointer( + void* src, + size_t capacity, + DeleterFnPtr d = nullptr) { + if (!storage_impl_.unique()) { + TORCH_CHECK( + false, + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer(src, capacity, d); + } + + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + size_t capacity) { + if (!storage_impl_.unique()) { + TORCH_CHECK( + false, + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), capacity); + } + + protected: + c10::intrusive_ptr storage_impl_; +}; + +template <> +struct MaybeOwnedTraits { + using owned_type = c10::Storage; + using borrow_type = c10::Storage; + + static borrow_type createBorrow(const owned_type& from) { + return borrow_type(borrow_type::unsafe_borrow_t{}, from); + } + + static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) { + lhs.unsafeReleaseStorageImpl(); + lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs); + } + + static void destroyBorrow(borrow_type& toDestroy) { + toDestroy.unsafeReleaseStorageImpl(); // "leak" it, but it was already +0. + } + + static const owned_type& referenceFromBorrow(const borrow_type& borrow) { + return borrow; + } + + static const owned_type* pointerFromBorrow(const borrow_type& borrow) { + return &borrow; + } + + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { + return true; + } +}; + +template <> +struct ExclusivelyOwnedTraits { + using repr_type = c10::Storage; + using pointer_type = c10::Storage*; + using const_pointer_type = const c10::Storage*; + + static repr_type nullRepr() { + return c10::Storage(); + } + + template + static repr_type createInPlace(Args&&... 
args) { + return c10::Storage(std::forward(args)...); + } + + static repr_type moveToRepr(c10::Storage&& x) { + return std::move(x); + } + + static c10::Storage take(c10::Storage& x) { + return std::move(x); + } + + static pointer_type getImpl(repr_type& x) { + return &x; + } + + static const_pointer_type getImpl(const repr_type& x) { + return &x; + } +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StorageImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StorageImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..2acfa40771c5f29fb41565a06dfd6944a1a55ea4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StorageImpl.h @@ -0,0 +1,398 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +[[noreturn]] C10_API void throwNullDataPtrError(); +C10_API void warnDeprecatedDataPtr(); + +// Used in StorageImpl to store extra metadata. +// Currently used only for storing a custom error message +// used when throwing an exception when data_ptr is accessed. +struct C10_API StorageExtraMeta { + std::optional custom_data_ptr_error_msg_ = std::nullopt; +}; + +// A storage represents the underlying backing data buffer for a +// tensor. This concept was inherited from the original Torch7 +// codebase; we'd kind of like to get rid of the concept +// (see https://github.com/pytorch/pytorch/issues/14797) but +// it's hard work and no one has gotten around to doing it. 
+// +// NB: storage is supposed to uniquely own a data pointer; e.g., +// two non-null data pointers alias if and only if they are from +// the same storage. Technically you can violate this invariant +// (e.g., you can create a non-owning StorageImpl with at::from_blob) +// but a lot of things won't work correctly, including: +// +// - An ordinary deleter on such a storage is wrong, because normal deleters +// assume unique ownership, but if you have two storages at the same data, +// that implies there is some sort of shared ownership. So your deleter would +// have to actually be internally doing some sort of refcount thing +// - Deepcopy in Python side relies on storage equality and not data pointer +// equality; so if there are two separate storages pointing to the same data, +// the data will actually get duplicated in that case (one data ptr before, +// two data ptrs after) +// - Version counts won't work correctly, because we do all VC tracking at the +// level of storages (unless you explicitly disconnect the VC with detach); +// mutation because data pointers are the same are totally untracked +struct C10_API StorageImpl : public c10::intrusive_ptr_target { + public: + struct use_byte_size_t {}; + + StorageImpl( + use_byte_size_t /*use_byte_size*/, + SymInt size_bytes, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) + : data_ptr_(std::move(data_ptr)), + size_bytes_(std::move(size_bytes)), + size_bytes_is_heap_allocated_(size_bytes_.is_heap_allocated()), + resizable_(resizable), + received_cuda_(false), + allocator_(allocator) { + if (resizable) { + TORCH_INTERNAL_ASSERT( + allocator_, "For resizable storage, allocator must be provided"); + } + refresh_has_data_ptr_check(); + } + + StorageImpl( + use_byte_size_t /*use_byte_size*/, + const SymInt& size_bytes, + at::Allocator* allocator, + bool resizable) + : StorageImpl( + use_byte_size_t(), + size_bytes, + size_bytes.is_heap_allocated() + ? 
allocator->allocate(0) + : allocator->allocate(size_bytes.as_int_unchecked()), + allocator, + resizable) {} + + StorageImpl& operator=(StorageImpl&& other) = delete; + StorageImpl& operator=(const StorageImpl&) = delete; + StorageImpl() = delete; + StorageImpl(StorageImpl&& other) = delete; + StorageImpl(const StorageImpl&) = delete; + ~StorageImpl() override = default; + + void reset() { + data_ptr_.clear(); + size_bytes_ = 0; + size_bytes_is_heap_allocated_ = false; + } + + // Destructor doesn't call release_resources because it's + // unnecessary; don't forget to change that if needed! + void release_resources() override { + data_ptr_.clear(); + } + + void incref_pyobject() const noexcept override final; + + void decref_pyobject() const noexcept override final; + + bool try_incref_pyobject() const noexcept override final; + + size_t nbytes() const { + // OK to do this instead of maybe_as_int as nbytes is guaranteed positive + TORCH_CHECK(!size_bytes_is_heap_allocated_); + return size_bytes_.as_int_unchecked(); + } + + SymInt sym_nbytes() const { + return size_bytes_; + } + + // TODO: remove later + void set_nbytes(size_t size_bytes) { + size_bytes_ = static_cast(size_bytes); + size_bytes_is_heap_allocated_ = false; + } + + void unsafe_set_nbytes(size_t size_bytes) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!size_bytes_is_heap_allocated_); + size_bytes_.unsafe_set_data(size_bytes); + } + + void set_nbytes(c10::SymInt size_bytes) { + size_bytes_ = std::move(size_bytes); + } + + bool resizable() const { + return resizable_; + } + + const at::DataPtr& data_ptr() const { + if (C10_UNLIKELY(throw_on_immutable_data_ptr_)) { + throw_data_ptr_access_error(); + } + return data_ptr_; + } + + at::DataPtr& mutable_data_ptr() { + if (C10_UNLIKELY(has_mutable_data_ptr_check_)) { + if (throw_on_immutable_data_ptr_) { + throw_data_ptr_access_error(); + } + if (throw_on_mutable_data_ptr_) { + throwNullDataPtrError(); + } + if (warn_deprecated_on_mutable_data_ptr_) { + 
warnDeprecatedDataPtr(); + } + maybe_materialize_cow(); + } + return data_ptr_; + } + + // Returns the data_ptr. Bypasses all checks. + at::DataPtr& _mutable_data_ptr_no_checks() { + return data_ptr_; + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + // We need to materialize the old COW DataPtr because it is + // being returned as mutable. + maybe_materialize_cow(); + return set_data_ptr_no_materialize_cow(std::move(data_ptr)); + } + + void set_data_ptr_noswap(at::DataPtr&& data_ptr) { + data_ptr_ = std::move(data_ptr); + refresh_has_data_ptr_check(); + } + + const void* data() const { + if (C10_UNLIKELY(throw_on_immutable_data_ptr_)) { + throw_data_ptr_access_error(); + } + return data_ptr_.get(); + } + + void* mutable_data() { + if (C10_UNLIKELY(has_mutable_data_ptr_check_)) { + if (throw_on_immutable_data_ptr_) { + throw_data_ptr_access_error(); + } + if (throw_on_mutable_data_ptr_) { + throwNullDataPtrError(); + } + if (warn_deprecated_on_mutable_data_ptr_) { + warnDeprecatedDataPtr(); + } + maybe_materialize_cow(); + } + return data_ptr_.mutable_get(); + } + + at::DeviceType device_type() const { + return data_ptr_.device().type(); + } + + at::Allocator* allocator() { + return allocator_; + } + + const at::Allocator* allocator() const { + return allocator_; + } + + // You generally shouldn't use this method, but it is occasionally + // useful if you want to override how a tensor will be reallocated, + // after it was already allocated (and its initial allocator was + // set) + void set_allocator(at::Allocator* allocator) { + allocator_ = allocator; + } + + Device device() const { + return data_ptr_.device(); + } + + void set_resizable(bool resizable) { + if (resizable) { + // We need an allocator to be resizable + AT_ASSERT(allocator_); + } + resizable_ = resizable; + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + void* src, + size_t size_bytes, + DeleterFnPtr 
d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), size_bytes); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + size_t size_bytes) { + data_ptr_ = std::move(data_ptr); + size_bytes_ = static_cast(size_bytes); + size_bytes_is_heap_allocated_ = false; + allocator_ = nullptr; + resizable_ = false; + } + + // This method can be used only after storage construction and cannot be used + // to modify storage status + void set_received_cuda(bool received_cuda) { + received_cuda_ = received_cuda; + } + + bool received_cuda() { + return received_cuda_; + } + + impl::PyObjectSlot* pyobj_slot() { + return &pyobj_slot_; + } + + const impl::PyObjectSlot* pyobj_slot() const { + return &pyobj_slot_; + } + + StorageExtraMeta& get_extra_meta() { + if (!extra_meta_) { + extra_meta_ = std::make_unique(); + } + return *extra_meta_; + } + + [[noreturn]] void throw_data_ptr_access_error() const; + + void release_data_and_set_meta_custom_data_ptr_error_msg_( + std::optional s) { + throw_on_immutable_data_ptr_ = true; + get_extra_meta().custom_data_ptr_error_msg_ = std::move(s); + refresh_has_data_ptr_check(); + } + + void set_throw_on_mutable_data_ptr() { + throw_on_mutable_data_ptr_ = true; + refresh_has_data_ptr_check(); + } + + void set_warn_deprecated_on_mutable_data_ptr() { + warn_deprecated_on_mutable_data_ptr_ = true; + refresh_has_data_ptr_check(); + } + + protected: + // materialize_cow_storage needs to call set_data_ptr_no_materlize_cow + friend void c10::impl::cow::materialize_cow_storage(StorageImpl& storage); + + // Returns the previous data_ptr. 
If the old data_ptr was COW, + // this avoids materializing it + at::DataPtr set_data_ptr_no_materialize_cow(at::DataPtr&& data_ptr) { + at::DataPtr old_data_ptr(std::move(data_ptr_)); + data_ptr_ = std::move(data_ptr); + refresh_has_data_ptr_check(); + return old_data_ptr; + } + + private: + void refresh_has_data_ptr_check() { + has_mutable_data_ptr_check_ = is_cow() || throw_on_mutable_data_ptr_ || + warn_deprecated_on_mutable_data_ptr_ || throw_on_immutable_data_ptr_; + } + + inline bool is_cow() const { + return c10::impl::cow::is_cow_data_ptr(data_ptr_); + } + + // Triggers a copy if this is a copy-on-write tensor. + void maybe_materialize_cow() { + if (is_cow()) { + impl::cow::materialize_cow_storage(*this); + } + } + + DataPtr data_ptr_; + SymInt size_bytes_; + bool size_bytes_is_heap_allocated_; + bool resizable_; + // Identifies that Storage was received from another process and doesn't have + // local to process cuda memory allocation + bool received_cuda_; + // All special checks in data/data_ptr calls are guarded behind this single + // boolean. This is for performance: .data/.data_ptr calls are commonly in the + // hot-path. + bool has_mutable_data_ptr_check_ = false; + // If we should throw when mutable_data_ptr() or mutable_data() is called. + bool throw_on_mutable_data_ptr_ = false; + // If we should throw when data_ptr() or data() is called. + bool throw_on_immutable_data_ptr_ = false; + // If we warn when mutable_data_ptr() or mutable_data() is called. + bool warn_deprecated_on_mutable_data_ptr_ = false; + Allocator* allocator_; + impl::PyObjectSlot pyobj_slot_; + std::unique_ptr extra_meta_ = nullptr; +}; + +// Declare StorageImpl create function pointer types. 
+using StorageImplCreateHelper = intrusive_ptr (*)( + StorageImpl::use_byte_size_t, + SymInt size_bytes, + DataPtr data_ptr, + Allocator* allocator, + bool resizable); + +C10_API void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr); + +C10_API StorageImplCreateHelper GetStorageImplCreate(DeviceType t); + +C10_API c10::intrusive_ptr make_storage_impl( + c10::StorageImpl::use_byte_size_t use_byte_size, + c10::SymInt size_bytes, + c10::DataPtr data_ptr, + c10::Allocator* allocator, + bool resizable, + std::optional device_opt); + +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t< + std::is_base_of_v>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Stream.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Stream.h new file mode 100644 index 0000000000000000000000000000000000000000..4d3a50984ec6e9093a321b7df2855383758e50ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Stream.h @@ -0,0 +1,182 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/// An index representing a specific stream. A StreamId is not independently +/// meaningful without knowing the Device it is associated with; try to +/// use Stream rather than StreamId directly. +/// +/// StreamIds are opaque; they are assigned by some DeviceType-specific +/// numbering system which is not visible to the user. 
HOWEVER, we +/// guarantee that StreamId 0 is always a valid stream, and corresponds +/// to some sort of "default" stream. +using StreamId = int64_t; + +struct C10_API StreamData3 { + StreamId stream_id; + DeviceIndex device_index; + DeviceType device_type; +}; + +// NB: I decided not to call the above StreamIndex to avoid confusion with +// DeviceIndex. This way, you access device index with index(), and stream id +// with id() + +/** + * A stream is a software mechanism used to synchronize launched kernels + * without requiring explicit synchronizations between kernels. The basic + * model is that every kernel launch is associated with a stream: every + * kernel on the same stream is implicitly synchronized so that if I launch + * kernels A and B on the same stream, A is guaranteed to finish before B + * launches. If I want B to run concurrently with A, I must schedule + * it on a different stream. + * + * The Stream class is a backend agnostic value class representing a stream + * which I may schedule a kernel on. Every stream is associated with a device, + * which is recorded in stream, which is used to avoid confusion about which + * device a stream refers to. + * + * Streams are explicitly thread-safe, in the sense that it is OK to pass + * a Stream from one thread to another, and kernels queued from two different + * threads will still get serialized appropriately. (Of course, the + * time when the kernels get queued is undetermined unless you synchronize + * host side ;) + * + * Stream does NOT have a default constructor. Streams are for expert + * users; if you want to use Streams, we're going to assume you know + * how to deal with C++ template error messages if you try to + * resize() a vector of Streams. + * + * Known instances of streams in backends: + * + * - cudaStream_t (CUDA) + * - hipStream_t (HIP) + * - cl_command_queue (OpenCL) (NB: Caffe2's existing OpenCL integration + * does NOT support command queues.) 
+ * + * Because this class is device agnostic, it cannot provide backend-specific + * functionality (e.g., get the cudaStream_t of a CUDA stream.) There are + * wrapper classes which provide this functionality, e.g., CUDAStream. + */ +class C10_API Stream final { + private: + Device device_; + StreamId id_; + + public: + enum Unsafe { UNSAFE }; + enum Default { DEFAULT }; + + /// Unsafely construct a stream from a Device and a StreamId. In + /// general, only specific implementations of streams for a + /// backend should manufacture Stream directly in this way; other users + /// should use the provided APIs to get a stream. In particular, + /// we don't require backends to give any guarantees about non-zero + /// StreamIds; they are welcome to allocate in whatever way they like. + explicit Stream(Unsafe /*unused*/, Device device, StreamId id) + : device_(device), id_(id) {} + + /// Construct the default stream of a Device. The default stream is + /// NOT the same as the current stream; default stream is a fixed stream + /// that never changes, whereas the current stream may be changed by + /// StreamGuard. + explicit Stream(Default /*unused*/, Device device) + : device_(device), id_(0) {} + + bool operator==(const Stream& other) const noexcept { + return this->device_ == other.device_ && this->id_ == other.id_; + } + bool operator!=(const Stream& other) const noexcept { + return !(*this == other); + } + + Device device() const noexcept { + return device_; + } + DeviceType device_type() const noexcept { + return device_.type(); + } + DeviceIndex device_index() const noexcept { + return device_.index(); + } + StreamId id() const noexcept { + return id_; + } + + // Enqueues a wait instruction in the stream's work queue. + // This instruction is a no-op unless the event is marked + // for recording. In that case the stream stops processing + // until the event is recorded. 
+ template + void wait(const T& event) const { + event.block(*this); + } + + // Return whether all asynchronous work previously enqueued on this stream + // has completed running on the device. + bool query() const; + + // Wait (by blocking the calling thread) until all asynchronous work enqueued + // on this stream has completed running on the device. + void synchronize() const; + + // The purpose of this function is to more conveniently permit binding + // of Stream to and from Python. Without packing, I have to setup a whole + // class with two fields (device and stream id); with packing I can just + // store a single uint64_t. + // + // The particular way we pack streams into a uint64_t is considered an + // implementation detail and should not be relied upon. + uint64_t hash() const noexcept { + // Concat these together into a 64-bit integer + uint64_t bits = static_cast(device_type()) << 56 | + static_cast(device_index()) << 48 | + // Remove the sign extension part of the 64-bit address because + // the id might be used to hold a pointer. + (static_cast(id()) & ((1ull << 48) - 1)); + return bits; + } + + struct StreamData3 pack3() const { + return {id(), device_index(), device_type()}; + } + + static Stream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + TORCH_CHECK(isValidDeviceType(device_type)); + return Stream(UNSAFE, Device(device_type, device_index), stream_id); + } + + // I decided NOT to provide setters on this class, because really, + // why would you change the device of a stream? Just construct + // it correctly from the beginning dude. +}; + +C10_API std::ostream& operator<<(std::ostream& stream, const Stream& s); + +} // namespace c10 + +namespace std { +template <> +struct hash { + size_t operator()(c10::Stream s) const noexcept { + return std::hash{}(s.hash()); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StreamGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StreamGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..003816d62f6ce12223cc5106eee6ae37a26e04e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StreamGuard.h @@ -0,0 +1,178 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/** + * A StreamGuard is an RAII class that changes the current device + * to the device corresponding to some stream, and changes the + * default stream on that device to be this stream. + * + * Use of StreamGuard is HIGHLY discouraged in operator definitions. In + * a single operator, you probably don't know enough about the global + * state of the world to profitably decide how to set streams. Let + * the caller handle this appropriately, and just use the current stream + * in your operator code. + * + * This StreamGuard does NOT have an uninitialized state; it is guaranteed + * to reset the stream and device on exit. If you are in a situation + * where you *might* want to setup a stream guard, see OptionalStreamGuard. + */ +struct StreamGuard { + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit StreamGuard() = delete; + ~StreamGuard() = default; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. 
+ explicit StreamGuard(Stream stream) : guard_(stream) {} + + /// Copy is disallowed + StreamGuard(const StreamGuard&) = delete; + StreamGuard& operator=(const StreamGuard&) = delete; + + /// Move is disallowed, as StreamGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + StreamGuard(StreamGuard&& other) = delete; + StreamGuard& operator=(StreamGuard&& other) = delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// on , use MultiStreamGuard instead. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the stream that was set at the time the guard was constructed. + Stream original_stream() const { + return guard_.original_stream(); + } + + /// Returns the most recent stream that was set using this device guard, + /// either from construction, or via set_stream. + Stream current_stream() const { + return guard_.current_stream(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return guard_.current_device(); + } + + /// Returns the device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. 
+ Device original_device() const { + return guard_.original_device(); + } + + private: + c10::impl::InlineStreamGuard guard_; +}; + +/** + * An OptionalStreamGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * See OptionalDeviceGuard for more guidance on how to use this class. + */ +struct OptionalStreamGuard { + /// Create an uninitialized guard. + explicit OptionalStreamGuard() = default; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + explicit OptionalStreamGuard(Stream stream) : guard_(stream) {} + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. + explicit OptionalStreamGuard(std::optional stream_opt) + : guard_(stream_opt) {} + + /// Copy is disallowed + OptionalStreamGuard(const OptionalStreamGuard&) = delete; + OptionalStreamGuard& operator=(const OptionalStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalStreamGuard(OptionalStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalStreamGuard& operator=(OptionalStreamGuard&& other) = delete; + ~OptionalStreamGuard() = default; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Initializes the guard if it was not previously initialized. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the stream that was set at the time the guard was most recently + /// initialized, or nullopt if the guard is uninitialized. 
+ std::optional original_stream() const { + return guard_.original_stream(); + } + + /// Returns the most recent stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + std::optional current_stream() const { + return guard_.current_stream(); + } + + /// Restore the original device and stream, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalStreamGuard guard_; +}; + +/** + * A MultiStreamGuard is an RAII class that sets the current streams of a set of + * devices all at once, and resets them to their original values on destruction. + */ +struct MultiStreamGuard { + /// Set the current streams to the passed streams on each of their respective + /// devices. + explicit MultiStreamGuard(ArrayRef streams) : guard_(streams) {} + + /// Copy is disallowed + MultiStreamGuard(const MultiStreamGuard&) = delete; + MultiStreamGuard& operator=(const MultiStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + MultiStreamGuard(MultiStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + MultiStreamGuard& operator=(MultiStreamGuard&& other) = delete; + ~MultiStreamGuard() = default; + + private: + c10::impl::InlineMultiStreamGuard guard_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymBool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymBool.h new file mode 100644 index 0000000000000000000000000000000000000000..d12fa75fb41446f3f9967a73aed8a25fc1a60f4b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymBool.h @@ -0,0 +1,184 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +class SymInt; + +class C10_API SymBool { + public: + /*implicit*/ SymBool(bool b) : data_(b) {} + SymBool(SymNode ptr) : data_(false), ptr_(std::move(ptr)) { + TORCH_CHECK(ptr_->is_bool()); + } + SymBool() : data_(false) {} + + SymNodeImpl* toSymNodeImplUnowned() const { + return ptr_.get(); + } + + SymNodeImpl* release() && { + return std::move(ptr_).release(); + } + + // Only valid if is_heap_allocated() + SymNode toSymNodeImpl() const; + + // Guaranteed to return a SymNode, wrapping using base if necessary + SymNode wrap_node(const SymNode& base) const; + + bool expect_bool() const { + std::optional c = maybe_as_bool(); + TORCH_CHECK(c.has_value()); + return *c; + } + + SymBool sym_and(const SymBool& /*sci*/) const; + SymBool sym_or(const SymBool& /*sci*/) const; + SymBool sym_not() const; + + SymBool operator&(const SymBool& other) const { + return sym_and(other); + } + SymBool operator|(const SymBool& other) const { + return sym_or(other); + } + SymBool operator||(const SymBool& other) const { + return sym_or(other); + } + SymBool operator~() const { + return sym_not(); + } + + // Insert a guard for the bool to be its concrete value, and then return + // that value. 
Note that C++ comparison operations default to returning + // bool, so it's not so common to have to call this + bool guard_bool(const char* file, int64_t line) const; + bool expect_true(const char* file, int64_t line) const; + bool guard_size_oblivious(const char* file, int64_t line) const; + bool statically_known_true(const char* file, int64_t line) const; + bool guard_or_false(const char* file, int64_t line) const; + bool guard_or_true(const char* file, int64_t line) const; + + bool has_hint() const; + + bool as_bool_unchecked() const { + return data_; + } + + std::optional maybe_as_bool() const { + if (!is_heap_allocated()) { + return data_; + } + return toSymNodeImplUnowned()->constant_bool(); + } + + // Convert SymBool to SymInt (0 or 1) + // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless + SymInt toSymInt() const; + + bool is_heap_allocated() const { + return ptr_; + } + + private: + // TODO: optimize to union + bool data_; + SymNode ptr_; +}; + +C10_API std::ostream& operator<<(std::ostream& os, const SymBool& s); + +#define TORCH_SYM_CHECK(cond, ...) \ + TORCH_CHECK((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__) +#define TORCH_SYM_INTERNAL_ASSERT(cond, ...) \ + TORCH_INTERNAL_ASSERT((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__) +#define TORCH_MAYBE_SYM_CHECK(cond, ...) 
\ + if constexpr (std::is_same_v, SymBool>) { \ + TORCH_CHECK((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__) \ + } else { \ + TORCH_CHECK((cond), __VA_ARGS__) \ + } + +inline bool guard_size_oblivious( + bool b, + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) { + return b; +} + +inline bool guard_size_oblivious( + const c10::SymBool& b, + const char* file, + int64_t line) { + return b.guard_size_oblivious(file, line); +} + +inline bool guard_or_false( + bool b, + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) { + return b; +} + +inline bool guard_or_false( + const c10::SymBool& b, + const char* file, + int64_t line) { + return b.guard_or_false(file, line); +} + +inline bool statically_known_true( + bool b, + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) { + return b; +} + +inline bool statically_known_true( + const c10::SymBool& b, + const char* file, + int64_t line) { + return b.statically_known_true(file, line); +} + +inline bool guard_or_true( + bool b, + const char* file [[maybe_unused]], + int64_t line [[maybe_unused]]) { + return b; +} + +inline bool guard_or_true( + const c10::SymBool& b, + const char* file, + int64_t line) { + return b.guard_or_true(file, line); +} + +#define TORCH_GUARD_SIZE_OBLIVIOUS(cond) \ + c10::guard_size_oblivious((cond), __FILE__, __LINE__) + +#define TORCH_STATICALLY_KNOWN_TRUE(cond) \ + c10::statically_known_true((cond), __FILE__, __LINE__) + +#define TORCH_GUARD_OR_FALSE(cond) \ + c10::guard_or_false((cond), __FILE__, __LINE__) + +#define TORCH_GUARD_OR_TRUE(cond) c10::guard_or_true((cond), __FILE__, __LINE__) + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymFloat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymFloat.h new file mode 100644 index 0000000000000000000000000000000000000000..332726ba4c5dade5accef6a3dac6076366c04d95 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymFloat.h @@ -0,0 +1,123 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +// NB: this is actually double precision; we're using the Python naming here +class C10_API SymFloat { + public: + /*implicit*/ SymFloat(double d) : data_(d) {} + SymFloat(SymNode ptr) + : data_(std::numeric_limits::quiet_NaN()), ptr_(std::move(ptr)) { + TORCH_CHECK(ptr_->is_float()); + } + SymFloat() : data_(0.0) {} + + SymNodeImpl* toSymNodeImplUnowned() const { + return ptr_.get(); + } + + SymNodeImpl* release() && { + return std::move(ptr_).release(); + } + + // Only valid if is_symbolic() + SymNode toSymNodeImpl() const; + + // Guaranteed to return a SymNode, wrapping using base if necessary + SymNode wrap_node(const SymNode& base) const; + + double expect_float() const { + TORCH_CHECK(!is_symbolic()); + return data_; + } + + SymFloat operator+(const SymFloat& /*sci*/) const; + SymFloat operator-(const SymFloat& /*sci*/) const; + SymFloat operator*(const SymFloat& /*sci*/) const; + SymFloat operator/(const SymFloat& /*sci*/) const; + + SymBool sym_eq(const SymFloat& /*sci*/) const; + SymBool sym_ne(const SymFloat& /*sci*/) const; + SymBool sym_lt(const SymFloat& /*sci*/) const; + SymBool sym_le(const SymFloat& /*sci*/) const; + SymBool sym_gt(const SymFloat& /*sci*/) const; + SymBool sym_ge(const SymFloat& /*sci*/) const; + + bool operator==(const 
SymFloat& o) const { + return sym_eq(o).guard_bool(__FILE__, __LINE__); + } + bool operator!=(const SymFloat& o) const { + return sym_ne(o).guard_bool(__FILE__, __LINE__); + } + bool operator<(const SymFloat& o) const { + return sym_lt(o).guard_bool(__FILE__, __LINE__); + } + bool operator<=(const SymFloat& o) const { + return sym_le(o).guard_bool(__FILE__, __LINE__); + } + bool operator>(const SymFloat& o) const { + return sym_gt(o).guard_bool(__FILE__, __LINE__); + } + bool operator>=(const SymFloat& o) const { + return sym_ge(o).guard_bool(__FILE__, __LINE__); + } + + SymFloat min(const SymFloat& sci) const; + SymFloat max(const SymFloat& sci) const; + + // Need guidance on where to put this code + SymFloat sqrt() const; + + // Insert a guard for the float to be its concrete value, and then return + // that value. This operation always works, even if the float is symbolic, + // so long as we know what the underlying value is. Don't blindly put this + // everywhere; you can cause overspecialization of PyTorch programs with + // this method. + // + // It should be called as guard_float(__FILE__, __LINE__). The file and line + // number can be used to diagnose overspecialization. + double guard_float(const char* file, int64_t line) const; + + bool has_hint() const; + + // N.B. It's important to keep this definition in the header + // as we expect if checks to be folded for mobile builds + // where `is_symbolic` is always false + C10_ALWAYS_INLINE bool is_symbolic() const { + return ptr_; + } + + // UNSAFELY coerce this SymFloat into a double. You MUST have + // established that this is a non-symbolic by some other means, + // typically by having tested is_symbolic(). 
You will get garbage + // from this function if is_symbolic() + double as_float_unchecked() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_symbolic()); + return data_; + } + + private: + // TODO: optimize to union + double data_; + SymNode ptr_; +}; + +C10_API std::ostream& operator<<(std::ostream& os, const SymFloat& s); +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymInt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymInt.h new file mode 100644 index 0000000000000000000000000000000000000000..f9fa7f645047dbf5f8a2f1831d362606e8d98e98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymInt.h @@ -0,0 +1,586 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +class SymFloat; + +// SymInt represents either a regular int64_t, or a symbolic integer +// (represented in a type erased way as SymNode). The intention is for SymInt +// to represent symbolic sizes that arise when doing shape computation in +// operator kernels. This allows for tracing through programs without baking in +// concrete sizes into kernel calls. +// +// SymInt has an API equivalent to int64_t. In particular, it is a value type. +// Internally, SymInt is represented in a clever packed way, so that it only +// occupies one word of space; but morally, it is a union between an int64_t +// and an intrusive pointer to SymNodeImpl. 
+// +// Invariant: the referenced SymNodeImpl is guaranteed to be a SymNode where +// is_int() returns true + +class C10_API SymInt { + public: + enum Unchecked { + UNCHECKED, + }; + + /*implicit*/ SymInt(int64_t d) : data_(d) { + if (is_heap_allocated()) { + // Large negative number, heap allocate it + promote_to_negative(); + } + } + SymInt() : data_(0) {} + SymInt(SymNode n); + + // unchecked c-tor accepting raw `data_` + // One appropriate use for this is when you are constructing a symint + // in a situation where you know it is non-negative (or, if it is negative, + // the negative value is -1; i.e., not user controlled) + SymInt(Unchecked /*unused*/, int64_t d) : data_(d) {} + + // TODO: these implementations are not optimal because they allocate a + // temporary and then use the move constructor/assignment + SymInt(const SymInt& s) : data_(0) { + if (s.is_heap_allocated()) { + *this = SymInt(s.toSymNode()); + } else { + data_ = s.data_; + } + } + SymInt(SymInt&& s) noexcept : data_(s.data_) { + s.data_ = 0; + } + + SymInt& operator=(const SymInt& s) { + if (this != &s) { + if (s.is_heap_allocated()) { + *this = SymInt(s.toSymNode()); + } else { + data_ = s.data_; + } + } + return *this; + } + SymInt& operator=(SymInt&& s) noexcept { + if (this != &s) { + release_(); // release the current SymNode if any + data_ = s.data_; + if (s.is_heap_allocated()) + s.data_ = 0; + }; + return *this; + } + + SymNodeImpl* toSymNodeImplUnowned() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(is_heap_allocated()); + uint64_t unextended_bits = static_cast(data_) & ~MASK; + uint64_t sign_bit_mask = 1ULL << (62 - 1); + // https://stackoverflow.com/questions/42534749/signed-extension-from-24-bit-to-32-bit-in-c + uint64_t extended_bits = (unextended_bits ^ sign_bit_mask) - sign_bit_mask; + return static_cast( + // NOLINTNEXTLINE(performance-no-int-to-ptr, bugprone*) + reinterpret_cast(static_cast(extended_bits))); + } + + void release_() { + if (is_heap_allocated()) { + 
SymNode::reclaim(toSymNodeImplUnowned()); // steal + } + } + + SymNodeImpl* release() && { +#ifndef C10_MOBILE + TORCH_INTERNAL_ASSERT(is_heap_allocated()); + auto* r = toSymNodeImplUnowned(); + data_ = 0; // transfer ownership + return r; +#else + TORCH_INTERNAL_ASSERT(false); +#endif + } + + // Only valid if is_heap_allocated() + SymNode toSymNode() const; + + // Guaranteed to return a SymNode, wrapping using base if necessary + SymNode wrap_node(const SymNode& base) const; + + ~SymInt() { + release_(); + } + + // Require the int to be non-symbolic, and if it is symbolic raise an + // error. This is safe to use for C++ code that doesn't work for symbolic + // shapes, and you don't have time to fix it immediately, as if we + // try to trigger the path in C++ you'll appropriately get an error + int64_t expect_int() const { + if (auto r = maybe_as_int()) { + return *r; + } + TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE( + false, "when unpacking SymInt, expected int but got ", *this); + } + + // Test if we have a hint for this int (e.g., guard_int would work). + // Most of the time this is true; it is only false when you have + // an unbacked SymInt. + bool has_hint() const; + + // Insert a guard for the int to be its concrete value, and then return + // that value. This operation always works, even if the int is symbolic, + // so long as we know what the underlying value is (e.g., this won't work + // if you call it on the size of nonzero output). Don't blindly put this + // everywhere; you can cause overspecialization of PyTorch programs with + // this method. + // + // It should be called as guard_int(__FILE__, __LINE__). The file and line + // number can be used to diagnose overspecialization. + int64_t guard_int(const char* file, int64_t line) const; + + // Distinguish actual symbolic values from constants stored on the heap + bool is_symbolic() const { + return is_heap_allocated() && + !toSymNodeImplUnowned()->constant_int().has_value(); + } + + // N.B. 
It's important to keep this definition in the header + // as we expect if checks to be folded for mobile builds + // where `is_heap_allocated` is always false and optimize dead code paths + C10_ALWAYS_INLINE bool is_heap_allocated() const { +#ifdef C10_MOBILE + return false; +#else + return !check_range(data_); +#endif + } + + SymInt operator+(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma + *mb); + } + } + return operator_add_slow_path(sci); + } + + SymInt operator-(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma - *mb); + } + } + return operator_sub_slow_path(sci); + } + + SymInt operator*(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma * *mb); + } + } + return operator_mul_slow_path(sci); + } + + SymInt operator/(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma / *mb); + } + } + return operator_div_slow_path(sci); + } + + SymInt operator%(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma % *mb); + } + } + return operator_mod_slow_path(sci); + } + + void operator*=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma * *mb); + return; + } + } + operator_imul_slow_path(sci); + } + + void operator+=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma + *mb); + return; + } + } + operator_iadd_slow_path(sci); + } + + void operator/=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma / *mb); + return; + } + } + operator_idiv_slow_path(sci); + } + + SymInt clone() const; + + SymBool sym_eq(const SymInt& sci) const { + if (auto ma = 
maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma == *mb); + } + } + return sym_eq_slow_path(sci); + } + + SymBool sym_ne(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma != *mb); + } + } + return sym_ne_slow_path(sci); + } + + SymBool sym_lt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma < *mb); + } + } + return sym_lt_slow_path(sci); + } + + SymBool sym_le(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma <= *mb); + } + } + return sym_le_slow_path(sci); + } + + SymBool sym_gt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma > *mb); + } + } + return sym_gt_slow_path(sci); + } + + SymBool sym_ge(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma >= *mb); + } + } + return sym_ge_slow_path(sci); + } + + bool operator==(const SymInt& o) const { + return sym_eq(o).guard_bool(__FILE__, __LINE__); + } + bool operator!=(const SymInt& o) const { + return sym_ne(o).guard_bool(__FILE__, __LINE__); + } + bool operator<(const SymInt& o) const { + return sym_lt(o).guard_bool(__FILE__, __LINE__); + } + bool operator<=(const SymInt& o) const { + return sym_le(o).guard_bool(__FILE__, __LINE__); + } + bool operator>(const SymInt& o) const { + return sym_gt(o).guard_bool(__FILE__, __LINE__); + } + bool operator>=(const SymInt& o) const { + return sym_ge(o).guard_bool(__FILE__, __LINE__); + } + + SymInt min(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::min(*ma, *mb)); + } + } + return min_slow_path(sci); + } + + SymInt max(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return 
SymInt(std::max(*ma, *mb)); + } + } + return max_slow_path(sci); + } + + // If both are symbolic, this checks if + // they share the same node. + // If both are not symbolic this just checks normal equality. + bool is_same(const SymInt& other) const; + + operator SymFloat() const; + + void unsafe_set_data(size_t nbytes) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_heap_allocated()); + data_ = static_cast(nbytes); + } + + // Don't use this. Prefer maybe_as_int instead + int64_t as_int_unchecked() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_heap_allocated()); + return data_; + } + + std::optional maybe_as_int() const { + if (!is_heap_allocated()) { + return data_; + } + return maybe_as_int_slow_path(); + } + + // Return whether the integer is directly coercible to a SymInt + // without requiring heap allocation. You don't need to use this + // to check if you can pass an integer to SymInt; this is guaranteed + // to work (it just might heap allocate!) + static bool check_range(int64_t i) { + return i > MAX_UNREPRESENTABLE_INT; + } + + // Return the min representable integer as a SymInt without + // heap allocation. 
For quantities that count bytes (or larger), + // this is still much larger than you need, so you may consider + // using this as a more efficient version of MIN_INT + static constexpr int64_t min_representable_int() { + return MAX_UNREPRESENTABLE_INT + 1; + } + + private: + void promote_to_negative(); + SymInt operator_add_slow_path(const SymInt& sci) const; + SymInt operator_sub_slow_path(const SymInt& sci) const; + SymInt operator_mul_slow_path(const SymInt& sci) const; + SymInt operator_div_slow_path(const SymInt& sci) const; + SymInt operator_mod_slow_path(const SymInt& sci) const; + void operator_imul_slow_path(const SymInt& sci); + void operator_iadd_slow_path(const SymInt& sci); + void operator_idiv_slow_path(const SymInt& sci); + SymBool sym_eq_slow_path(const SymInt& sci) const; + SymBool sym_ne_slow_path(const SymInt& sci) const; + SymBool sym_lt_slow_path(const SymInt& sci) const; + SymBool sym_le_slow_path(const SymInt& sci) const; + SymBool sym_gt_slow_path(const SymInt& sci) const; + SymBool sym_ge_slow_path(const SymInt& sci) const; + + SymInt min_slow_path(const SymInt& sci) const; + SymInt max_slow_path(const SymInt& sci) const; + + std::optional maybe_as_int_slow_path() const; + + // Constraints on the internal representation: + // + // - Should represent positive and small negative ints + // - No conversion necessary for operations on ints + // - Must represent valid 64-bit pointers + // - Is symbolic test should be FAST (two arithmetic instructions is too + // much). + // This code being a hotpath is based on Strobelight profiles of + // is_heap_allocated(). FB only: https://fburl.com/strobelight/5l50ncxd + // (you will need to change the time window). + // + // So, the scheme is to reserve large negative numbers (assuming + // two's complement): + // + // - 0b0.... means we are a positive int + // - 0b11... means we are a small negative int + // - 0b10... means we are are a pointer. 
This means that + // [-2^63, -2^62-1] are not representable as ints. + // We don't actually need all of this space as on x86_64 + // as the top 16bits aren't used for anything + static constexpr uint64_t MASK = 1ULL << 63 | 1ULL << 62 | 1ULL << 61; + static constexpr uint64_t IS_SYM = 1ULL << 63 | 1ULL << 61; + // We must manually translate the bit pattern test into a greater + // than test because compiler doesn't figure it out: + // https://godbolt.org/z/356aferaW + static constexpr int64_t MAX_UNREPRESENTABLE_INT = + -1LL & static_cast(~(1ULL << 62)); + int64_t data_; +}; + +/// Sum of a list of SymInt; accumulates into the c10::SymInt expression +template < + typename C, + typename std::enable_if_t< + std::is_same_v, + int> = 0> +inline c10::SymInt multiply_integers(const C& container) { + return std::accumulate( + container.begin(), + container.end(), + c10::SymInt(1), + [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; }); +} + +template < + typename Iter, + typename = std::enable_if_t::value_type, + c10::SymInt>>> +inline c10::SymInt multiply_integers(Iter begin, Iter end) { + return std::accumulate( + begin, + end, + c10::SymInt(1), + [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; }); +} + +#define DECLARE_SYMINT_OP_INTONLY(scalar_t, RetTy) \ + C10_API RetTy operator%(const SymInt& a, scalar_t b); \ + C10_API RetTy operator%(scalar_t a, const SymInt& b); + +#define DECLARE_SYMINT_OP(scalar_t, RetTy) \ + C10_API RetTy operator+(const SymInt& a, scalar_t b); \ + C10_API RetTy operator-(const SymInt& a, scalar_t b); \ + C10_API RetTy operator*(const SymInt& a, scalar_t b); \ + C10_API RetTy operator/(const SymInt& a, scalar_t b); \ + C10_API RetTy operator+(scalar_t a, const SymInt& b); \ + C10_API RetTy operator-(scalar_t a, const SymInt& b); \ + C10_API RetTy operator*(scalar_t a, const SymInt& b); \ + C10_API RetTy operator/(scalar_t a, const SymInt& b); \ + C10_API bool operator==(const SymInt& a, scalar_t b); \ + C10_API bool 
operator!=(const SymInt& a, scalar_t b); \ + C10_API bool operator<(const SymInt& a, scalar_t b); \ + C10_API bool operator<=(const SymInt& a, scalar_t b); \ + C10_API bool operator>(const SymInt& a, scalar_t b); \ + C10_API bool operator>=(const SymInt& a, scalar_t b); \ + C10_API bool operator==(scalar_t a, const SymInt& b); \ + C10_API bool operator!=(scalar_t a, const SymInt& b); \ + C10_API bool operator<(scalar_t a, const SymInt& b); \ + C10_API bool operator<=(scalar_t a, const SymInt& b); \ + C10_API bool operator>(scalar_t a, const SymInt& b); \ + C10_API bool operator>=(scalar_t a, const SymInt& b); + +DECLARE_SYMINT_OP_INTONLY(int64_t, SymInt) +DECLARE_SYMINT_OP_INTONLY(int32_t, SymInt) +DECLARE_SYMINT_OP_INTONLY(uint64_t, SymInt) +DECLARE_SYMINT_OP_INTONLY(uint32_t, SymInt) +DECLARE_SYMINT_OP(int64_t, SymInt) +DECLARE_SYMINT_OP(int32_t, SymInt) // make sure constants work +DECLARE_SYMINT_OP(uint64_t, SymInt) +DECLARE_SYMINT_OP(uint32_t, SymInt) +DECLARE_SYMINT_OP(double, SymFloat) +DECLARE_SYMINT_OP(float, SymFloat) // just for completeness + +// On OSX size_t is different than uint64_t so we have to +// define it separately +#if defined(__APPLE__) +DECLARE_SYMINT_OP_INTONLY(size_t, SymInt) +DECLARE_SYMINT_OP(size_t, SymInt) +#endif + +#undef DECLARE_SYMINT_OP + +C10_API std::ostream& operator<<(std::ostream& os, const SymInt& s); +C10_API SymInt operator-(const SymInt& s); + +inline bool sym_eq(int64_t a, int64_t b) { + return a == b; +} + +inline SymBool sym_eq(const SymInt& a, const SymInt& b) { + return a.sym_eq(b); +} + +inline bool sym_ne(int64_t a, int64_t b) { + return a != b; +} + +inline SymBool sym_ne(const SymInt& a, const SymInt& b) { + return a.sym_ne(b); +} + +inline bool sym_lt(int64_t a, int64_t b) { + return a < b; +} + +inline SymBool sym_lt(const SymInt& a, const SymInt& b) { + return a.sym_lt(b); +} + +inline bool sym_le(int64_t a, int64_t b) { + return a <= b; +} + +inline SymBool sym_le(const SymInt& a, const SymInt& b) { + return 
a.sym_le(b); +} + +inline bool sym_gt(int64_t a, int64_t b) { + return a > b; +} + +inline SymBool sym_gt(const SymInt& a, const SymInt& b) { + return a.sym_gt(b); +} + +inline bool sym_ge(int64_t a, int64_t b) { + return a >= b; +} + +inline SymBool sym_ge(const SymInt& a, const SymInt& b) { + return a.sym_ge(b); +} + +} // namespace c10 + +#include + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + + static constexpr int64_t max() noexcept { + return std::numeric_limits::max(); + } + + static constexpr int64_t min() noexcept { + return std::numeric_limits::min(); + } + + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; +}; + +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymIntArrayRef.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymIntArrayRef.h new file mode 100644 index 0000000000000000000000000000000000000000..b63753b186937f0e6869ee557ca1528bb2d7e340 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymIntArrayRef.h @@ -0,0 +1,113 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { +using SymIntArrayRef = ArrayRef; + +inline at::IntArrayRef asIntArrayRefUnchecked(c10::SymIntArrayRef ar) { + return IntArrayRef(reinterpret_cast(ar.data()), ar.size()); +} + +// TODO: a SymIntArrayRef containing a heap allocated large negative integer +// can actually technically be converted to an IntArrayRef... but not with +// the non-owning API we have here. 
We can't reinterpet cast; we have to +// allocate another buffer and write the integers into it. If you need it, +// we can do it. But I don't think you need it. + +inline std::optional asIntArrayRefSlowOpt( + c10::SymIntArrayRef ar) { + for (const c10::SymInt& sci : ar) { + if (sci.is_heap_allocated()) { + return std::nullopt; + } + } + + return {asIntArrayRefUnchecked(ar)}; +} + +inline at::IntArrayRef asIntArrayRefSlow( + c10::SymIntArrayRef ar, + const char* file, + int64_t line) { + for (const c10::SymInt& sci : ar) { + TORCH_CHECK( + !sci.is_heap_allocated(), + file, + ":", + line, + ": SymIntArrayRef expected to contain only concrete integers"); + } + return asIntArrayRefUnchecked(ar); +} + +// Even slower than asIntArrayRefSlow, as it forces an allocation for a +// destination int, BUT it is able to force specialization (it never errors) +inline c10::DimVector asIntArrayRefSlowAlloc( + c10::SymIntArrayRef ar, + const char* file, + int64_t line) { + c10::DimVector res(ar.size(), 0); + for (const auto i : c10::irange(ar.size())) { + res[i] = ar[i].guard_int(file, line); + } + return res; +} + +#define C10_AS_INTARRAYREF_SLOW(a) c10::asIntArrayRefSlow(a, __FILE__, __LINE__) +#define C10_AS_INTARRAYREF_SLOW_ALLOC(a) \ + c10::asIntArrayRefSlowAlloc(a, __FILE__, __LINE__) + +// Prefer using a more semantic constructor, like +// fromIntArrayRefKnownNonNegative +inline SymIntArrayRef fromIntArrayRefUnchecked(IntArrayRef array_ref) { + return SymIntArrayRef( + reinterpret_cast(array_ref.data()), array_ref.size()); +} + +inline SymIntArrayRef fromIntArrayRefKnownNonNegative(IntArrayRef array_ref) { + return fromIntArrayRefUnchecked(array_ref); +} + +inline SymIntArrayRef fromIntArrayRefSlow(IntArrayRef array_ref) { + for (long i : array_ref) { + TORCH_CHECK( + SymInt::check_range(i), + "IntArrayRef contains an int that cannot be represented as a SymInt: ", + i); + } + return SymIntArrayRef( + reinterpret_cast(array_ref.data()), array_ref.size()); +} + +inline 
c10::SymBool sym_equals(SymIntArrayRef LHS, SymIntArrayRef RHS) { + if (LHS.size() != RHS.size()) { + return c10::SymBool(false); + } + + c10::SymBool result = sym_eq(LHS.size(), RHS.size()); + for (size_t i = 0; i < RHS.size(); ++i) { + c10::SymBool equals = sym_eq(LHS[i], RHS[i]); + std::optional equals_bool = equals.maybe_as_bool(); + + if (equals_bool.has_value() && !*equals_bool) { + // Early return if element comparison is known to be false + return equals; + } + result = result.sym_and(equals); + } + return result; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymNodeImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymNodeImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..a4257684ea150ac4f8f1bda39ab4c1212c1929ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymNodeImpl.h @@ -0,0 +1,261 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter") + +namespace c10 { + +class SymNodeImpl; +using SymNode = c10::intrusive_ptr; + +// When you add a method, you also need to edit +// torch/csrc/jit/python/init.cpp +// torch/csrc/utils/python_symnode.h +// c10/core/ConstantSymNodeImpl.h +class C10_API SymNodeImpl : public c10::intrusive_ptr_target { + public: + ~SymNodeImpl() override = default; + + template + c10::intrusive_ptr dyn_cast() const { + return c10::intrusive_ptr::reclaim_copy(dynamic_cast(this)); + } + + // these could be pure virtual when we implement LTC versions + virtual bool is_int() { + 
TORCH_CHECK(false, "NYI"); + } + virtual bool is_bool() { + TORCH_CHECK(false, "NYI"); + } + virtual bool is_float() { + TORCH_CHECK(false, "NYI"); + } + virtual bool is_nested_int() const { + return false; + } + virtual SymNode add(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sub(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode mul(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + // NB: legacy, prefer float_truediv or int_truediv + virtual SymNode truediv(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode float_truediv(const SymNode& other) { + return truediv(other); + } + virtual SymNode int_truediv(const SymNode& other) { + return truediv(other); + } + // NB: legacy, prefer float_pow or pow_by_natural + virtual SymNode pow(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode float_pow(const SymNode& other) { + return pow(other); + } + virtual SymNode pow_by_natural(const SymNode& other) { + return pow(other); + } + // NB: legacy, prefer int_floordiv + virtual SymNode floordiv(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode int_floordiv(const SymNode& other) { + return floordiv(other); + } + virtual SymNode mod(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode eq(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode ne(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode gt(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode lt(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode le(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode ge(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode ceil() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode floor() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode neg() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode 
sym_min(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sym_max(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sym_or(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sym_and(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sym_not() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sym_ite(const SymNode& then_val, const SymNode& else_val) { + TORCH_CHECK(false, "NYI"); + } + // NB: self is ignored here, only the arguments are used + virtual SymNode is_contiguous( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode is_channels_last_contiguous_2d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode is_channels_last_contiguous_3d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode is_channels_last_strides_2d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode is_channels_last_strides_3d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode is_non_overlapping_and_dense( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode clone() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode sym_float() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode wrap_int(int64_t num) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode wrap_float(double num) { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode wrap_bool(bool num) { + TORCH_CHECK(false, "NYI"); + } + virtual int64_t guard_int(const char* file, int64_t line) { + TORCH_CHECK(false, "NYI"); + } + virtual bool guard_bool(const char* file, int64_t line) { + TORCH_CHECK(false, "NYI"); + } + virtual double guard_float(const char* file, int64_t line) { + TORCH_CHECK(false, "NYI"); + } + virtual bool guard_size_oblivious(const char* file, int64_t line) { + // No improvement for unbacked 
SymBools by default, replace this + // with a better implementation! + return guard_bool(file, line); + } + virtual bool guard_or_false(const char* file, int64_t line) { + // Note: PT2 primarily uses PythonSymNodeImpl for this functionality. + // XLA is currently the main consumer of this fallback path since it uses + // ahead-of-time compilation and cannot depend on Python runtime. + return guard_bool(file, line); + } + virtual bool statically_known_true(const char* file, int64_t line) { + // Note: PT2 primarily uses PythonSymNodeImpl for this functionality. + // XLA is currently the main consumer of this fallback path since it uses + // ahead-of-time compilation and cannot depend on Python runtime. + return guard_bool(file, line); + } + virtual bool guard_or_true(const char* file, int64_t line) { + // Note: PT2 primarily uses PythonSymNodeImpl for this functionality. + // XLA is currently the main consumer of this fallback path since it uses + // ahead-of-time compilation and cannot depend on Python runtime. + return guard_bool(file, line); + } + virtual bool expect_true(const char* file, int64_t line) { + // No improvement for unbacked SymBools by default, replace this + // with a better implementation! 
+ return guard_bool(file, line); + } + virtual int64_t int_() { + TORCH_CHECK(false, "NYI"); + } + virtual bool bool_() { + TORCH_CHECK(false, "NYI"); + } + virtual bool has_hint() { + TORCH_CHECK(false, "NYI"); + } + virtual std::string str() { + TORCH_CHECK(false, "NYI"); + } + virtual std::string _graph_repr() { + return str(); + } + virtual std::optional nested_int() { + return std::nullopt; + } + virtual std::optional nested_int_coeff() { + return std::nullopt; + } + virtual std::optional constant_int() { + return std::nullopt; + } + virtual std::optional constant_bool() { + return std::nullopt; + } + virtual std::optional maybe_as_int() { + return std::nullopt; + } + virtual bool is_constant() { + return false; + } + virtual bool is_symbolic() { + return true; + } + std::ostream& operator<<(std::ostream& os) { + os << str(); + return os; + } +}; + +} // namespace c10 +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymbolicShapeMeta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymbolicShapeMeta.h new file mode 100644 index 0000000000000000000000000000000000000000..411c81a98bac68a34c7c2bafbf78b096bf2bc9cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymbolicShapeMeta.h @@ -0,0 +1,234 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +class C10_API SymbolicShapeMeta { + public: + // Basic metadata from which other quantities are derived + SymDimVector sizes_ = {0}; + SymDimVector strides_ = {1}; + SymInt storage_offset_ = 0; + + bool strides_valid_ = true; // e.g. for sparse where there are no strides + + SymbolicShapeMeta() = default; + ~SymbolicShapeMeta() = default; + SymbolicShapeMeta(const SymbolicShapeMeta& other); + SymbolicShapeMeta(SymbolicShapeMeta&& other) = delete; + SymbolicShapeMeta& operator=(const SymbolicShapeMeta& other) = delete; + SymbolicShapeMeta& operator=(SymbolicShapeMeta&& other) = delete; + + void refresh_numel() { + // Non-const, don't need to hold mutables_ lock + available_.fetch_and(~numel_avail); + numel_ = 1; + } + + void refresh_contiguous() { + // Non-const, don't need to hold mutables_ lock + available_.fetch_and(numel_avail); + is_contiguous_ = false; + is_channels_last_contiguous_ = false; + is_channels_last_3d_contiguous_ = false; + is_channels_last_ = false; + is_channels_last_3d_ = false; + is_non_overlapping_and_dense_ = false; + } + + int64_t dim() const { + return static_cast(sizes_.size()); + } + + // Accessors for derived quantities, computed lazily on first access + + bool has_numel() const { + return 
available_.load() & numel_avail; + } + bool has_is_contiguous() const { + return available_.load() & is_contiguous_avail; + } + bool has_is_channels_last_contiguous() const { + return available_.load() & is_channels_last_contiguous_avail; + } + bool has_is_channels_last_3d_contiguous() const { + return available_.load() & is_channels_last_3d_contiguous_avail; + } + bool has_is_channels_last() const { + return available_.load() & is_channels_last_avail; + } + bool has_is_channels_last_3d() const { + return available_.load() & is_channels_last_3d_avail; + } + bool has_is_non_overlapping_and_dense() const { + return available_.load() & is_non_overlapping_and_dense_avail; + } + + // Accessors to cached derived properties + // DO NOT call with mutables_ lock held + const SymInt& numel() const { + if (C10_UNLIKELY(!has_numel())) { + init_numel(); + } + return numel_; + } + + const SymBool& is_contiguous(at::MemoryFormat memory_format) const { + if (memory_format == at::MemoryFormat::ChannelsLast) { + return this->is_channels_last_contiguous(); + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return this->is_channels_last_3d_contiguous(); + } + return this->is_contiguous(); + } + + const SymBool& is_contiguous() const { + if (C10_UNLIKELY(!has_is_contiguous())) { + init_is_contiguous(); + } + return is_contiguous_; + } + + const SymBool& is_channels_last_contiguous() const { + if (C10_UNLIKELY(!has_is_channels_last_contiguous())) { + init_is_channels_last_contiguous(); + } + return is_channels_last_contiguous_; + } + + const SymBool& is_channels_last_3d_contiguous() const { + if (C10_UNLIKELY(!has_is_channels_last_3d_contiguous())) { + init_is_channels_last_3d_contiguous(); + } + return is_channels_last_3d_contiguous_; + } + + const SymBool& is_channels_last() const { + if (C10_UNLIKELY(!has_is_channels_last())) { + init_is_channels_last(); + } + return is_channels_last_; + } + + const SymBool& is_channels_last_3d() const { + if 
(C10_UNLIKELY(!has_is_channels_last_3d())) { + init_is_channels_last_3d(); + } + return is_channels_last_3d_; + } + + const SymBool& is_non_overlapping_and_dense() const { + if (C10_UNLIKELY(!has_is_non_overlapping_and_dense())) { + init_is_non_overlapping_and_dense(); + } + return is_non_overlapping_and_dense_; + } + + // Assumptions so we can short-circuit computation + // NOTE: Don't need to lock mutables_ since these aren't const + void assume_contiguous(SymBool val = true) { + is_contiguous_ = std::move(val); + available_.fetch_or(is_contiguous_avail); + } + void assume_channels_last_contiguous(SymBool val = true) { + is_contiguous_ = std::move(val); + available_.fetch_or(is_channels_last_contiguous_avail); + } + void assume_channels_last_3d_contiguous(SymBool val = true) { + is_channels_last_3d_contiguous_ = std::move(val); + available_.fetch_or(is_channels_last_3d_contiguous_avail); + } + void assume_channels_last(SymBool val = true) { + is_channels_last_ = std::move(val); + available_.fetch_or(is_channels_last_avail); + } + void assume_channels_last_3d(SymBool val = true) { + is_channels_last_3d_ = std::move(val); + available_.fetch_or(is_channels_last_3d_avail); + } + void assume_non_overlapping_and_dense(SymBool val = true) { + is_non_overlapping_and_dense_ = std::move(val); + available_.fetch_or(is_non_overlapping_and_dense_avail); + } + + private: + SymBool compute_contiguous() const; + SymBool compute_channels_last_contiguous_2d() const; + SymBool compute_channels_last_contiguous_3d() const; + SymBool compute_strides_like_channels_last_2d() const; + SymBool compute_strides_like_channels_last_3d() const; + SymBool compute_non_overlapping_and_dense() const; + + // These are little wrappers over the real compute_ functions that + // can make use of other contiguity fields to short circuit. + // They need to be implemented separately for SymBool, as SymBool does + // not short circuit. + // TODO: should the SymBool cases avoid the short circuit? 
Need to reason + // if its correct, and reason if the simpler expressions are better for + // analysis (maybe not!) + + SymBool compute_channels_last_contiguous_3d_dim5() const; + SymBool compute_channels_last_2d_dim5() const; + SymBool compute_channels_last_3d_dim5() const; + SymBool compute_is_non_overlapping_and_dense_dim4() const; + SymBool compute_is_non_overlapping_and_dense_dim5() const; + SymBool compute_is_non_overlapping_and_dense_anydim() const; + + void init_numel() const; + void init_is_contiguous() const; + void init_is_channels_last_contiguous() const; + void init_is_channels_last_3d_contiguous() const; + void init_is_channels_last() const; + void init_is_channels_last_3d() const; + void init_is_non_overlapping_and_dense() const; + + // NOTE: These only set if !has_foo() + void set_numel(SymInt val) const; + void set_is_contiguous(SymBool val) const; + void set_is_channels_last_contiguous(SymBool val) const; + void set_is_channels_last_3d_contiguous(SymBool val) const; + void set_is_channels_last(SymBool val) const; + void set_is_channels_last_3d(SymBool val) const; + void set_is_non_overlapping_and_dense(SymBool val) const; + + // Lazily initialized variables, with the corresponding available_ flag + // indicating whether the value has been initialized + mutable std::atomic available_{0}; + + enum avail { + numel_avail = 1 << 0, + is_contiguous_avail = 1 << 1, + is_channels_last_contiguous_avail = 1 << 2, + is_channels_last_3d_contiguous_avail = 1 << 3, + is_channels_last_avail = 1 << 4, + is_channels_last_3d_avail = 1 << 5, + is_non_overlapping_and_dense_avail = 1 << 6, + }; + + // Mutex to prevent races when initializing the variable from const accessors + mutable std::mutex mutables_; + mutable SymInt numel_ = 1; + mutable SymBool is_contiguous_{true}; + mutable SymBool is_channels_last_contiguous_{false}; + mutable SymBool is_channels_last_3d_contiguous_{false}; + mutable SymBool is_channels_last_{false}; + mutable SymBool 
is_channels_last_3d_{false}; + mutable SymBool is_non_overlapping_and_dense_{true}; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..03faea3fbc70500bda37a8099657e80f38976657 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorImpl.h @@ -0,0 +1,3333 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// A global boolean variable to control whether we free memory when a Tensor +// is shrunk to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +// +// This parameter is respected "upper-case" methods which call Resize() +// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_ +// or ShrinkTo, both of which guarantee to never to free memory. +C10_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. This only applies to functions which +// respect caffe2_keep_on_shrink. 
+C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + +namespace at { +class Tensor; +class TensorBase; +} // namespace at + +namespace c10 { + +/** + * A utility function to convert vector to vector. + */ +inline std::vector ToVectorint64_t(const ArrayRef& src) { + return std::vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline int64_t size_from_dim_(int k, IntArrayRef dims) { + int64_t r = 1; + for (const auto i : c10::irange(k, dims.size())) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline int64_t size_to_dim_(int k, IntArrayRef dims) { + TORCH_CHECK(k >= 0 && static_cast(k) <= dims.size()); + int64_t r = 1; + for (const auto i : c10::irange(k)) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { + TORCH_CHECK((unsigned)l < dims.size() && (unsigned)k < dims.size()); + int64_t r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +inline int canonical_axis_index_(int axis_index, int ndims) { + TORCH_CHECK(axis_index >= -ndims); + TORCH_CHECK(axis_index < ndims); + if (axis_index < 0) { + return axis_index + ndims; + } + return axis_index; +} + +using PlacementDtor = void (*)(void*, size_t); + +/* + * A Context that will call extra placement deleter during + * deconstruction. + * + * Accept a already constructed DataPtr and store it as member + * during destruction, we'll call extra deleter on the underlying + * data pointer before the DataPtr is destructed. + * `data_ptr_` owns the memory. 
+ */ +struct C10_API PlacementDeleteContext { + DataPtr data_ptr_; + PlacementDtor placement_dtor_; + size_t size_; + + PlacementDeleteContext( + DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size) + : data_ptr_(std::move(data_ptr)), + placement_dtor_(placement_dtor), + size_(size) {} + + PlacementDeleteContext(PlacementDeleteContext&&) noexcept = delete; + PlacementDeleteContext(const PlacementDeleteContext&) = delete; + PlacementDeleteContext& operator=(const PlacementDeleteContext&) = delete; + PlacementDeleteContext& operator=(PlacementDeleteContext&&) = delete; + static DataPtr makeDataPtr( + DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size, + Device device); + ~PlacementDeleteContext() { + placement_dtor_(data_ptr_.get(), size_); + // original memory will be freed when data_ptr_ is destructed + } +}; + +struct C10_API AutogradMetaInterface { + virtual void set_requires_grad( + bool requires_grad, + at::TensorImpl* self_impl) = 0; + virtual bool requires_grad() const = 0; + virtual at::Tensor& mutable_grad() = 0; + virtual const at::Tensor& grad() const = 0; + virtual const at::Tensor& fw_grad(uint64_t level, const at::TensorBase& self) + const = 0; + virtual void set_fw_grad( + const at::TensorBase& new_grad, + const at::TensorBase& self, + uint64_t level, + bool is_inplace_op) = 0; + virtual ~AutogradMetaInterface(); +}; + +namespace impl { + +// Unfortunately, the definition of AutogradMeta lives in a separate +// compilation unit than TensorImpl (libtorch.so versus libc10.so) +// which means that we cannot construct an AutogradMeta from TensorImpl, +// not even from the cpp file. So we have to indirect it through a factory +// function which will be initialized when we load libtorch.so. + +struct C10_API AutogradMetaFactory { + virtual ~AutogradMetaFactory() = default; + virtual std::unique_ptr make() const = 0; + // This method is the dumbest method. 
But I don't have access + // to Tensor (not TensorImpl) which is undefined in this header. + virtual const at::Tensor& undefined_tensor() const = 0; +}; + +C10_API void SetAutogradMetaFactory(AutogradMetaFactory* factory); +C10_API AutogradMetaFactory* GetAutogradMetaFactory(); + +struct C10_API AutogradMetaFactoryRegisterer{ + explicit AutogradMetaFactoryRegisterer(AutogradMetaFactory * factory){ + SetAutogradMetaFactory(factory); +} // namespace impl +}; // namespace c10 + +} // namespace impl + +struct C10_API NamedTensorMetaInterface { + virtual ~NamedTensorMetaInterface() = default; + virtual std::unique_ptr clone() const { + TORCH_INTERNAL_ASSERT( + false, "Not implemented: NamedTensorMetaInterface::clone"); + } + virtual int64_t slow_dim() const { + TORCH_INTERNAL_ASSERT( + false, "Not implemented: NamedTensorMetaInterface::slow_dim"); + } +}; + +// For ease of copy pasting +#if 0 +is_contiguous +is_channels_last_contiguous +is_channels_last_3d_contiguous +is_channels_last +is_channels_last_3d +is_non_overlapping_and_dense +#endif + +/** + * This structure is intended to hold additional metadata of the specific device + * backend. 
+ **/ +struct C10_API BackendMeta : intrusive_ptr_target { + ~BackendMeta() override = default; + virtual intrusive_ptr clone( + const intrusive_ptr& ptr) const { + return ptr; + } +}; + +struct C10_API ExtraMeta { + std::unique_ptr symbolic_shape_meta_ = nullptr; + std::unique_ptr named_tensor_meta_ = nullptr; + intrusive_ptr backend_meta_ = nullptr; + std::optional custom_data_ptr_error_msg_ = std::nullopt; + std::optional custom_storage_error_msg_ = std::nullopt; + + ExtraMeta() = default; + ~ExtraMeta() = default; + ExtraMeta(const ExtraMeta& other) { + if (other.symbolic_shape_meta_) { + symbolic_shape_meta_ = + std::make_unique(*other.symbolic_shape_meta_); + } + if (other.named_tensor_meta_) { + named_tensor_meta_ = other.named_tensor_meta_->clone(); + } + if (other.backend_meta_) { + backend_meta_ = other.backend_meta_->clone(other.backend_meta_); + } + if (other.custom_data_ptr_error_msg_) { + custom_data_ptr_error_msg_ = other.custom_data_ptr_error_msg_; + } + if (other.custom_storage_error_msg_) { + custom_storage_error_msg_ = other.custom_storage_error_msg_; + } + } + ExtraMeta& operator=(const ExtraMeta& other) = delete; + ExtraMeta(ExtraMeta&& other) = delete; + ExtraMeta& operator=(ExtraMeta&& other) = delete; + + ExtraMeta( + std::unique_ptr symbolic_shape_meta, + std::unique_ptr named_tensor_meta, + intrusive_ptr backend_meta, + std::optional custom_data_ptr_error_msg = std::nullopt, + std::optional custom_storage_access_error_msg = std::nullopt) + : symbolic_shape_meta_(std::move(symbolic_shape_meta)), + named_tensor_meta_(std::move(named_tensor_meta)), + backend_meta_(std::move(backend_meta)), + custom_data_ptr_error_msg_(std::move(custom_data_ptr_error_msg)), + custom_storage_error_msg_(std::move(custom_storage_access_error_msg)) {} + + std::unique_ptr clone() const { + return std::make_unique(*this); + } +}; + +// NOTE [ Version Counter Sharing ] +// +// Every Tensor has a version counter. 
Version counters are incremented whenever +// the data or size of a tensor changes through in-place Variable operations. +// Version counters are used to detect modifications to saved variables which +// would result in incorrect gradient calculations. Version counters may be +// shared between Variables: +// +// 1. A view shares the version counter of the base Variable, +// 2. `x.detach()` shares the version counter of `x`, +// 3. Unpacked saved variables share the version counter of the source. +// +// Version counters are not shared in these scenarios: +// +// 1. When we replace a `Variable`'s underlying `Tensor` by calling +// `set_data(...)`, +// 2. `x.data` does not share the version counter of `x`. (See discussion at +// https://github.com/pytorch/pytorch/issues/5396) +// +// Question: Why do we put the version counter in TensorImpl instead of +// AutogradMeta? +// +// Answer: After the Variable/Tensor merge, a tensor will not have AutogradMeta +// when its `requires_grad_` is false, but when we use this tensor in the +// forward pass of a function that requires saving this tensor for backward, we +// need to keep track of this tensor's version to make sure it's always valid in +// the autograd graph. +// +// To achieve this goal, we put the version counter in TensorImpl instead of +// AutogradMeta, and have it always be available. This allows us to have the +// optimization of not carrying AutogradMeta when a tensor doesn't require +// gradient. +// +// A hypothetical alternative way to achieve this goal is to initialize +// AutogradMeta and create the version counter for the non-requires-grad tensor +// only when it's saved for backward. 
However, since saving a tensor for +// backward happens in the forward pass, and our invariant is that forward pass +// needs to be thread-safe, lazy-initializing AutogradMeta when saving a tensor +// can introduce race conditions when we are running the forward pass in +// multi-thread scenarios, thus making the forward pass not thread-safe anymore, +// which breaks the invariant. +struct C10_API VariableVersion { + private: + struct VersionCounter : intrusive_ptr_target { + VersionCounter(uint32_t version) : version_(version) {} + std::atomic version_; + }; + c10::intrusive_ptr version_counter_; + + public: + // Note [Disabled VariableVersion] + // VariableVersion struct has an intrusive_ptr pointing VersionCounter struct + // with an atomic variable. Thus `VariableVersion(/*version=*/0)` is not as + // cheap as we expected. In some cases constructing a VariableVersion with + // version 0 is not necessary so we add a cheap constructor which + // doesn't allocate the intrusive_ptr. + // Example use cases are: + // - Inference tensors don't track version counter, so they'll just always + // have disabled VariableVersion. + // - In SavedVariable class we override version_counter_ inside its + // constructor + // so that we can use the cheap constructor there. + enum Disabled { DISABLED }; + // It's okay to return true even for inference tensor which + // doesn't have version counter enabled. + // We want to be permissive here since in many cases (e.g. make_variable) + // we can std::move a TensorImpl if there's no other uses which saves us + // an additional TensorImpl allocation. + bool unique() const { + return version_counter_ ? 1 == version_counter_.use_count() : true; + } + // NOTE: As of C++11 and 14, default-constructing a std::atomic variable + // leaves it in a persistently undefined state. See + // https://cplusplus.github.io/LWG/issue2334. 
+ VariableVersion(uint32_t version) + : version_counter_(c10::make_intrusive(version)) {} + VariableVersion(Disabled /*unused*/ = DISABLED) {} + + bool enabled() const { + return version_counter_; + } + + // Note [Inplace update inference tensor] + // 1. Inplace update to inference tensor is forbidden in normal mode. + // For example: + // inference_tensor.copy_(normal_tensor_requires_grad) + // This inplace makes inference_tensor have requires_grad=True and + // have a grad_fn. This is bad because views of `inference_tensor` + // created in InferenceMode won't be able to know the grad_fn since + // their ViewMeta were not recorded. To match NoGradMode behavior + // that "inplace update to a view created in NoGradMode raise an error", + // we just ban inplace update to inference tensor since we can't tell + // if an inference tensor is a view created in InferenceMode. + // + // Note that views of normal tensor created in InferenceMode has proper + // ViewMeta so that they're aware of the grad_fn correctly. + // + // 2. Inplace update to inference tensor in inference tensor doesn't bump + // version counter. + // * It either doesn't call bump() by skipping ADInplaceOrView kernel, + // - e.g. inference_tensor.add_(1) + // * or bump() is a no-op for inference tensor. + // - e.g. inference_tensor.add_(normal_tensor) + void bump() { + // TODO: Replace the link to the documentation once it's available. + TORCH_CHECK( + version_counter_ || InferenceMode::is_enabled(), + "Inplace update to inference tensor outside InferenceMode is not allowed." + "You can make a clone to get a normal tensor before doing inplace update." + "See https://github.com/pytorch/rfcs/pull/17 for more details."); + if (version_counter_) { + ++version_counter_->version_; + } + } + + void set_version(int64_t i) { + TORCH_CHECK( + version_counter_, + "Tried to call torch.autograd._unsafe_set_version() on a tensor " + "that does not have a version counter. 
Was it created in inference mode?"); + TORCH_CHECK(i >= 0, "Cannot set a version_counter to a value below 0: ", i); + version_counter_->version_ = i; + } + + // Inference tensor doesn't have version counter so it shouldn't be + // accessed. + uint32_t current_version() const { + TORCH_CHECK( + version_counter_, "Inference tensors do not track version counter."); + return version_counter_->version_; + } +}; + +// Forward declaration of TensorImpl needed for forward declaration of +// C10_TensorImpl_Size_Check_Dummy_Class +struct C10_API TensorImpl; + +/** + * NOTE: Some TensorImpl methods are small and not overridden in the + * PyTorch codebase itself, but may theoretically need to be + * overridden by third-party TensorImpl subclasses. This macro allows + * users that need maximum performance and don't need these extension + * points to disable them with a build-time flag. (In particular, + * XLA's XLATensorImpl currently overrides these methods, so we can't + * enable this flag by default.) + */ +#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY +#define TENSORIMPL_MAYBE_VIRTUAL +#else +#define TENSORIMPL_MAYBE_VIRTUAL virtual +#endif + +/** + * The low-level representation of a tensor, which contains a pointer + * to a storage (which contains the actual data) and metadata (e.g., sizes and + * strides) describing this particular view of the data as a tensor. + * + * Some basic characteristics about our in-memory representation of + * tensors: + * + * - It contains a pointer to a storage struct (Storage/StorageImpl) + * which contains the pointer to the actual data and records the + * data type and device of the view. This allows multiple tensors + * to alias the same underlying data, which allows to efficiently + * implement differing *views* on a tensor. + * + * - The tensor struct itself records view-specific metadata about + * the tensor, e.g., sizes, strides and offset into storage. + * Each view of a storage can have a different size or offset. 
+ * + * - This class is intrusively refcounted. It is refcounted so that + * we can support prompt deallocation of large tensors; it is + * intrusively refcounted so that we can still perform reference + * counted operations on raw pointers, which is often more convenient + * when passing tensors across language boundaries. + * + * - For backwards-compatibility reasons, a tensor may be in an + * uninitialized state. A tensor may be uninitialized in the following + * two ways: + * + * - A tensor may be DTYPE UNINITIALIZED. A tensor of this + * form has an uninitialized dtype. This situation most + * frequently arises when a user writes Tensor x(CPU). The dtype + * is subsequently initialized when mutable_data() is + * invoked for the first time. + * + * - A tensor may be STORAGE UNINITIALIZED. A tensor of this form + * has non-zero size, but has a storage with a null data pointer. + * This situation most frequently arises when a user calls + * Resize() or FreeMemory(). This is because Caffe2 historically + * does lazy allocation: allocation of data doesn't occur until + * mutable_data() is invoked. A tensor with zero size is + * always storage initialized, because no allocation is necessary + * in this case. + * + * All combinations of these two uninitialized states are possible. + * Consider the following transcript in idiomatic Caffe2 API: + * + * Tensor x(CPU); // x is storage-initialized, dtype-UNINITIALIZED + * x.Resize(4); // x is storage-UNINITIALIZED, dtype-UNINITIALIZED + * x.mutable_data(); // x is storage-initialized, dtype-initialized + * x.FreeMemory(); // x is storage-UNINITIALIZED, dtype-initialized. + * + * All other fields on tensor are always initialized. In particular, + * size is always valid. (Historically, a tensor declared as Tensor x(CPU) + * also had uninitialized size, encoded as numel == -1, but we have now + * decided to default to zero size, resulting in numel == 0). 
+ * + * Uninitialized storages MUST be uniquely owned, to keep our model + * simple. Thus, we will reject operations which could cause an + * uninitialized storage to become shared (or a shared storage to + * become uninitialized, e.g., from FreeMemory). + * + * In practice, tensors which are storage-UNINITIALIZED and + * dtype-UNINITIALIZED are *extremely* ephemeral: essentially, + * after you do a Resize(), you basically always call mutable_data() + * immediately afterwards. Most functions are not designed to + * work if given a storage-UNINITIALIZED, dtype-UNINITIALIZED tensor. + * + * We intend to eliminate all uninitialized states, so that every + * tensor is fully initialized in all fields. Please do not write new code + * that depends on these uninitialized states. + */ +struct C10_API TensorImpl : public c10::intrusive_ptr_target { + TensorImpl() = delete; + ~TensorImpl() override; + // Note [Enum ImplType] + // This enum is temporary. In the followup refactor we should + // think about how to specialize TensorImpl creation for view + // tensors. Currently we only special case its key_set_ but + // there's also potential to share version_counter_ directly + // without creating first and then override in as_view. + enum ImplType { VIEW }; + + /** + * Construct a 1-dim 0-size tensor backed by the given storage. + */ + TensorImpl( + Storage&& storage, + DispatchKeySet /*key_set*/, + const caffe2::TypeMeta data_type); + + // See Note [Enum ImplType] + TensorImpl( + ImplType /*unused*/, + Storage&& storage, + DispatchKeySet /*key_set*/, + const caffe2::TypeMeta data_type); + + /** + * Construct a 1-dim 0 size tensor that doesn't have a storage. + */ + TensorImpl( + DispatchKeySet /*key_set*/, + const caffe2::TypeMeta data_type, + std::optional device_opt); + + // Legacy constructors so I don't have to go update call sites. 
+ // TODO: When Variable is added, delete these constructors + TensorImpl( + Storage&& storage, + DispatchKey dispatch_key, + const caffe2::TypeMeta data_type) + : TensorImpl( + std::move(storage), + DispatchKeySet(dispatch_key), + data_type) {} + TensorImpl( + DispatchKey dispatch_key, + const caffe2::TypeMeta data_type, + std::optional device_opt) + : TensorImpl(DispatchKeySet(dispatch_key), data_type, device_opt) {} + + private: + // This constructor is private, because the data_type is redundant with + // storage. Still, we pass it in separately because it's easier to write + // the initializer list if we're not worried about storage being moved out + // from under us. + TensorImpl( + Storage&& storage, + DispatchKeySet /*key_set*/, + const caffe2::TypeMeta data_type, + std::optional /*device_opt*/); + + public: + TensorImpl(const TensorImpl&) = delete; + TensorImpl& operator=(const TensorImpl&) = delete; + TensorImpl(TensorImpl&&) = delete; + TensorImpl& operator=(TensorImpl&&) = delete; + + /** + * Release (decref) storage, and any other external allocations. This + * override is for `intrusive_ptr_target` and is used to implement weak + * tensors. + */ + void release_resources() override; + + public: + /** + * Return the DispatchKeySet corresponding to this Tensor, specifying + * all of the DispatchKeys that this Tensor identifies as. This is the + * information used to dispatch operations on this tensor. + */ + DispatchKeySet key_set() const { + return key_set_; + } + + private: + [[noreturn]] void throw_cannot_call_with_symbolic(const char* meth) const; + + // NOTE: The general recipe for customizable methods is that the fastpath + // function (e.g., sizes()) does an unlikely policy test, and if doesn't + // trigger, it does the fast path implementation with no checks and going + // directly to on-TensorImpl fields. 
In particular, you never need to + // check ExtraMeta if the policy doesn't trigger, as non-trivial ExtraMeta + // implies the policy will always match. + // + // The default implementations of methods are "safe": they do extra tests + // to make sure the internal state is consistent no matter if you are + // doing symbolic shapes or not. If you don't want the tests, directly + // override the custom method (e.g., custom_sizes()) to do your preferred + // behavior. + + public: + /** + * Return a reference to the sizes of this tensor. This reference remains + * valid as long as the tensor is live and not resized. + */ + IntArrayRef sizes() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sizes_custom(); + } + return sizes_and_strides_.sizes_arrayref(); + } + + SymIntArrayRef sym_sizes() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_sizes_custom(); + } + // Sizes guaranteed to be non-negative, so unchecked cast is OK + return c10::fromIntArrayRefKnownNonNegative( + sizes_and_strides_.sizes_arrayref()); + } + + IntArrayRef sizes_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("sizes"); + } + return sizes_and_strides_.sizes_arrayref(); + } + + SymIntArrayRef sym_sizes_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().sizes_; + } else { + // Sizes guaranteed to be non-negative, so unchecked cast is OK + return c10::fromIntArrayRefKnownNonNegative(sizes_default()); + } + } + + template + ArrayRef generic_sizes() { + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); + + if constexpr (std::is_same_v) { + return sizes(); + } else { + return sym_sizes(); + } + } + + template + ArrayRef generic_strides() { + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); + + if constexpr (std::is_same_v) { + return strides(); 
+ } else { + return sym_strides(); + } + } + + template + T generic_storage_offset() { + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); + + if constexpr (std::is_same_v) { + return storage_offset(); + } else { + return sym_storage_offset(); + } + } + + /** + * The number of elements in a tensor. + * + * WARNING: Previously, if you were using the Caffe2 API, you could + * test numel() == -1 to see if a tensor was uninitialized. This + * is no longer true; numel always accurately reports the product + * of sizes of a tensor. + */ + int64_t numel() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return numel_custom(); + } + return numel_; + } + + c10::SymInt sym_numel() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_numel_custom(); + } + return c10::SymInt(SymInt::UNCHECKED, numel_); + } + + int64_t numel_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("numel"); + } + return numel_; + } + + c10::SymInt sym_numel_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().numel(); + } else { + return c10::SymInt(SymInt::UNCHECKED, numel_); + } + } + + /** + * Return the number of dimensions of this tensor. Note that 0-dimension + * represents a Tensor that is a Scalar, e.g., one that has a single element. + */ + int64_t dim() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return dim_custom(); + } + return static_cast(sizes_and_strides_.size()); + } + + int64_t dim_default() const { + if (has_symbolic_sizes_strides_) { + return static_cast(symbolic_shape_meta().sizes_.size()); + } else { + return static_cast(sizes_and_strides_.size()); + } + } + + /** + * Return the offset in number of elements into the storage that this + * tensor points to. 
Most tensors have storage_offset() == 0, but, + * for example, an index into a tensor will have a non-zero storage_offset(). + * + * WARNING: This is NOT computed in bytes. + */ + int64_t storage_offset() const { + // TODO: maybe this should be toggled by strides + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return storage_offset_custom(); + } + return storage_offset_; + } + + c10::SymInt sym_storage_offset() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_storage_offset_custom(); + } + return c10::SymInt(SymInt::UNCHECKED, storage_offset_); + } + + int64_t storage_offset_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("storage_offset"); + } + return storage_offset_; + } + + c10::SymInt sym_storage_offset_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().storage_offset_; + } else { + return c10::SymInt(SymInt::UNCHECKED, storage_offset_); + } + } + + /** + * Return a reference to the strides of this tensor. This reference remains + * valid as long as the tensor is live and not restrided. 
+ */ + IntArrayRef strides() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return strides_custom(); + } + return sizes_and_strides_.strides_arrayref(); + } + + c10::SymIntArrayRef sym_strides() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return sym_strides_custom(); + } + return c10::fromIntArrayRefKnownNonNegative(strides_default()); + } + + IntArrayRef strides_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("strides"); + } + return sizes_and_strides_.strides_arrayref(); + } + + c10::SymIntArrayRef sym_strides_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().strides_; + } else { + return c10::fromIntArrayRefKnownNonNegative(strides_default()); + } + } + + c10::SymBool sym_is_contiguous( + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return sym_is_contiguous_custom(memory_format); + } + return sym_is_contiguous_default(memory_format); + } + + template + T is_contiguous_default_impl(at::MemoryFormat memory_format) const { + if (!has_symbolic_sizes_strides_) { + if (memory_format == at::MemoryFormat::ChannelsLast) { + return is_channels_last_contiguous_; + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return is_channels_last_3d_contiguous_; + } + return is_contiguous_; + } + + // Handle dynamic shapes. 
+ const auto& symbolic = symbolic_shape_meta().is_contiguous(memory_format); + + if constexpr (std::is_same_v) { + return symbolic.guard_bool(__FILE__, __LINE__); + } else { + return symbolic; + } + } + + bool is_contiguous_default(at::MemoryFormat memory_format) const { + return is_contiguous_default_impl(memory_format); + } + + c10::SymBool sym_is_contiguous_default(at::MemoryFormat memory_format) const { + return is_contiguous_default_impl(memory_format); + } + + /** + * Whether or not a tensor is laid out in contiguous memory. + * + * Tensors with non-trivial strides are not contiguous. See + * compute_contiguous() for the exact definition of whether or not + * a tensor is contiguous or not. + */ + bool is_contiguous( + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return is_contiguous_custom(memory_format); + } + return is_contiguous_default(memory_format); + } + + bool is_strides_like_default(at::MemoryFormat memory_format) const { + if (has_symbolic_sizes_strides_) { + if (memory_format == at::MemoryFormat::ChannelsLast) { + return symbolic_shape_meta().is_channels_last().guard_bool( + __FILE__, __LINE__); + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return symbolic_shape_meta().is_channels_last_3d().guard_bool( + __FILE__, __LINE__); + } else { + return false; + } + } + + if (memory_format == at::MemoryFormat::ChannelsLast) { + return is_channels_last_; + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return is_channels_last_3d_; + } else { + return false; + } + } + + SymBool sym_is_non_overlapping_and_dense_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().is_non_overlapping_and_dense(); + } else { + return is_non_overlapping_and_dense_; + } + } + + bool is_non_overlapping_and_dense_default() const { + if (has_symbolic_sizes_strides_) { + return 
sym_is_non_overlapping_and_dense_default().guard_bool( + __FILE__, __LINE__); + } else { + return is_non_overlapping_and_dense_; + } + } + + // NB: these dim accessor functions don't have _default(), as you can use + // sizes_default/strides_default + /** + * Return the size of a tensor at some dimension, wrapping the dimension if + * necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t size(int64_t d) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return size_custom(d); + } + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + return sizes_and_strides_.size_at_unchecked(d); + } + + c10::SymInt sym_size(int64_t d) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_size_custom(d); + } + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + const auto sizes = this->sym_sizes(); + return sizes[d]; + } + + /** + * Return the stride of a tensor at some dimension, wrapping the dimension + * if necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t stride(int64_t d) const { + d = maybe_wrap_dim(d, dim(), false); + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + // TODO: provide stride_custom, symmetrically with size_custom. + // There is presently no user for it; only NestedTensor is using + // size_custom overrideability + return strides_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + // Intentionally don't call default, which also handles symbolic + return sizes_and_strides_.stride_at_unchecked(d); + } + + enum class SizesStridesPolicy : uint8_t { + // Default behavior, e.g., dense tensor. + // + // Can override: nothing + Default = 0, + // Customizable strides behavior, e.g., sparse tensor, + // mkldnn tensor. 
+ // + // Can override: strides(), is_contiguous() + CustomStrides = 1, + // Customizable sizes behavior, e.g., nested tensor + // + // Can override: strides(), is_contiguous(), sizes(), dim(), numel() + CustomSizes = 2 + }; + + protected: + inline bool matches_policy(SizesStridesPolicy policy) const { + return sizes_strides_policy_ >= static_cast(policy); + } + + inline bool matches_custom(SizesStridesPolicy policy) const { + return custom_sizes_strides_ >= static_cast(policy); + } + + inline bool matches_python_custom(SizesStridesPolicy policy) const { + auto r = python_custom_sizes_strides_ >= static_cast(policy); + if (r) { + TORCH_INTERNAL_ASSERT(is_python_dispatch()) + } + return r; + } + + /** + * Customization points for the functions above. sizes_strides_policy_ + * must be set to enable these. + * + * NB: dim is overridable separately from sizes because it is possible + * for a tensor to have rank, but not well defined sizes. + */ + // sizes_strides_policy_ >= CustomStrides + + virtual bool is_strides_like_custom(at::MemoryFormat memory_format) const; + + virtual c10::SymBool sym_is_non_overlapping_and_dense_custom() const; + + bool is_non_overlapping_and_dense_custom() const { + return sym_is_non_overlapping_and_dense_custom().guard_bool( + __FILE__, __LINE__); + } + + virtual c10::SymBool sym_is_contiguous_custom( + at::MemoryFormat memory_format) const; + + bool is_contiguous_custom(at::MemoryFormat memory_format) const { + return sym_is_contiguous_custom(memory_format) + .guard_bool(__FILE__, __LINE__); + } + + // sizes_strides_policy_ >= CustomSizes + // Currently this method only exists to be overwritten by subclasses such as + // NestedTensorImpl. + virtual int64_t size_custom(int64_t d) const { + // TODO: We could add support to Python dispatch here. + // TODO: We could call into aten::size.int instead of + // sizes_custom()[d] and enable use of the dispatcher. 
+ d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + return sizes_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + + virtual c10::SymInt sym_size_custom(int64_t d) const { + // TODO: We could add support to Python dispatch here. + // TODO: We could call into aten::size.int instead of + // sym_sizes_custom()[d] and enable use of the dispatcher. + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + return sym_sizes_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + + virtual IntArrayRef sizes_custom() const; + virtual IntArrayRef strides_custom() const; + virtual int64_t numel_custom() const; + virtual int64_t storage_offset_custom() const; + virtual int64_t dim_custom() const; + virtual Device device_custom() const; + virtual Layout layout_custom() const; + + virtual c10::SymIntArrayRef sym_sizes_custom() const; + virtual c10::SymIntArrayRef sym_strides_custom() const; + virtual c10::SymInt sym_numel_custom() const; + virtual c10::SymInt sym_storage_offset_custom() const; + + public: +/** + * True if this tensor has storage. See storage() for details. + */ +#ifdef DEBUG + // Allow subclasses to check that their storage_ is never getting set in debug + // builds. + virtual +#else + TENSORIMPL_MAYBE_VIRTUAL +#endif + bool + has_storage() const +// NOTE: we devirtualize this because it arguably shouldn't be an +// error just to ask subclasses if they have storage. +// This used to throw for most subclasses, but OpaqueTensorImpl +// wanted it to successfully return false, so we went ahead and made +// it a non-error. +#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY + { + return storage_; + } +#else + ; +#endif + + /** + * Return the underlying storage of a Tensor. Multiple tensors may share + * a single storage. A Storage is an impoverished, Tensor-like class + * which supports far less operations than Tensor. + * + * Avoid using this method if possible; try to use only Tensor APIs to perform + * operations. 
+ */ + TENSORIMPL_MAYBE_VIRTUAL const Storage& storage() const { + if (C10_UNLIKELY(storage_access_should_throw_)) { + throw_storage_access_error(); + } + return storage_; + } + + /** + * Return the underlying storage, unsafely assuming this is a basic strided + * tensor. In cases where `storage` access would throw, this returns a + * default-constructed Storage. + */ + inline const Storage& unsafe_storage() const { + return storage_; + } + + bool unique_version() const { + return version_counter_.unique(); + } + + protected: + virtual Layout layout_impl() const { + TORCH_CHECK( + false, "layout_impl is only implemented for TensorImpl subclasses."); + } + + public: + // Whether a tensor is sparse COO or not. + bool is_sparse() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + return key_set_.has_all(c10::sparse_ks); + } + + // Whether a tensor is sparse CSR or not. + bool is_sparse_csr() const { + return layout() == kSparseCsr; + } + + // Whether a tensor is sparse CSR/CSC/BSR/BSC or not. + bool is_sparse_compressed() const { + return key_set_.has_all(c10::sparse_csr_ks); + } + + bool is_quantized() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized); + return key_set_.has_all(quantized_ks); + } + + bool is_meta() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_meta(); + } + return device_opt_.has_value() && device_opt_->type() == kMeta; + } + + bool is_cpu() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. 
+ if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_cpu(); + } + // Note: we cannot rely on dispatch keys to determine the device type + // of a tensor, because "wrapper" tensors (like FunctionalTensorWrapper) + // don't include backend dispatch keys. + return device_opt_.has_value() && device_opt_->type() == kCPU; + } + + bool is_cuda() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_cuda(); + } + return device_opt_.has_value() && device_opt_->type() == kCUDA; + } + + bool is_xpu() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_xpu(); + } + return device_opt_.has_value() && device_opt_->type() == kXPU; + } + + bool is_ipu() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_ipu(); + } + return device_opt_.has_value() && device_opt_->type() == kIPU; + } + + bool is_xla() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_xla(); + } + return device_opt_.has_value() && device_opt_->type() == kXLA; + } + + bool is_mtia() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_mtia(); + } + return device_opt_.has_value() && device_opt_->type() == kMTIA; + } + + bool is_hpu() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_hpu(); + } + return device_opt_.has_value() && device_opt_->type() == kHPU; + } + + bool is_lazy() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_lazy(); + } + return device_opt_.has_value() && device_opt_->type() == kLazy; + } + + bool is_hip() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. 
+ if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_hip(); + } + return device_opt_.has_value() && device_opt_->type() == kHIP; + } + + bool is_ve() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_ve(); + } + return device_opt_.has_value() && device_opt_->type() == kVE; + } + + bool is_privateuseone() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_privateuseone(); + } + return device_opt_.has_value() && device_opt_->type() == kPrivateUse1; + } + + bool is_mkldnn() const { + return key_set_.has_all(c10::mkldnn_ks); + } + + bool is_vulkan() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_vulkan(); + } + return device_opt_.has_value() && device_opt_->type() == kVulkan; + } + + bool is_metal() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_metal(); + } + return device_opt_.has_value() && device_opt_->type() == kMetal; + } + + bool is_mps() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_mps(); + } + return device_opt_.has_value() && device_opt_->type() == kMPS; + } + + bool is_maia() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_maia(); + } + return device_opt_.has_value() && device_opt_->type() == kMAIA; + } + + bool is_nested() const { + return key_set_.has(DispatchKey::NestedTensor); + } + + // TODO: remove this once we don't automatically enabled Autograd dispatch + // keys + // in TensorImpl constructor. + // DON'T USE THIS API!! It's only created for testing purpose in + // file aten/src/ATen/core/boxing/impl/test_helpers.h + void remove_autograd_key() { + key_set_ = key_set_ - autograd_dispatch_keyset; + } + + // Inference tensor doesn't have autograd or ADInplaceOrView key. 
+ // Invariant: + // Inference tensor has version_counter_.enabled() == false + bool is_inference() { + bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks); + bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + no_ADInplaceOrView == no_Autograd, + "ADInplaceOrView and Autograd keys must be on/off at the same time."); + return no_ADInplaceOrView && no_Autograd; + } + + DeviceIndex get_device() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().index(); + } + return device_default().index(); + } + + Device device() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom(); + } + return device_default(); + } + + protected: + c10::Device device_default() const { + TORCH_CHECK(device_opt_.has_value(), "tensor does not have a device"); + // See NOTE [std::optional operator usage in CUDA] + return *device_opt_; + } + + public: + Layout layout() const { + if (C10_UNLIKELY(layout_policy_)) { + return layout_custom(); + } + + // NB: This method is not virtual and avoid dispatches for perf. + // strided is also the most common layout type, so we check for + // strided case first. + // This keyset must also be kept in sync with the logic in + // is_sparse() / is_sparse_csr() / is_mkldnn() + constexpr auto sparse_and_sparsecsr_and_mkldnn_ks = + c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks; + if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) { + return kStrided; + } else if (is_sparse()) { + return kSparse; + } else if (is_sparse_compressed()) { + // Typically, the tensor dispatch keys define the tensor layout + // uniquely. This allows using non-virtual layout method for + // better performance. However, when tensor's layout depends, + // say, on tensor attributes, one must use this execution path + // where the corresponding tensor impl class overwrites virtual + // layout_impl() method. 
+ // + // TODO: implement layout() as native function/method so that + // __torch_dispatch__ users will be able to redefine the + // layout() method. + return layout_impl(); + } else { + TORCH_INTERNAL_ASSERT( + is_mkldnn(), "There is an error in the layout calculation logic."); + return kMkldnn; + } + } + + /** + * True if a tensor was auto-wrapped from a C++ or Python number. + * For example, when you write 't + 2', 2 is auto-wrapped into a Tensor + * with `is_wrapped_number_` set to true. + * + * Wrapped numbers do not participate in the result type computation for + * mixed-type operations if there are any Tensors that are not wrapped + * numbers. This is useful, because we want 't + 2' to work with + * any type of tensor, not just LongTensor (which is what integers + * in Python represent). + * + * Otherwise, they behave like their non-wrapped equivalents. + * See [Result type computation] in TensorIterator.h. + * + * Why did we opt for wrapped numbers, as opposed to just having + * an extra function add(Tensor, Scalar)? This helps greatly reduce + * the amount of code we have to write for add, when actually + * a Tensor-Scalar addition is really just a Tensor-Tensor + * addition when the RHS is 0-dim (except for promotion behavior.) + */ + bool is_wrapped_number() const { + return is_wrapped_number_; + } + + /** + * Set whether or not a tensor was auto-wrapped from a C++ or Python + * number. You probably don't want to call this, unless you are + * writing binding code. + */ + void set_wrapped_number(bool value) { + TORCH_INTERNAL_ASSERT(dim() == 0); + is_wrapped_number_ = value; + } + + /** + * Returns true if Tensor supports as_strided and as_strided_backward. + * This is used in autograd to perform inplace update on view Tensors. + * See Note [View + Inplace update for base tensor] and + * [View + Inplace update for view tensor] for details. 
+ * Note this method only returns true for XLA backend, where it + * simulates strided Tensor to support most view ops, but it cannot + * fully support general `as_strided` case. + * It can be expanded as needed in the future, e.g sparse Tensor. + */ + inline bool support_as_strided() const { + if (is_nested()) { + return false; + } + if (key_set_.has(DispatchKey::Functionalize)) { + return false; + } + return device().supports_as_strided(); + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. + + /** + * Set whether or not a tensor requires gradient. + */ + void set_requires_grad(bool requires_grad); + + /** + * True if a tensor requires gradient. Tensors which require gradient + * have history tracked for any operations performed on them, so that + * we can automatically differentiate back to them. A tensor that + * requires gradient and has no history is a "leaf" tensor, which we + * accumulate gradients into. + */ + bool requires_grad() const; + + /** + * Return a mutable reference to the gradient. This is conventionally + * used as `t.grad() = x` to set a gradient to a completely new tensor. + */ + at::Tensor& mutable_grad(); + + /** + * Return the accumulated gradient of a tensor. This gradient is written + * into when performing backwards, when this tensor is a leaf tensor. + */ + const at::Tensor& grad() const; + + /** + * Whether or not the imaginary part of the tensor should be negated + */ + inline bool is_conj() const { + constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate); + return key_set_.has_all(conjugate_ks); + } + + /** + * Set whether or not to take the conjugate of the tensor (flip the imaginary + * bit). 
+ */ + void _set_conj(bool value) { + if (value) { + key_set_ = key_set_.add(DispatchKey::Conjugate); + TORCH_INTERNAL_ASSERT(isComplexType(typeMetaToScalarType(dtype()))); + } else { + key_set_ = key_set_.remove(DispatchKey::Conjugate); + } + } + + /** + * XXX: do not use, private api! + * Update the backend component related keys to the backend component + * corresponding to this device. + */ + void _change_backend_component_keys(c10::Device device); + + /** + * Whether or not the tensor is a zerotensor + */ + inline bool _is_zerotensor() const { + constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor); + return key_set_.has_all(zerotensor_ks); + } + + /** + Set whether or not the tensor is a zero tensor + */ + void _set_zero(bool value) { + if (value) { + TORCH_INTERNAL_ASSERT( + false, + "Please call `torch._efficientzerotensor` if you want to create a tensor with no storage."); + } else { + key_set_ = key_set_.remove(DispatchKey::ZeroTensor); + } + } + + /** + * Whether or not the tensor should be negated + */ + inline bool is_neg() const { + constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative); + return key_set_.has_all(negative_ks); + } + + /** + * Set whether or not to take the conjugate of the tensor (flip the imaginary + * bit). + */ + void _set_neg(bool value) { + if (value) { + key_set_ = key_set_.add(DispatchKey::Negative); + } else { + key_set_ = key_set_.remove(DispatchKey::Negative); + } + } + + /** + * Return the accumulated gradient of a tensor. This gradient is computed + * using forward mode AD. + * + * This is an internal API that should never be used by end users. + * + * The API is as follows: + * - "level" allows to specify the level of forward AD nesting for which the + * gradient should be returned. Note that since levels are not fully + * supported yet, this argument should be 0. See documentation for + * torch::autograd::enter_dual_level for more details about forward AD + * nesting. 
+ * - "self" should represent the Tensor whose forward grad is accessed. It + * is required when dealing with view. + */ + const at::Tensor& _fw_grad(uint64_t level, const at::TensorBase& self) const; + + /** + * Sets the forward gradient for this Tensor. + * The given Tensor might not be used directly and its content will be copied. + * + * This is an internal API that should never be used by end users. + * + * The API is as follows: + * - "new_grad" is a Tensor containing the new value of the gradient that + * should be set + * - "self" should represent the Tensor whose forward grad is accessed. It + * is required when dealing with view. + * - "level" allows to specify the level of forward AD nesting for which the + * gradient should be set. Note that since levels are not fully supported + * yet, this argument should be 0. See documentation for + * torch::autograd::enter_dual_level for more details about forward AD + * nesting. + * - "is_inplace_op" is a boolean flag that tells if this gradient was + * generated by an inplace operation or an out of place one. This allows + * better error checking. + */ + void _set_fw_grad( + const at::TensorBase& new_grad, + const at::TensorBase& self, + uint64_t level, + bool is_inplace_op); + + /** + * Return a typed data pointer to the actual data which this tensor refers to. + * This checks that the requested type (from the template parameter) matches + * the internal type of the tensor. + * + * It is invalid to call data() on a dtype-uninitialized tensor, even if + * the size is 0. + * + * WARNING: If a tensor is not contiguous, you MUST use strides when + * performing index calculations to determine the location of elements in + * the tensor. We recommend using 'TensorAccessor' to handle this computation + * for you; this class is available from 'Tensor'. 
+ */ + template + const T* data_dtype_initialized() const { + return data_dtype_initialized_impl( + [this] { return static_cast(storage_.data()); }); + } + + /** + * Return a mutable typed data pointer to the actual data which this + * tensor refers to. This checks that the requested type (from the + * template parameter) matches the internal type of the tensor. + * + * It is invalid to call data() on a dtype-uninitialized tensor, even if + * the size is 0. + * + * WARNING: If a tensor is not contiguous, you MUST use strides when + * performing index calculations to determine the location of elements in + * the tensor. We recommend using 'TensorAccessor' to handle this computation + * for you; this class is available from 'Tensor'. + */ + template + T* mutable_data_dtype_initialized() { + return data_dtype_initialized_impl( + [this] { return static_cast(storage_.mutable_data()); }); + } + + private: + // Shared implementation of data_dtype_initialized() and + // mutable_data_dtype_initialized(). + template + T* data_dtype_initialized_impl(const Func& get_data) const { + TORCH_CHECK( + data_type_.Match>(), + "Tensor type mismatch, caller expects elements to be ", + caffe2::TypeMeta::TypeName>(), + ", while tensor contains ", + data_type_.name(), + ". "); + return data_ptr_impl_impl(get_data); + } + + public: + /** + * More efficient helper for Tensor::data_ptr(). Like data(), but + * does not do a type check. Unlike the untemplated data(), does + * check has_storage() and storage_initialized(). + */ + template + inline const T* data_ptr_impl() const { + return data_ptr_impl_impl( + [this] { return static_cast(storage_.data()); }); + } + + /** + * More efficient helper for Tensor::data_ptr(). Like data(), but + * does not do a type check. Unlike the untemplated data(), does + * check has_storage() and storage_initialized(). 
+ */ + template + inline T* mutable_data_ptr_impl() { + return data_ptr_impl_impl( + [this] { return static_cast(storage_.mutable_data()); }); + } + + private: + // Shared implementation of mutable_data_ptr_impl() and the future + // mutable_data_ptr_impl(). + template + __ubsan_ignore_pointer_overflow__ T* data_ptr_impl_impl( + const Func& get_data) const { + if (C10_UNLIKELY(!has_storage())) { + throw_data_ptr_access_error(); + } + TORCH_CHECK( + storage_initialized(), + "The tensor has a non-zero number of elements, but its data is not allocated yet.\n" + "If you're using torch.compile/export/fx, it is likely that we are erroneously " + "tracing into a custom kernel. To fix this, please wrap the custom kernel into " + "an opaque custom op. Please see the following for details: " + "https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html\n" + "If you're using Caffe2, Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + // Caller does the type check. + // Note: storage_offset_ can be non-null even for zero-elements tensors + // (for example if created as `torch.empty(5)[10:]`) that triggers + // applying non-zero offset to null pointer in UBSan + return get_data() + storage_offset_; + } + + public: + /** + * Return a const void* data pointer to the actual data which this + * tensor refers to. + * + * It is invalid to call data() on a dtype-uninitialized tensor, even if the + * size is 0. + * + * WARNING: The data pointed to by this tensor may not contiguous; do NOT + * assume that itemsize() * numel() is sufficient to compute the bytes that + * can be validly read from this tensor. + */ + inline const void* data() const { + return data_impl( + [this] { return static_cast(storage_.data()); }); + } + + /** + * Return a void* data pointer to the actual data which this tensor refers to. 
+ * + * It is invalid to call mutable_data() on a dtype-uninitialized + * tensor, even if the size is 0. + * + * WARNING: The data pointed to by this tensor may not contiguous; do NOT + * assume that itemsize() * numel() is sufficient to compute the bytes that + * can be validly read from this tensor. + */ + inline void* mutable_data() { + return data_impl( + [this] { return static_cast(storage_.mutable_data()); }); + } + + private: + /// Shared implementation of data() and mutable_data(). + /// + /// get_data must return a byte-addressed pointer, e.g. char*, + /// std::byte const*, etc. + template + Void* data_impl(const Func& get_data) const { + if (C10_UNLIKELY(!has_storage())) { + throw_data_ptr_access_error(); + } + TORCH_CHECK( + dtype_initialized(), + "Cannot access data pointer of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); + auto* data = get_data(); + static_assert( + sizeof(*data) == 1, "get_data must return a byte-addressed pointer."); + // Computing an offset into an empty tensor would be UB, since an empty + // tensor's storage will be nullptr, and adding a nonzero offset to nullptr + // is UB. So we skip the offset computation in this case. + if (is_empty()) { + return nullptr; + } + return data + data_type_.itemsize() * storage_offset_; + } + + public: + /** + * Returns the TypeMeta of a tensor, which describes what data type + * it is (e.g., int, float, ...) + */ + const caffe2::TypeMeta dtype() const { + return data_type_; + } + + /** + * Return the size of a single element of this tensor in bytes. 
+ */ + size_t itemsize() const { + TORCH_CHECK( + dtype_initialized(), + "Cannot report itemsize of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); + return data_type_.itemsize(); + } + + void set_backend_meta(intrusive_ptr backend_meta) { + get_extra_meta().backend_meta_ = std::move(backend_meta); + } + + c10::BackendMeta* get_backend_meta() { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->backend_meta_.get(); + } + + intrusive_ptr get_backend_meta_intrusive_ptr() const { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->backend_meta_; + } + + void release_storage_and_set_meta_custom_data_ptr_error_msg_( + std::optional s) { + storage_ = {}; + set_storage_access_should_throw(); + get_extra_meta().custom_data_ptr_error_msg_ = s; + get_extra_meta().custom_storage_error_msg_ = std::move(s); + } + + protected: + /** + * Returns the human-readable name of the actual type of this object (e.g., + * TensorImpl, BatchedTensorImpl, etc.). Used for error messages. + */ + virtual const char* tensorimpl_type_name() const { + return "TensorImpl"; + } + + private: + [[noreturn]] void throw_storage_access_error() const; + [[noreturn]] void throw_data_ptr_access_error() const; + + ExtraMeta& get_extra_meta() { + if (!extra_meta_) { + extra_meta_ = std::make_unique(); + } + return *extra_meta_; + } + + c10::SymbolicShapeMeta& symbolic_shape_meta() { + TORCH_INTERNAL_ASSERT(extra_meta_ && extra_meta_->symbolic_shape_meta_); + return *extra_meta_->symbolic_shape_meta_; + } + + const c10::SymbolicShapeMeta& symbolic_shape_meta() const { + TORCH_INTERNAL_ASSERT(extra_meta_ && extra_meta_->symbolic_shape_meta_); + return *extra_meta_->symbolic_shape_meta_; + } + + public: + /** + * True if a tensor has no elements (e.g., numel() == 0). 
+ */ + inline bool is_empty() const { + return numel() == 0; + } + + // if we are going to use sym sizes, we should be setting sym strides at the + // same time, otherwise it's very easy to misuse this API + void set_sizes_and_strides( + c10::SymIntArrayRef sizes, + c10::SymIntArrayRef strides, + std::optional storage_offset = std::nullopt); + // This is renamed to avoid breaking overload BC + void generic_set_sizes_contiguous(c10::SymIntArrayRef sizes); + void generic_set_sizes_contiguous(c10::IntArrayRef sizes) { + set_sizes_contiguous(sizes); + } + + /** + * Change the size at some dimension. This DOES NOT update strides; + * thus, most changes to size will not preserve contiguity. You probably + * also want to call set_stride() when you call this. + * + * TODO: This should be jettisoned in favor of `set_sizes_and_strides`, + * which is harder to misuse. + */ + virtual void set_size(int64_t dim, int64_t new_size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_size ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !matches_policy(SizesStridesPolicy::CustomSizes), + "set_size() called on tensor with dynamic shapes or customized size behavior") + sizes_and_strides_.size_at(dim) = new_size; + refresh_numel(); + refresh_contiguous(); + } + + /** + * Change the stride at some dimension. + * + * TODO: This should be jettisoned in favor of `set_sizes_and_strides`, + * which is harder to misuse. + */ + virtual void set_stride(int64_t dim, int64_t new_stride) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_stride ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "set_stride() called on tensor with symbolic shape") + sizes_and_strides_.stride_at_unchecked(dim) = new_stride; + refresh_contiguous(); + } + + /** + * Set the offset into the storage of this tensor. 
+ * + * WARNING: This does NOT check if the tensor is in bounds for the new + * location at the storage; the caller is responsible for checking this + * (and resizing if necessary.) + */ + virtual void set_storage_offset(int64_t storage_offset) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_storage_offset ", + err_msg_tensor_metadata_change_not_allowed); + // TODO: this should probably consult policy + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "set_storage_offset() called on tensor with symbolic shape") + storage_offset_ = storage_offset; + } + + /** + * Like set_sizes_and_strides but assumes contiguous strides. + * + * WARNING: This function does not check if the requested + * sizes/strides are in bounds for the storage that is allocated; + * this is the responsibility of the caller + */ + void set_sizes_contiguous(IntArrayRef new_size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_sizes_contiguous ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !matches_policy(SizesStridesPolicy::CustomStrides), + "tried to directly modify sizes for customized tensor"); + sizes_and_strides_.set_sizes(new_size); + + refresh_numel(); + empty_tensor_restride( + MemoryFormat::Contiguous); // calls refresh_contiguous() + } + + C10_ALWAYS_INLINE const impl::SizesAndStrides& sizes_and_strides() { + return sizes_and_strides_; + } + + /** + * Set the sizes and strides of a tensor. 
+ * + * WARNING: This function does not check if the requested + * sizes/strides are in bounds for the storage that is allocated; + * this is the responsibility of the caller + */ + void set_sizes_and_strides( + IntArrayRef new_size, + IntArrayRef new_stride, + std::optional storage_offset = std::nullopt) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_sizes_and_strides ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "set_sizes_and_strides() called on tensor with symbolic shape") + TORCH_CHECK( + new_size.size() == new_stride.size(), + "dimensionality of sizes (", + new_size.size(), + ") must match dimensionality of strides (", + new_stride.size(), + ")"); + const auto new_dim = new_size.size(); + bool overflowed = false; + sizes_and_strides_.set_sizes(new_size); + + if (new_dim > 0) { + for (size_t dim = new_dim - 1;; dim--) { + if (new_stride[dim] >= 0) { + sizes_and_strides_.stride_at_unchecked(dim) = new_stride[dim]; + } else { + // XXX: This behavior is surprising and may need to be removed to + // support negative strides. Some pytorch functions rely on it: + // for example, torch.cat (run TestTorch.test_cat_empty). + if (dim == new_dim - 1) { + sizes_and_strides_.stride_at_unchecked(dim) = 1; + } else { + // Keep stride monotonically increasing to match NumPy. + overflowed |= c10::mul_overflows( + sizes_and_strides_.stride_at_unchecked(dim + 1), + std::max( + sizes_and_strides_.size_at_unchecked(dim + 1), 1), + std::addressof(sizes_and_strides_.stride_at_unchecked(dim))); + } + } + if (dim == 0) + break; + } + TORCH_CHECK(!overflowed, "Stride calculation overflowed"); + } + + refresh_numel(); + refresh_contiguous(); + + if (storage_offset.has_value()) { + storage_offset_ = *storage_offset; + } + } + + /** + * Set whether a tensor allows changes to its metadata (e.g. sizes / strides / + * storage / storage_offset). See NOTE [ Metadata Change for a Detached Tensor + * ] for details. 
+ */ + void set_allow_tensor_metadata_change(bool value [[maybe_unused]]) { + // TODO: at some point, we should kill this field completely. + allow_tensor_metadata_change_ = true; + } + + /** + * True if a tensor allows changes to its metadata (e.g. sizes / strides / + * storage / storage_offset). See NOTE [ Metadata Change for a Detached Tensor + * ] for details. + */ + bool allow_tensor_metadata_change() const { + return allow_tensor_metadata_change_; + } + + /** + * Set the pointer to autograd metadata. + */ + void set_autograd_meta( + std::unique_ptr autograd_meta); + + /** + * Return the pointer to autograd metadata. May return nullptr if the + * tensor does not track gradients. + */ + c10::AutogradMetaInterface* autograd_meta() const; + + /** + * Set the pointer to named tensor metadata. + */ + void set_named_tensor_meta( + std::unique_ptr named_tensor_meta) { + TORCH_WARN_ONCE( + "Named tensors and all their associated APIs are an experimental feature ", + "and subject to change. Please do not use them for anything important ", + "until they are released as stable."); +#ifdef DEBUG + if (named_tensor_meta) { + TORCH_INTERNAL_ASSERT(named_tensor_meta->slow_dim() == dim()); + } +#endif + if (named_tensor_meta) { + get_extra_meta().named_tensor_meta_ = std::move(named_tensor_meta); + key_set_ = key_set_.add(DispatchKey::Named); + } else { + if (extra_meta_) { + extra_meta_->named_tensor_meta_ = nullptr; + } + key_set_ = key_set_.remove(DispatchKey::Named); + } + } + + void set_python_dispatch(bool k) { + if (k) { + key_set_ = key_set_.add(c10::python_ks); + } else { + key_set_ = key_set_ - c10::python_ks; + } + } + + bool is_python_dispatch() const { + return key_set_.has_all(c10::python_ks); + } + + /** + * Return the pointer to named tensor metadata. 
+ */ + const c10::NamedTensorMetaInterface* named_tensor_meta() const { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->named_tensor_meta_.get(); + } + + c10::NamedTensorMetaInterface* named_tensor_meta() { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->named_tensor_meta_.get(); + } + + bool has_named_tensor_meta() const { + if (!extra_meta_) { + return false; + } + return extra_meta_->named_tensor_meta_ != nullptr; + } + + // NOTE [ TensorImpl Shallow-Copying ] + // + // TensorImpl shallow-copying is used when we want to have two Variables share + // the same tensor metadata (e.g. sizes / strides / storage pointer / + // storage_offset), but each with a different autograd history. Example call + // sites: + // + // 1. `var_detached = var.detach()` uses `shallow_copy_and_detach()` to create + // `var_detached` that shares the same tensor metadata with `var`, but with a + // completely new autograd history. + // 2. `var.set_data(tensor)` uses `shallow_copy_from()` to copy tensor + // metadata from `tensor` into `var`, while keeping `var`'s original + // AutogradMeta. + // + // Functions that shallow-copy a TensorImpl (such as + // `shallow_copy_and_detach()` / `shallow_copy_from()` / + // `copy_tensor_metadata()`) copy the tensor metadata fields (e.g. sizes / + // strides / storage pointer / storage_offset) by value. However, the + // following fields are not copied: + // + // 1. the AutogradMeta pointer, because it is unique for each Variable. + // 2. the version counter, because the destination TensorImpl's version + // counter is either set to the passed-in `version_counter` (in + // `shallow_copy_and_detach()` and `copy_tensor_metadata()`), or it is kept + // intact (in `shallow_copy_from()`). See NOTE [ Version Counter Sharing ] for + // details. 
+ // + // In `shallow_copy_and_detach()` and `copy_tensor_metadata()`, the passed-in + // `allow_tensor_metadata_change` determines whether the TensorImpl + // shallow-copy allows changes to its metadata (e.g. sizes / strides / storage + // / storage_offset). See NOTE [ Metadata Change for a Detached Tensor ] for + // details. + // + // In `shallow_copy_from()`, we don't check the destination TensorImpl's + // `allow_tensor_metadata_change_`, because `shallow_copy_from()` is used for + // implementing functions such as `var.set_data(tensor)`, which changes + // `var`'s tensor metadata and expects its `allow_tensor_metadata_change_` to + // be ignored. + + /** + * One TensorImpl can be copied to another TensorImpl if they have the same + * DispatchKeySet. The only two special cases (for legacy reason) are: + * CPU is compatible with CUDA and SparseCPU is + * compatible with SparseCUDA. + */ + inline bool has_compatible_shallow_copy_type(DispatchKeySet from) { + auto is_dense = [](DispatchKeySet ts) { + constexpr auto dense_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::MPSBit, + BackendComponent::HIPBit, + BackendComponent::XPUBit, + BackendComponent::HPUBit, + BackendComponent::MTIABit}); + constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense); + return ts.has_any(dense_k) && ts.has_any(dense_backends); + }; + auto is_sparse = [](DispatchKeySet ts) { + constexpr auto sparse_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::MPSBit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); + return ts.has_any(sparse_k) && ts.has_any(sparse_backends); + }; + auto is_sparse_compressed = [](DispatchKeySet ts) { + constexpr auto sparse_compressed_k = + DispatchKeySet(DispatchKey::SparseCsr); + return ts.has_any(sparse_compressed_k); + }; + return (key_set_ == from) || (is_dense(key_set_) && 
is_dense(from)) || + (is_sparse(key_set_) && is_sparse(from)) || + (is_sparse_compressed(key_set_) && is_sparse_compressed(from)); + ; + } + + private: + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + + public: + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const; + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + + /** + * Shallow-copies data from another TensorImpl into this TensorImpl. + * + * For why this function doesn't check this TensorImpl's + * `allow_tensor_metadata_change_`, see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual void shallow_copy_from(const c10::intrusive_ptr& impl) { + copy_tensor_metadata( + /*src_impl=*/impl.get(), + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); + } + + // Inference tensor doesn't have version counter, + // set_version_counter is no-op for them. 
+ void set_version_counter(const c10::VariableVersion& version_counter) { + TORCH_CHECK( + !(is_inference() && version_counter.enabled()), + "Cannot set version_counter for inference tensor"); + version_counter_ = version_counter; + } + + void set_version_counter(c10::VariableVersion&& version_counter) { + TORCH_CHECK( + !(is_inference() && version_counter.enabled()), + "Cannot set version_counter for inference tensor"); + version_counter_ = std::move(version_counter); + } + + const c10::VariableVersion& version_counter() const noexcept { + return version_counter_; + } + + void bump_version() { + version_counter_.bump(); + } + + impl::PyObjectSlot* pyobj_slot() { + return &pyobj_slot_; + } + + const impl::PyObjectSlot* pyobj_slot() const { + return &pyobj_slot_; + } + + void incref_pyobject() const noexcept override final; + + void decref_pyobject() const noexcept override final; + + bool try_incref_pyobject() const noexcept override final; + + private: + // See NOTE [std::optional operator usage in CUDA] + // We probably don't want to expose this publicly until + // the note is addressed. + std::optional device_opt() const { + return device_opt_; + } + + public: + /** + * The device type of a Tensor, e.g., DeviceType::CPU or DeviceType::CUDA. + */ + DeviceType device_type() const { + // TODO: A useful internal assert would be to show that device_opt_ is null + // only if you are an undefined tensor + TORCH_CHECK( + device_opt_.has_value(), + "device_type cannot be run on undefined Tensor"); + // See NOTE [std::optional operator usage in CUDA] + return (*device_opt_).type(); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. 
+ * + * This op is auto-asynchronous if the underlying device (CUDA) supports it. + */ + void Extend(int64_t num, float growthPct); + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + void ReserveSpace(int64_t outer_dim); + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). + * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + * + * This method respects caffe2_keep_on_shrink. Consult the internal logic + * of this method to see exactly under what circumstances this flag matters. + */ + template + void Resize(Ts... dim_source) { + bool size_changed = SetDims(dim_source...); + if (size_changed) { + HandleResize(); + } + } + + template + void Resize(const std::vector& dim_source) { + Resize(ArrayRef(dim_source)); + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + void Reshape(const std::vector& dims); + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + void FreeMemory(); + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. 
The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + // To be deprecated + void ShareData(const TensorImpl& src); + + void ShareExternalPointer( + DataPtr&& data_ptr, + const caffe2::TypeMeta data_type, + size_t size_bytes); + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const caffe2::TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && storage_initialized()) { + return static_cast( + static_cast(storage_.mutable_data()) + + storage_offset_ * meta.itemsize()); + } else { + bool had_special_dtor = data_type_.placementDelete() != nullptr; + storage_offset_ = 0; + data_type_ = meta; + // NB: device is not changed + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. 
+ if (numel_ == 0 || + (meta.placementNew() == nullptr && !had_special_dtor && + (storage_.nbytes() >= (numel_ * data_type_.itemsize())))) { + TORCH_INTERNAL_ASSERT( + storage_offset_ == 0); // because we just reallocated + return storage_.mutable_data(); + } + Allocator* allocator = storage_.allocator(); + // Storage might have nullptr allocator in rare cases, for example, if + // an external memory segment has been wrapped with Tensor and we don't + // know how to reallocate it. However, in order to preserve legacy C2 + // behavior, we allow reallocating the memory using default allocator. + if (allocator == nullptr) { + allocator = GetAllocator(storage_.device_type()); + } + if (meta.placementNew()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = data_type_.placementDelete(); + auto data_ptr = allocator->allocate(numel_ * data_type_.itemsize()); + storage_.set_data_ptr_noswap(PlacementDeleteContext::makeDataPtr( + std::move(data_ptr), dtor, size, storage_.device())); + data_type_.placementNew()(storage_.mutable_data(), numel_); + } else { + // For fundamental type, new and delete is easier. + storage_.set_data_ptr_noswap( + allocator->allocate(numel_ * data_type_.itemsize())); + } + storage_.set_nbytes(numel_ * data_type_.itemsize()); + TORCH_INTERNAL_ASSERT( + storage_offset_ == 0); // because we just reallocated + device_opt_ = storage_.device(); + return storage_.mutable_data(); + } + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. 
+ */ + template + inline T* mutable_data() { + if (storage_initialized() && data_type_.Match()) { + return static_cast(storage_.mutable_data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible_v, + "Tensor can't hold non-default-constructable types"); + return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); + } + + /** + * True if a tensor is storage initialized. A tensor may become + * storage UNINITIALIZED after a Resize() or FreeMemory() + */ + bool storage_initialized() const { + TORCH_CHECK( + has_storage(), + "cannot call storage_initialized on tensor that does not have storage"); + return storage_.data() || numel_ == 0; + } + + /** + * True if a tensor is dtype initialized. A tensor allocated with + * Caffe2-style constructors is dtype uninitialized until the + * first time mutable_data() is called. + */ + bool dtype_initialized() const noexcept { + return data_type_ != caffe2::TypeMeta(); + } + + void set_storage_keep_dtype(at::Storage storage) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_storage ", + err_msg_tensor_metadata_change_not_allowed); + storage_ = std::move(storage); + device_opt_ = storage_.device(); + } + + void set_storage_and_dtype( + at::Storage storage, + const caffe2::TypeMeta data_type) { + set_storage_keep_dtype(std::move(storage)); + data_type_ = data_type; + } + + void empty_tensor_restride_symint(MemoryFormat memory_format); + + /** + * Set the strides of the tensor to match memory_format + * + * WARNING: This function doesn't rearrange data and assumes tensor is a + * memory contiguous + */ + void empty_tensor_restride(MemoryFormat memory_format) { + if (has_symbolic_sizes_strides_) { + empty_tensor_restride_symint(memory_format); + return; + } +#ifdef DEBUG + TORCH_INTERNAL_ASSERT( + compute_numel() == numel_, + "If you are seeing this error, that means 
empty_tensor_restride was " + "called before setting correct numel"); +#endif + switch (memory_format) { + case MemoryFormat::Contiguous: { + // dim_ is a virtual call, don't repeat it + const auto dim_ = dim(); + sizes_and_strides_.resize(dim_); + if (dim_ > 0) { + bool overflowed = false; + const auto last_idx = dim_ - 1; + sizes_and_strides_.stride_at_unchecked(last_idx) = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + overflowed |= c10::mul_overflows( + sizes_and_strides_.stride_at_unchecked(i + 1), + std::max( + sizes_and_strides_.size_at_unchecked(i + 1), 1), + std::addressof(sizes_and_strides_.stride_at_unchecked(i))); + } + TORCH_CHECK(!overflowed, "Stride calculation overflowed"); + } + break; + } + case MemoryFormat::ChannelsLast: { + TORCH_CHECK( + dim() == 4, "required rank 4 tensor to use channels_last format"); + set_sizes_and_strides(sizes(), get_channels_last_strides_2d(sizes())); + break; + } + case MemoryFormat::ChannelsLast3d: { + TORCH_CHECK( + dim() == 5, + "required rank 5 tensor to use channels_last_3d format"); + set_sizes_and_strides(sizes(), get_channels_last_strides_3d(sizes())); + break; + } + case MemoryFormat::Preserve: + TORCH_CHECK(false, "unsupported memory format ", memory_format); + // Cleaning warning messages, no need to break as TORCH_CHECK(false) + // terminates flow. 
+ // break; + case MemoryFormat::NumOptions: + TORCH_INTERNAL_ASSERT(false, "invalid memory format ", memory_format); + } + // recompute contiguous flag, as currently NHWC/NCHW flags are not mutually + // exclusive see #24090 + refresh_contiguous(); + } + + bool is_strides_like(at::MemoryFormat memory_format) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return is_strides_like_custom(memory_format); + } + return is_strides_like_default(memory_format); + } + + bool is_strides_like_channels_last() const { + return is_strides_like(at::MemoryFormat::ChannelsLast); + } + + bool is_strides_like_channels_last_3d() const { + return is_strides_like(at::MemoryFormat::ChannelsLast3d); + } + + bool is_non_overlapping_and_dense_or_false() const { + return sym_is_non_overlapping_and_dense().guard_or_false( + __FILE__, __LINE__); + } + + bool is_non_overlapping_and_dense() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return is_non_overlapping_and_dense_custom(); + } + return is_non_overlapping_and_dense_default(); + } + + SymBool sym_is_non_overlapping_and_dense() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return sym_is_non_overlapping_and_dense_custom(); + } + return sym_is_non_overlapping_and_dense_default(); + } + + // if this returns true, then it is guaranteed that this tensor has symbolic + // sizes/strides + bool has_symbolic_sizes_strides() const { + return has_symbolic_sizes_strides_; + } + + private: + void HandleResize(); + + // The Caffe2 Resize() method supports being called both as Resize({2,2}) as + // well as variadic with Resize(2, 2). These overloads provide all of the + // supported calling configurations, while being overloads (and not templates) + // so that implicit conversions still work. 
+ // + // SetDims on ArrayRef is internally implemented as a template, so we can + // handle both ArrayRefs of different types (there are some uses of + // Resize in Caffe2 which pass in int, not int64_t.) + + template < + typename T, + typename = typename std::enable_if_t>> + bool SetDimsTemplate(ArrayRef src) { + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "SetDims() called on tensor with symbolic shape") + + auto old_numel = numel_; + sizes_and_strides_.resize(src.size()); + int64_t new_numel = 1; + for (const auto i : c10::irange(src.size())) { + new_numel *= src[i]; + sizes_and_strides_.size_at_unchecked(i) = src[i]; + } + numel_ = new_numel; + empty_tensor_restride(MemoryFormat::Contiguous); + return numel_ != old_numel; + } + + bool SetDims(ArrayRef s) { + return SetDimsTemplate(s); + } + + bool SetDims(ArrayRef s) { + return SetDimsTemplate(s); + } + + bool SetDims(ArrayRef s) { + return SetDimsTemplate(s); + } + + bool SetDims() { + return SetDims(IntArrayRef{}); + } + + bool SetDims(const int64_t d0) { + return SetDims(IntArrayRef{d0}); + } + + bool SetDims(const int64_t d0, const int64_t d1) { + return SetDims(IntArrayRef{d0, d1}); + } + + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { + return SetDims(IntArrayRef{d0, d1, d2}); + } + + bool SetDims( + const int64_t d0, + const int64_t d1, + const int64_t d2, + const int64_t d3) { + return SetDims(IntArrayRef{d0, d1, d2, d3}); + } + + /** + * Compute the number of elements based on the sizes of a tensor. 
+ */ + // NB: This is ONLY called when sizes_and_strides_ is used directly; if + // we are virtualizing, then numel calls are virtualized as well, and this + // should never get called + int64_t compute_numel() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!has_symbolic_sizes_strides_); +#if C10_HAS_BUILTIN_OVERFLOW() && !defined(C10_MOBILE) + // Use overflow checks if supported by the compiler + return safe_compute_numel(); +#else + return c10::multiply_integers(sizes_and_strides_.sizes_arrayref()); +#endif + } + + /** + * Compute the number of elements based on the sizes of a + * tensor. Catches integer overflow that may occur when a tensor + * using a sparse layout has multiple dimensions with large sizes. + */ + int64_t safe_compute_numel() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!has_symbolic_sizes_strides_); + uint64_t n = 1; + bool overflows = + c10::safe_multiplies_u64(sizes_and_strides_.sizes_arrayref(), &n); + constexpr auto numel_max = std::min( + static_cast(std::numeric_limits::max()), + static_cast(std::numeric_limits::max())); + + overflows |= (n > numel_max); + TORCH_CHECK(!overflows, "numel: integer multiplication overflow"); + return static_cast(n); + } + + /** + * Compute whether or not a tensor is contiguous based on the sizes and + * strides of a tensor. + */ + bool compute_contiguous() const; + + bool compute_channels_last_contiguous_2d() const; + + bool compute_channels_last_contiguous_3d() const; + + bool compute_strides_like_channels_last_2d() const; + + bool compute_strides_like_channels_last_3d() const; + + bool compute_non_overlapping_and_dense() const; + + protected: + /** + * Recompute the cached numel of a tensor. Call this if you modify + * sizes. + * + * For tensors with sparse layouts, use safe_refresh_numel() instead + * because it will catch integer overflow that may occur for tensors + * with sparse layouts and large dimensions. 
+ * + * NB: We may uselessly recompute cached numel even in situations where + * it is completely never used (e.g., if CustomSizes for Python). However, + * we still must keep it up to date in case the Python overload + * returns None (in which case we will consult the field here). This also + * implies that sizes/strides will never be complete garbage; in the + * very worst case scenario, it will reflect a 1-dim zero size tensor. + */ + void refresh_numel() { + if (has_symbolic_sizes_strides_) { + symbolic_shape_meta().refresh_numel(); + } else { + numel_ = compute_numel(); + } + } + + /** + * Recompute the cached numel of a tensor. Call this if you modify + * sizes. Use only for tensors with sparse layouts because only + * sparse tensor are likely to have sizes that may lead to integer + * overflow when computing numel. + */ + void safe_refresh_numel() { + if (has_symbolic_sizes_strides_) { + // NB: sym numel is done with symbolic integers, which handle overflow + // checking + symbolic_shape_meta().refresh_numel(); + } else { + numel_ = safe_compute_numel(); + } + } + + private: + void _set_is_contiguous(bool b) { + is_contiguous_ = b; + } + + void _set_is_channels_last_contiguous(bool b) { + is_channels_last_contiguous_ = b; + } + + void _set_is_channels_last_3d_contiguous(bool b) { + is_channels_last_3d_contiguous_ = b; + } + + void _set_is_channels_last(bool b) { + is_channels_last_ = b; + } + + void _set_is_channels_last_3d(bool b) { + is_channels_last_3d_ = b; + } + + void _set_is_non_overlapping_and_dense(bool b) { + is_non_overlapping_and_dense_ = b; + } + + // These are little wrappers over the real compute_ functions that + // can make use of other contiguity fields to short circuit. 
+ + bool compute_is_non_overlapping_and_dense_dim4() { + return is_contiguous_ || is_channels_last_contiguous_ || + compute_non_overlapping_and_dense(); + } + + bool compute_channels_last_contiguous_3d_dim5() { + return !is_channels_last_contiguous_ && + compute_channels_last_contiguous_3d(); + } + + bool compute_channels_last_2d_dim5() { + return !is_channels_last_3d_contiguous_ && + compute_strides_like_channels_last_2d(); + } + + bool compute_channels_last_3d_dim5() { + return !is_channels_last_ && compute_strides_like_channels_last_3d(); + } + + bool compute_is_non_overlapping_and_dense_dim5() { + return is_contiguous_ || is_channels_last_contiguous_ || + is_channels_last_3d_contiguous_ || compute_non_overlapping_and_dense(); + } + + bool compute_is_non_overlapping_and_dense_anydim() { + return is_contiguous_ || compute_non_overlapping_and_dense(); + } + + void _refresh_contiguous() { + // Note: + // Dim 0, 1, 2 will never be a channels last 2d/3d format + // Dim 3+ is possibly be a channels last 2d format (Dim 4 only at this + // point) Dim 4+ is possibly be a channels last 3d format (Dim 5 only at + // this point) + switch (dim()) { + case 4: { + _set_is_contiguous(compute_contiguous()); + _set_is_channels_last_contiguous(compute_channels_last_contiguous_2d()); + _set_is_channels_last_3d_contiguous(false); + _set_is_channels_last(compute_strides_like_channels_last_2d()); + _set_is_channels_last_3d(false); + _set_is_non_overlapping_and_dense( + compute_is_non_overlapping_and_dense_dim4()); + break; + } + case 5: { + _set_is_contiguous(compute_contiguous()); + _set_is_channels_last_contiguous(compute_channels_last_contiguous_2d()); + _set_is_channels_last_3d_contiguous( + compute_channels_last_contiguous_3d_dim5()); + _set_is_channels_last(compute_channels_last_2d_dim5()); + _set_is_channels_last_3d(compute_channels_last_3d_dim5()); + _set_is_non_overlapping_and_dense( + compute_is_non_overlapping_and_dense_dim5()); + break; + } + default: + // 
is_channels_last_ and is_channels_last_3d_ are suggested + // memory_format. Being channels_last_contiguous doesn't necessarily + // mean the tensor is strided like channels_last: for strides on channel + // dimension could suggest desired memory_layout, but it doesn't affect + // memory storage + _set_is_contiguous(compute_contiguous()); + _set_is_channels_last_contiguous(false); + _set_is_channels_last_3d_contiguous(false); + _set_is_channels_last(false); + _set_is_channels_last_3d(false); + _set_is_non_overlapping_and_dense( + compute_is_non_overlapping_and_dense_anydim()); + break; + } + } + + protected: + /** + * Recompute the cached contiguity of a tensor. Call this if you modify sizes + * or strides. + */ + void refresh_contiguous() { + if (has_symbolic_sizes_strides_) { + symbolic_shape_meta().refresh_contiguous(); + } else { + _refresh_contiguous(); + } + } + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change); + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. 
+ */ + static void copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change); + + private: + static void copy_tensor_metadata_except_version_counter( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + bool allow_tensor_metadata_change); + + protected: + // Error message to show when the user tries to change tensor metadata on + // Tensor created from .data or .detach(). + // + // See NOTE [ Metadata Change for a Detached Tensor ] for details. + static const char* const err_msg_tensor_metadata_change_not_allowed; + + static void copy_generic_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl); + + public: + void set_storage_access_should_throw() { + storage_access_should_throw_ = true; + } + + public: + void set_custom_sizes_strides(SizesStridesPolicy policy) { + custom_sizes_strides_ = static_cast(policy); + refresh_sizes_strides_policy(); + } + + void set_python_custom_sizes_strides(SizesStridesPolicy policy) { + python_custom_sizes_strides_ = static_cast(policy); + refresh_sizes_strides_policy(); + } + + void set_custom_device(bool custom_device) { + custom_device_ = custom_device; + refresh_device_policy(); + } + + void set_custom_layout(bool custom_layout) { + custom_layout_ = custom_layout; + refresh_layout_policy(); + } + + void set_python_custom_device(bool custom_device) { + python_custom_device_ = custom_device; + refresh_device_policy(); + } + + void set_python_custom_layout(bool custom_layout) { + python_custom_layout_ = custom_layout; + refresh_layout_policy(); + } + + protected: + void refresh_sizes_strides_policy() { + if (has_symbolic_sizes_strides_) { + sizes_strides_policy_ = + static_cast(SizesStridesPolicy::CustomSizes); + } else { + sizes_strides_policy_ = + std::max(custom_sizes_strides_, python_custom_sizes_strides_); + } + } + + void refresh_device_policy() { + device_policy_ = custom_device_ || python_custom_device_; + 
} + + void refresh_layout_policy() { + layout_policy_ = custom_layout_ || python_custom_layout_; + } + + protected: + Storage storage_; + + private: + // This pointer points to an AutogradMeta struct that stores autograd-specific + // fields (such as grad_ / grad_fn_ / grad_accumulator_). This pointer always + // has unique ownership (meaning only one TensorImpl can own it at a time). + // + // autograd_meta_ can be nullptr, as an optimization. When this occurs, it is + // equivalent to having an autograd_meta_ pointing to a default constructed + // AutogradMeta; intuitively, tensors which don't require grad will have this + // field set to null. + // + // This means accessors on autograd_meta_ have to be careful to test if they + // got a nullptr, and handle default behavior appropriately in that case. + // + // Note that we don't enforce the invariant that if the AutogradMeta is + // default constructed, it is nullptr (to do this, we'd have to continuously + // check if an AutogradMeta became, by mutation, equal to the default + // constructed form. (This might be useful, but it seems rare enough that + // a requires_grad=True variable will turn back into the requires_grad=False + // version.) So there are three representable states: + // + // 1. autograd_meta_ == nullptr + // 2. autograd_meta_ is default constructed (semantically, same as (1)) + // 3. autograd_meta_ has nontrivial information content + // + std::unique_ptr autograd_meta_ = nullptr; + + protected: + std::unique_ptr extra_meta_ = nullptr; + + c10::VariableVersion version_counter_; + + impl::PyObjectSlot pyobj_slot_; + + c10::impl::SizesAndStrides sizes_and_strides_; + + int64_t storage_offset_ = 0; + // If sizes and strides are empty, the numel is 1!! However, most of the + // time, we will immediately set sizes to {0} and reset numel to 0. + // (Can't do that in the default initializers, because there's no way to + // spell "allocate a one-element array" for strides_). 
+ int64_t numel_ = 1; + + // INVARIANT: When storage is non-null, this type meta must + // agree with the type meta in storage + caffe2::TypeMeta data_type_; + + // NOTE [std::optional operator usage in CUDA] + // Our optional definition doesn't compile in .cu file if `value()` or + // `operator->` are used. Instead, we always use `operator*`. + // See https://github.com/pytorch/pytorch/issues/18496 for more info. + // If this is too burdensome to maintain, we can just + // manually implement this with an additional bool. + + // INVARIANT: When storage is non-null, this Device must + // agree with the type meta in storage. + // + // INVARIANT: device_opt_ is only nullopt for undefined tensors + // (which do not have a device.) + std::optional device_opt_; + + // default member initializers for bit-fields only available with -std=c++2a + // or -std=gnu++2a + inline void init_bitfields() { + is_contiguous_ = true; + is_channels_last_ = false; + is_channels_last_contiguous_ = false; + is_channels_last_3d_ = false; + is_channels_last_3d_contiguous_ = false; + is_non_overlapping_and_dense_ = true; + is_wrapped_number_ = false; + allow_tensor_metadata_change_ = true; + reserved_ = false; + sizes_strides_policy_ = static_cast(SizesStridesPolicy::Default); + custom_sizes_strides_ = static_cast(SizesStridesPolicy::Default); + python_custom_sizes_strides_ = + static_cast(SizesStridesPolicy::Default); + python_custom_device_ = false; + python_custom_layout_ = false; + custom_device_ = false; + custom_layout_ = false; + device_policy_ = false; + layout_policy_ = false; + storage_access_should_throw_ = false; + has_symbolic_sizes_strides_ = false; + } + + // Tensor is contiguous + bool is_contiguous_ : 1; + + // Tensor is a subclass that does not permit storage access. 
+ bool storage_access_should_throw_ : 1; + + // Tensor is stored in the channels last 2d memory format, when dimensions + // order is (N)CHW and C-strides < W-strides < H-strides (< N-strides) + // (If size of any dimension is equal to 1, this dimension strides value + // is not taken into account). + bool is_channels_last_ : 1; + + // Channels last contiguous tensor is channel last tensor which occupies + // contiguous memory block. + bool is_channels_last_contiguous_ : 1; + + // Tensor is stored in the channels last 3d memory format, when dimensions + // order is (N)CDHW and C-strides < W-strides < H-strides < D - strides (< + // N-strides) (If size of any dimension is equal to 1, this dimension strides + // value is not taken into account). + bool is_channels_last_3d_ : 1; + + // Channels last 3d contiguous tensor is channel last 3d tensor which occupies + // contiguous memory block. + bool is_channels_last_3d_contiguous_ : 1; + + // Dense tensor is the tensor that store values in a contiguous block of + // memory. Non-overlapping tensor is the tensor in which elements occupy + // individual non-repetitive memory. + bool is_non_overlapping_and_dense_ : 1; + + bool is_wrapped_number_ : 1; + + // NOTE [ Metadata Change for a Detached Tensor ] + // + // Normally, a user is allowed to change the tensor metadata + // (e.g. sizes / strides / storage / storage_offset) of a tensor. + // However, if the tensor is created by `t1_detached = t1.data` in Python + // or `t1_detached = t1.detach()` in Python/C++, those changes to the + // tensor metadata of `t1_detached` will not be propagated back to the + // original tensor `t1`. In order to make such changes explicitly illegal, + // we created the `allow_tensor_metadata_change_` flag, to prevent users + // from changing metadata of the detached tensor and expecting the original + // tensor to also be updated. 
+ // + // NOTE: For a full list of tensor metadata fields, please see + // `copy_tensor_metadata()` in TensorImpl and its subclasses to find + // which fields are copied by value. + bool allow_tensor_metadata_change_ : 1; + + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ : 1; + + // Call _custom() virtual methods for + // strides()/is_contiguous()/sizes()/dim()/numel() + // This is a combination of sizes_strides_custom_dispatch_ + // and has_symbolic_sizes_strides_ + uint8_t sizes_strides_policy_ : 2; + + // Whether or not sizes_and_strides_ contains a symbolic value. + bool has_symbolic_sizes_strides_ : 1; + + // Call _custom() virtual method for + // strides()/is_contiguous()/sizes()/dim()/numel() + uint8_t custom_sizes_strides_ : 2; + + // Combo of custom_ and python_custom_ + bool device_policy_ : 1; + bool layout_policy_ : 1; + + // Call _custom() virtual method for device() + bool custom_device_ : 1; + + // Call _custom() virtual method for layout() + bool custom_layout_ : 1; + + // Call into Python for + // strides()/is_contiguous()/sizes()/dim()/numel() + uint8_t python_custom_sizes_strides_ : 2; + + // Call into Python for device() + bool python_custom_device_ : 1; + + // Call into Python for layout() + bool python_custom_layout_ : 1; + + // The set of DispatchKeys which describe this tensor. NB: this + // does NOT include Autograd (historically, it did, but + // not anymore!) 
+ // + // INVARIANT: extra_meta_->named_tensor_meta_ != nullptr <==> + // key_set_.has(DispatchKey::Named) + DispatchKeySet key_set_; + + private: + // C10_TensorImpl_Size_Check_Dummy_Class needs to be friends with + // TensorImpl so it can inspect the size of private fields + template < + size_t cplusplus, + size_t clang_ver_major, + size_t gcc_ver, + size_t gcc_ver_minor, + size_t nvcc, + size_t cuda_version, + size_t cuda_version_major, + size_t ptr_size> + friend class C10_TensorImpl_Size_Check_Dummy_Class; +}; + +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + +// Note [TensorImpl size constraints] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Changed the size of TensorImpl? If the size went down, good for +// you! Adjust the documentation below and the expected size. +// Did it go up? Read on... +// +// Struct size matters. In some production systems at Facebook, we have +// 400M live tensors during a training run. Do the math: every 64-bit +// word you add to Tensor is an extra 3.2 gigabytes in RAM. +// +// If you are a Facebook employee, you can check if the run in question +// has tipped you over the point using the command here: +// https://fburl.com/q5enpv98 +// +// For reference, we OOMed at 160 bytes (20 words) per TensorImpl. +// This is not counting overhead from strides out-of-line allocation and +// StorageImpl space and this is from before we inlined sizes and strides +// directly into TensorImpl as SmallVectors. +// +// Our memory usage on 32-bit systems is suboptimal, but we're not checking +// for it at the moment (to help avoid rage inducing cycles when the +// 32-bit number is wrong). 
+// +// Current breakdown: +// +// vtable pointer +// strong refcount TODO: pack these into one word +// weak refcount +// storage pointer +// autograd metadata pointer +// named tensor metadata pointer +// version counter pointer +// PyObjectSlot +// SizesAndStrides size/pointer +// SizesAndStrides sizes (pre-allocated 0) +// SizesAndStrides sizes (pre-allocated 1) +// SizesAndStrides sizes (pre-allocated 2) +// SizesAndStrides sizes (pre-allocated 3) +// SizesAndStrides sizes (pre-allocated 4) +// SizesAndStrides strides (pre-allocated 0) +// SizesAndStrides strides (pre-allocated 1) +// SizesAndStrides strides (pre-allocated 2) +// SizesAndStrides strides (pre-allocated 3) +// SizesAndStrides strides (pre-allocated 4) +// storage offset +// numel +// data type, device, is_contiguous, storage_access_should_throw_, bitfields +// DispatchKeySet +// + +// Various preprocessor macros we use to check that the +// TensorImpl size hasn't changed unexpectedly. We undef +// these later. +#ifndef __NVCC__ +#define C10_NVCC 0 +#else +#define C10_NVCC __NVCC__ +#endif + +#ifndef __CUDA_VER_MAJOR__ +#define C10_CUDA_VERSION_MAJOR 0 +#else +#define C10_CUDA_VERSION_MAJOR __CUDA_VER_MAJOR__ +#endif + +#ifndef CUDA_VERSION +#define C10_CUDA_VERSION 0 +#else +#define C10_CUDA_VERSION CUDA_VERSION +#endif + +#ifndef __clang_major__ +#define C10_CLANG_MAJOR_VERSION 0 +#else +#define C10_CLANG_MAJOR_VERSION __clang_major__ +#endif + +#ifndef __GNUC__ +#define C10_GCC_VERSION 0 +#else +#define C10_GCC_VERSION __GNUC__ +#endif + +#ifndef __GNUC_MINOR__ +#define C10_GCC_VERSION_MINOR 0 +#else +#define C10_GCC_VERSION_MINOR __GNUC_MINOR__ +#endif + +// We use a templatized class to both contain the logic of checking the sizes +// as well as to provide compile-time information that might be useful in +// figuring out why sizes may have changed. +// All the compile time information is given by the template fields that are +// always printed by the compiler when the static_assert fails. 
+template < + size_t cplusplus = __cplusplus, + size_t clang_ver_major = C10_CLANG_MAJOR_VERSION, + size_t gcc_ver = C10_GCC_VERSION, + size_t gcc_ver_minor = C10_GCC_VERSION_MINOR, + size_t nvcc = C10_NVCC, + size_t cuda_version = C10_CUDA_VERSION, + size_t cuda_version_major = C10_CUDA_VERSION_MAJOR, + size_t ptr_size = sizeof(void*)> +class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl { + // Names of (non-bitfield) fields in TensorImpl; used to provide + // compile-time info about fields whose size changes unexpectedly. + enum class FieldNameEnum { + storage_, + autograd_meta_, + extra_meta_, + version_counter_, + pyobj_slot_, + sizes_and_strides_, + storage_offset_, + numel_, + data_type_, + device_opt_, + key_set_, + TOTAL_SIZE + }; + + // Provides compile-time equality check that reveals what numbers + // were used and on which quantity + template + constexpr static bool are_equal() { + static_assert( + Actual == Expected, + "Actual and Expected sizes of a field did not match!"); + return true; + } + + // Provides compile-time <= check that reveals what numbers + // were used and on which quantity + template + constexpr static bool is_le() { + static_assert( + Actual <= Expected, + "Actual and Expected sizes of a field did not match!"); + return true; + } + + public: + // Compile-time check that TensorImpl field sizes are as expected + // + // Observed total sizes and associated versions + // If you find a flag that predicts when unique_ptr has 16 bytes + // on 64-bit systems or when sizes_and_strides_ is 84 vs 88 bytes + // on 32-bit systems you get a cookie! + // Length | LLVM | GCC | C++ | CUDA + // 192 | ? | 11.2 | 201703 | 11040 + // 208 | ? | 11.2 | 201703 | 11040 + // 208 | ? | 11.2 | 201402 | 11040 + // 192 | ? | 11.2 | 201402 | 11040 + // 160 | 12 | 4.2 | 201703 | 0 + // + // To keep things clean, we split on systems here. 
+ +#if UINTPTR_MAX == 0xFFFFFFFF + // This is a 32-bit system + static constexpr bool check_sizes() { + constexpr size_t tsize = 20 * sizeof(int64_t); + + // clang-format off + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + is_le(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + is_le(); + // clang-format on + + return true; + } +#else + // This is a 64-bit system + static constexpr bool check_sizes() { + constexpr size_t tsize = 26 * sizeof(int64_t); + + // clang-format off + are_equal(); + // On some systems involving NVCC the size of unique_ptr is 16 bytes. We haven't + // figured out how to detect those via macro preprocessors yet, so we use <= + // comparisons for the relevant fields. + is_le(); + is_le(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + is_le(); + // clang-format on + + return true; + } +#endif +}; + +// We use a class to encapsulate size-checking logic with +// templates to capture sizes and flags. We call this within +// a static assert to prove there is no run-time behaviour. +// Since the methods we call return either true or fail their +// own static_asserts, we should never see the error messages +// below. We have to provide it though for c++ <17. +static_assert( + C10_TensorImpl_Size_Check_Dummy_Class<>::check_sizes(), + "You should not see this message."); + +// Clean up after ourselves +#undef C10_NVCC +#undef C10_CUDA_VERSION_MAJOR +#undef C10_CUDA_VERSION +#undef C10_CLANG_MAJOR_VERSION +#undef C10_GCC_VERSION +#undef C10_GCC_VERSION_MINOR + +} // namespace c10 + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorOptions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..7add8edc4361ab3c38675d8565ad13b4d1ed48b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorOptions.h @@ -0,0 +1,791 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") + +namespace c10 { + +inline ScalarType dtype_or_default(std::optional dtype) { + return dtype.value_or(get_default_dtype_as_scalartype()); +} + +inline caffe2::TypeMeta dtype_or_default( + std::optional dtype) { + return dtype.value_or(get_default_dtype()); +} + +inline Layout layout_or_default(std::optional layout) { + return layout.value_or(kStrided); +} + +inline Device device_or_default(std::optional device) { + return device.value_or(Device(kCPU)); +} + +inline bool pinned_memory_or_default(std::optional pinned_memory) { + return pinned_memory.value_or(false); +} + +/// A class to encapsulate construction axes of an Tensor. TensorOptions was +/// designed to support the Python style API for specifying construction options +/// on factory functions, e.g., +/// +/// torch.zeros(2, 3, dtype=torch.int32) +/// +/// Because C++ doesn't natively support keyword arguments, there must be +/// another way of specifying keyword-like arguments. 
TensorOptions is a +/// builder class which can be used to construct this "dictionary" of keyword +/// arguments: functions which support TensorOptions conventionally take this +/// argument optionally as their last argument. +/// +/// WARNING: In PyTorch, there are `torch::` variants of factory functions, +/// e.g., torch::zeros for at::zeros. These return Variables (while the +/// stock ATen functions return plain Tensors). If you mix these functions +/// up, you WILL BE SAD. +/// +/// Rather than use the constructor of this class directly, you should prefer to +/// use the constructor functions, and then chain setter methods on top of them. +/// +/// at::device(at::kCUDA).dtype(kInt) +/// at::dtype(at::kInt) +/// +/// Additionally, anywhere a TensorOptions is expected, you can directly +/// pass at::kCUDA / at::kInt, and it will implicitly convert to a +/// TensorOptions. +/// +/// Here are some recommended ways to create a 2x2 tensor of zeros +/// with certain properties. These all *implicitly* make use of +/// TensorOptions, even if they don't mention the class explicitly: +/// +/// at::zeros({2,2}, at::kCUDA); +/// at::zeros({2,2}, at::kLong); +/// at::zeros({2,2}, at::device(at::kCUDA).dtype(at::kLong())); +/// at::zeros({2,2}, at::device({at::kCUDA, 1})); // place on device 1 +/// at::zeros({2,2}, at::requires_grad()); +/// + +/// NOTE [ TensorOptions Constructors ] +/// +/// TensorOptions is like a dictionary with entries from the set: +/// {requires_grad, device, dtype, layout}, where each entry may be +/// unspecified (i.e., is optional). It is used to specify the properties of +/// tensors in many places both in C++ internal and API, e.g., tensor factory +/// methods like `at::empty({10}, options)`, tensor conversions like +/// `tensor.to(...)`, etc. 
+/// +/// To provide a simple API that is consistent with Python, where one can do +/// `torch.empty(sizes, X)` with `X` being a `torch.device`, `torch.dtype`, or a +/// `torch.layout`, we want TensorOptions to be implicitly convertible from +/// `ScalarType dtype`, `Layout layout` and `Device device`. Therefore, we have +/// three implicit constructors from each of these three types. +/// +/// This is sufficient for `ScalarType` and `Layout` as they are simple Enum +/// classes. However, `Device` is an ordinary class with implicit constructors +/// `Device(DeviceType, DeviceIndex = -1)` and `Device(std::string)` to be +/// consistent with Python API, where strings are treated as equivalent with a +/// `torch.device` object (e.g., "cuda:1" can be passed to everywhere a +/// `torch.device("cuda:1")` is accepted). To support the syntax +/// `at::empty({10}, {kCUDA, 1})` and `tensor.to(kCUDA)`, we need to make sure +/// that `TensorOptions` is implicitly constructible with any arguments that a +/// `Device` can constructed from. So we have, +/// +/// /* implicit */ TensorOptions(T&& device) : TensorOptions() { +/// this->set_device(device); +/// } +/// +/// template ::value>> +/// /* implicit */ TensorOptions(Args&&... args) +/// : TensorOptions(Device(std::forward(args)...)) {} +/// +/// +/// But this will be problematic. Consider this: `TensorOptions({kCUDA, 1})`. +/// Compiler will complain about ambiguity between the copy constructor and the +/// `Device` constructor because `{kCUDA, 1}` can be converted to both a +/// `TensorOption` and a `Device`. +/// +/// To get around this, we templatize the `Device` constructor. Since overload +/// resolution is done before template resolution, our problem is solved. 
+ +DispatchKey computeDispatchKey( + std::optional dtype, + std::optional layout, + std::optional device); + +struct C10_API TensorOptions { + TensorOptions() + : requires_grad_(false), + pinned_memory_(false), + has_device_(false), + has_dtype_(false), + has_layout_(false), + has_requires_grad_(false), + has_pinned_memory_(false), + has_memory_format_(false) {} + + /// Constructs a `TensorOptions` object with the given layout. + /* implicit */ TensorOptions(Layout layout) : TensorOptions() { + this->set_layout(layout); + } + + /// Constructs a `TensorOptions` object with the given device. + /// See NOTE [ TensorOptions Constructors ] on why this is templatized. + template < + typename T, + typename = std::enable_if_t, Device>>> + /* implicit */ TensorOptions(T&& device) : TensorOptions() { + this->set_device(std::forward(device)); + } + + /// Constructs a `TensorOptions` object from arguments allowed in `Device` + /// constructors. + /// + /// See NOTE [ TensorOptions Constructors ]. + /// + /// NB: Ideally we only allow implicit constructors here. But there is no easy + /// way to detect them. So we have this one that allows explicit + /// constructors too. + template < + typename... Args, + typename = std::enable_if_t>> + /* implicit */ TensorOptions(Args&&... args) + : TensorOptions(Device(std::forward(args)...)) {} + + /// Constructs a `TensorOptions` object with the given dtype. + /* implicit */ TensorOptions(caffe2::TypeMeta dtype) : TensorOptions() { + this->set_dtype(dtype); + } + + /// legacy constructor to support ScalarType + /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() { + this->set_dtype(dtype); + } + + /// Constructs a `TensorOptions` object with the given memory format. + /* implicit */ TensorOptions(MemoryFormat memory_format) : TensorOptions() { + set_memory_format(memory_format); + } + + /// Return a copy of `TensorOptions` with `device` set to the given one, or + /// cleared if `device` is `nullopt`. 
+ [[nodiscard]] TensorOptions device( + std::optional device) const noexcept { + TensorOptions r = *this; + r.set_device(device); + return r; + } + + /// Return a copy of `TensorOptions` with `device` set to the given one. + /// (This overload ensures that variadic template std::optional constructor + /// for Device work correctly.) + template + [[nodiscard]] TensorOptions device(Args&&... args) const noexcept { + return device( + std::optional(std::in_place, std::forward(args)...)); + } + + /// Return a copy of `TensorOptions`, but with device set to CUDA, and the + /// device index set to the given one. + /// + /// TODO: This function encourages bad behavior (assuming CUDA is + /// the only device that matters). Get rid of it / rename it. + [[nodiscard]] TensorOptions device_index( + c10::DeviceIndex device_index) const noexcept { + return device(Device::Type::CUDA, device_index); + } + + /// Return a copy of `TensorOptions` with `dtype` set to the given one. + [[nodiscard]] TensorOptions dtype( + std::optional dtype) const noexcept { + TensorOptions r = *this; + r.set_dtype(dtype); + return r; + } + + // legacy function to support ScalarType + [[nodiscard]] TensorOptions dtype( + std::optional dtype) const noexcept { + TensorOptions r = *this; + r.set_dtype(dtype); + return r; + } + + // Since dtype is taken... + template + TensorOptions& dtype() { + dtype_ = caffe2::TypeMeta::Make(); + has_dtype_ = true; + return *this; + } + + /// Sets the layout of the `TensorOptions`. + [[nodiscard]] TensorOptions layout( + std::optional layout) const noexcept { + TensorOptions r = *this; + r.set_layout(layout); + return r; + } + + /// Sets the `requires_grad` property of the `TensorOptions`. + [[nodiscard]] TensorOptions requires_grad( + std::optional requires_grad) const noexcept { + TensorOptions r = *this; + r.set_requires_grad(requires_grad); + return r; + } + + /// Sets the `pinned_memory` property on the `TensorOptions`. 
+ [[nodiscard]] TensorOptions pinned_memory( + std::optional pinned_memory) const noexcept { + TensorOptions r = *this; + r.set_pinned_memory(pinned_memory); + return r; + } + + /// Sets the `memory_format` property on `TensorOptions`. + [[nodiscard]] TensorOptions memory_format( + std::optional memory_format) const noexcept { + TensorOptions r = *this; + r.set_memory_format(memory_format); + return r; + } + + /// Returns the device of the `TensorOptions`. + Device device() const noexcept { + return device_or_default(device_opt()); + } + + /// Returns whether the device is specified. + bool has_device() const noexcept { + return has_device_; + } + + /// Returns the device of the `TensorOptions`, or `std::nullopt` if + /// device is not specified. + std::optional device_opt() const noexcept { + return has_device_ ? std::make_optional(device_) : std::nullopt; + } + + /// Returns the device index of the `TensorOptions`. + c10::DeviceIndex device_index() const noexcept { + return device().index(); + } + + /// Returns the dtype of the `TensorOptions`. + caffe2::TypeMeta dtype() const noexcept { + return dtype_or_default(dtype_opt()); + } + + /// Returns whether the dtype is specified. + bool has_dtype() const noexcept { + return has_dtype_; + } + + /// Returns the dtype of the `TensorOptions`, or `std::nullopt` if + /// device is not specified. + std::optional dtype_opt() const noexcept { + return has_dtype_ ? std::make_optional(dtype_) : std::nullopt; + } + + /// Returns the layout of the `TensorOptions`. + Layout layout() const noexcept { + return layout_or_default(layout_opt()); + } + + /// Returns whether the layout is specified. + bool has_layout() const noexcept { + return has_layout_; + } + + /// Returns the layout of the `TensorOptions`, or `std::nullopt` if + /// layout is not specified. + std::optional layout_opt() const noexcept { + return has_layout_ ? 
std::make_optional(layout_) : std::nullopt; + } + + /// Returns the `requires_grad` property of the `TensorOptions`. + bool requires_grad() const noexcept { + return has_requires_grad_ ? requires_grad_ : false; + } + + /// Returns whether the `requires_grad` is specified. + bool has_requires_grad() const noexcept { + return has_requires_grad_; + } + + /// Returns the `requires_grad` property of the `TensorOptions`, or + /// `std::nullopt` if `requires_grad` is not specified. + std::optional requires_grad_opt() const noexcept { + return has_requires_grad_ ? std::make_optional(requires_grad_) + : std::nullopt; + } + + /// Returns the `pinned_memory` property of the `TensorOptions`. + bool pinned_memory() const noexcept { + return pinned_memory_or_default(pinned_memory_opt()); + } + + /// Returns whether the `pinned_memory` is specified. + bool has_pinned_memory() const noexcept { + return has_pinned_memory_; + } + + /// Returns if the layout is sparse + bool is_sparse() const { + return layout_ == c10::Layout::Sparse; + } + + /// Returns if the layout is sparse CSR, deprecated, use + /// is_sparse_compressed() instead + bool is_sparse_csr() const { + return layout_ == c10::Layout::SparseCsr; + } + + bool is_sparse_compressed() const { + return layout_ == c10::Layout::SparseCsr || + layout_ == c10::Layout::SparseCsc || + layout_ == c10::Layout::SparseBsr || layout_ == c10::Layout::SparseBsc; + } + + // For compatibility with legacy tensor.type() comparisons + bool type_equal(const TensorOptions& other) const { + return computeDispatchKey() == other.computeDispatchKey() && + typeMetaToScalarType(dtype_) == typeMetaToScalarType(other.dtype()); + } + + /// Returns the `pinned_memory` property of the `TensorOptions`, or + /// `std::nullopt` if `pinned_memory` is not specified. + std::optional pinned_memory_opt() const noexcept { + return has_pinned_memory_ ? 
std::make_optional(pinned_memory_) + : std::nullopt; + } + + /// Returns whether the `memory_layout` is specified + bool has_memory_format() const noexcept { + return has_memory_format_; + } + + // NB: memory_format() getter is PURPOSELY not defined, as the default + // behavior of memory_format varies from function to function. + + /// Returns the `memory_layout` property of `TensorOptions, or + /// `std::nullopt` if `memory_format` is not specified. + std::optional memory_format_opt() const noexcept { + return has_memory_format_ ? std::make_optional(memory_format_) + : std::nullopt; + } + + // Resolves the ATen backend specified by the current construction axes. + // TODO: Deprecate this + Backend backend() const { + return at::dispatchKeyToBackend(computeDispatchKey()); + } + + /// Return the right-biased merge of two TensorOptions. This has the + /// effect of overwriting settings from self with specified options + /// of options. + /// + /// NB: This merging operation does NOT respect device merges. + /// For example, if you device({kCUDA, 1}).merge_in(kCUDA) + /// you will get kCUDA in the end! Functions like Tensor.new_empty + /// ensure the right device is selected anyway by way of a + /// device guard. + /// + TensorOptions merge_in(TensorOptions options) const noexcept { + TensorOptions merged = *this; + if (options.has_device()) + merged.set_device(options.device_opt()); + if (options.has_dtype()) + merged.set_dtype(options.dtype_opt()); + if (options.has_layout()) + merged.set_layout(options.layout_opt()); + // NB: requires grad is right biased; not a logical AND/OR! 
+ if (options.has_requires_grad()) + merged.set_requires_grad(options.requires_grad_opt()); + if (options.has_pinned_memory()) + merged.set_pinned_memory(options.pinned_memory_opt()); + if (options.has_memory_format()) + merged.set_memory_format(options.memory_format_opt()); + return merged; + } + + // TODO remove after TensorOptions rationalization + TensorOptions merge_memory_format( + std::optional optional_memory_format) const noexcept { + TensorOptions merged = *this; + if (optional_memory_format.has_value()) { + merged.set_memory_format(optional_memory_format); + } + return merged; + } + + // INVARIANT: computeDispatchKey returns only the subset of dispatch keys for + // which dispatchKeyToBackend is injective, if it is defined at all (for + // the most part, this just means that this function never returns an + // Autograd key) + DispatchKey computeDispatchKey() const { + return c10::computeDispatchKey( + optTypeMetaToScalarType(dtype_opt()), layout_opt(), device_opt()); + } + + private: + // These methods are currently private because I'm not sure if it's wise + // to actually publish them. They are methods because I need them in + // the constructor and the functional API implementation. + // + // If you really, really need it, you can make these public, but check if you + // couldn't just do what you need with the functional API. Similarly, these + // methods are not chainable, because if you wanted chaining, you probably + // want to use the functional API instead. (It's probably OK to make + // these chainable, because these functions are all explicitly annotated + // with a ref-qualifier, the trailing &, that makes them illegal to call + // on temporaries.) + + /// Mutably set the device of `TensorOptions`. + void set_device(std::optional device) & noexcept { + if (device) { + device_ = *device; + has_device_ = true; + } else { + has_device_ = false; + } + } + + /// Mutably set the dtype of `TensorOptions`. 
+ void set_dtype(std::optional dtype) & noexcept { + if (dtype) { + dtype_ = *dtype; + has_dtype_ = true; + } else { + has_dtype_ = false; + } + } + + // legacy function to support ScalarType + void set_dtype(std::optional dtype) & noexcept { + if (dtype) { + dtype_ = scalarTypeToTypeMeta(*dtype); + has_dtype_ = true; + } else { + has_dtype_ = false; + } + } + + /// Mutably set the layout of `TensorOptions`. + void set_layout(std::optional layout) & noexcept { + if (layout) { + layout_ = *layout; + has_layout_ = true; + } else { + has_layout_ = false; + } + } + + /// Mutably set the `requires_grad` property of `TensorOptions`. + void set_requires_grad(std::optional requires_grad) & noexcept { + if (requires_grad) { + requires_grad_ = *requires_grad; + has_requires_grad_ = true; + } else { + has_requires_grad_ = false; + } + } + + /// Mutably set the `pinned_memory` property of `TensorOptions`. + void set_pinned_memory(std::optional pinned_memory) & noexcept { + if (pinned_memory) { + pinned_memory_ = *pinned_memory; + has_pinned_memory_ = true; + } else { + has_pinned_memory_ = false; + } + } + + /// Mutably set the `memory_Format` property of `TensorOptions`. + void set_memory_format(std::optional memory_format) & noexcept { + if (memory_format) { + memory_format_ = *memory_format; + has_memory_format_ = true; + } else { + has_memory_format_ = false; + } + } + + // WARNING: If you edit TensorOptions to add more options, you + // may need to adjust the implementation of Tensor::options. + // The criteria for whether or not Tensor::options must be adjusted + // is whether or not the new option you added should preserved + // by functions such as empty_like(); if it should be preserved, + // you must adjust options(). + // + // TODO: MemoryFormat is not implemented in this way + + // NB: We didn't use std::optional here, because then we can't pack + // the has_***_ boolean fields. 
+ + Device device_ = at::kCPU; // 16-bit + caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make(); // 16-bit + Layout layout_ = at::kStrided; // 8-bit + MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit + + // Bitmask required here to get this to fit inside 32 bits (or even 64 bits, + // for that matter) + + bool requires_grad_ : 1; + bool pinned_memory_ : 1; + + bool has_device_ : 1; + bool has_dtype_ : 1; + bool has_layout_ : 1; + bool has_requires_grad_ : 1; + bool has_pinned_memory_ : 1; + bool has_memory_format_ : 1; +}; + +// We should aspire to fit in one machine-size word; but a size greater than two +// words is too much. (We are doing terribly on 32-bit archs, where we require +// three machine size words to store tensor options. Eek!) +static_assert( + sizeof(TensorOptions) <= sizeof(int64_t) * 2, + "TensorOptions must fit in 128-bits"); + +/// Convenience function that returns a `TensorOptions` object with the `dtype` +/// set to the given one. +inline TensorOptions dtype(caffe2::TypeMeta dtype) { + return TensorOptions().dtype(dtype); +} + +// legacy function to support ScalarType +inline TensorOptions dtype(ScalarType dtype) { + return TensorOptions().dtype(scalarTypeToTypeMeta(dtype)); +} + +/// Convenience function that returns a `TensorOptions` object with the `layout` +/// set to the given one. +inline TensorOptions layout(Layout layout) { + return TensorOptions().layout(layout); +} + +/// Convenience function that returns a `TensorOptions` object with the `device` +/// set to the given one. +inline TensorOptions device(Device device) { + return TensorOptions().device(device); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `device` set to CUDA and the `device_index` set to the given one. 
+inline TensorOptions device_index(c10::DeviceIndex device_index) { + return TensorOptions().device_index(device_index); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `requires_grad` set to the given one. +inline TensorOptions requires_grad(bool requires_grad = true) { + return TensorOptions().requires_grad(requires_grad); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `memory_format` set to the given one. +inline TensorOptions memory_format(MemoryFormat memory_format) { + return TensorOptions().memory_format(memory_format); +} + +C10_API std::ostream& operator<<( + std::ostream& stream, + const TensorOptions& options); + +template +inline TensorOptions dtype() { + return dtype(caffe2::TypeMeta::Make()); +} + +inline std::string toString(const TensorOptions& options) { + std::ostringstream stream; + stream << options; + return stream.str(); +} + +// This is intended to be a centralized location by which we can determine +// what an appropriate DispatchKey for a tensor is. 
+inline DispatchKey computeDispatchKey( + std::optional dtype, + std::optional layout, + std::optional device) { + const auto layout_ = layout_or_default(layout); + const auto device_ = device_or_default(device); + switch (layout_) { + case Layout::Jagged: + case Layout::Strided: { + const auto dtype_ = dtype_or_default(dtype); + switch (device_.type()) { +#define DO_CASE(device, _) \ + case c10::DeviceType::device: { \ + if (isQIntType(dtype_)) { \ + return DispatchKey::Quantized##device; \ + } \ + return DispatchKey::device; \ + } + C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused) +#undef DO_CASE + case c10::DeviceType::FPGA: + return DispatchKey::FPGA; + case c10::DeviceType::MAIA: + return DispatchKey::MAIA; + case c10::DeviceType::Vulkan: + return DispatchKey::Vulkan; + case c10::DeviceType::Metal: + return DispatchKey::Metal; + case c10::DeviceType::MKLDNN: + case c10::DeviceType::OPENGL: + case c10::DeviceType::OPENCL: + case c10::DeviceType::IDEEP: + TORCH_INTERNAL_ASSERT( + 0, + "This is a grandfathered Caffe2 device type ", + device_.type(), + ", it shouldn't ever convert to a DispatchKey. 
File a bug describing what you were doing if you think this is in error."); + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for dense layout: ", + device_.type()); + } + } + case Layout::Sparse: + switch (device_.type()) { +#define DO_CASE(device, _) \ + case c10::DeviceType::device: { \ + return DispatchKey::Sparse##device; \ + } + C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused) +#undef DO_CASE + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for sparse layout: ", + device_.type()); + } + case Layout::Mkldnn: + switch (device_.type()) { + case c10::DeviceType::CPU: + return DispatchKey::MkldnnCPU; + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for mkldnn layout: ", + device_.type()); + } + case Layout::SparseCsr: + case Layout::SparseCsc: + case Layout::SparseBsr: + case Layout::SparseBsc: + switch (device_.type()) { +#define DO_CASE(device, _) \ + case c10::DeviceType::device: { \ + return DispatchKey::SparseCsr##device; \ + } + C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused) +#undef DO_CASE + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for ", + layout_, + " layout: ", + device_.type()); + } + default: + TORCH_CHECK(false, "Unsupported layout: ", layout_); + } +} + +inline Layout dispatchKeyToLayout(DispatchKey dispatch_key) { + switch (dispatch_key) { +#define DO_CASE(bc, _) case DispatchKey::Sparse##bc: + C10_FORALL_BACKEND_COMPONENTS(DO_CASE, unused) +#undef DO_CASE + return Layout::Sparse; +#define DO_CASE(bc, _) case DispatchKey::SparseCsr##bc: + C10_FORALL_BACKEND_COMPONENTS(DO_CASE, unused) +#undef DO_CASE + TORCH_CHECK( + false, "Cannot map DispatchKey ", dispatch_key, " to a unique layout."); + case DispatchKey::MkldnnCPU: + return Layout::Mkldnn; + default: + return Layout::Strided; + } +} + +inline c10::DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) { + switch (dispatch_key) { + // stuff that's real +#define 
DO_CASE(suffix, prefix) \ + case DispatchKey::prefix##suffix: \ + return c10::DeviceType::suffix; +#define DO_CASES(_, prefix) C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, prefix) + C10_FORALL_FUNCTIONALITY_KEYS(DO_CASES) +#undef DO_CASES +#undef DO_CASE + + case DispatchKey::MkldnnCPU: + return c10::DeviceType::CPU; + case DispatchKey::Vulkan: + return c10::DeviceType::Vulkan; + + case DispatchKey::MAIA: + return c10::DeviceType::MAIA; + default: + TORCH_CHECK( + false, + "DispatchKey ", + dispatch_key, + " doesn't correspond to a device"); + } +} + +inline TensorOptions dispatchKeyToTensorOptions(DispatchKey dispatch_key) { + return TensorOptions() + .layout(dispatchKeyToLayout(dispatch_key)) + .device(dispatchKeyToDeviceType(dispatch_key)); +} + +namespace detail { +inline bool backend_supports_empty_operator(const TensorOptions& options) { + // Quantized backends don't support at::empty(). + // They have separate operators like at::empty_quantized() that take in + // extra information about how to quantize the tensor. + return !isQIntType(typeMetaToScalarType(options.dtype())); +} + +} // namespace detail + +} // namespace c10 + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/UndefinedTensorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/UndefinedTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..3a8381e887f90556b66f8b654bb5376e16afe074 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/UndefinedTensorImpl.h @@ -0,0 +1,54 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +struct C10_API UndefinedTensorImpl final : public TensorImpl { + public: + // Without this, we get: + // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in + // device code + // (ostensibly because the constexpr tricks MSVC into trying to compile this + // function for device as well). +#ifdef _WIN32 + static inline TensorImpl* singleton() { + return &getInstance(); + } +#else + static constexpr inline TensorImpl* singleton() { + return &_singleton; + } +#endif + +#ifdef DEBUG + bool has_storage() const override; +#endif + void set_storage_offset(int64_t offset) override; + + protected: + c10::SymBool sym_is_contiguous_custom(MemoryFormat format) const override; + IntArrayRef strides_custom() const override; + SymIntArrayRef sym_strides_custom() const override; + + private: + UndefinedTensorImpl(); +#ifdef _WIN32 + static UndefinedTensorImpl& getInstance(); +#else + static UndefinedTensorImpl _singleton; +#endif + const char* tensorimpl_type_name() const override; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/WrapDimMinimal.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/WrapDimMinimal.h new file mode 100644 index 0000000000000000000000000000000000000000..02570ae84ffdb64c1b2c8b20deb52178c606f57d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/WrapDimMinimal.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +namespace detail { +// This template can only be specialized at int64_t and c10::SymInt; +// you'll get linker errors otherwise +template +C10_API T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar); +} // namespace detail + +template +T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) { + // Inline the fast paths + if (C10_LIKELY(dim_post_expr * -1 <= dim && dim < dim_post_expr)) { + // For SymInts, we want an explicit control flow to trigger a guard, so we + // may as well branch too. + if (dim < 0) { + return dim + dim_post_expr; + } + return dim; + } + // Check edge-cases out-of-line (wrapping scalars and out-of-bounds errors) + return c10::detail::maybe_wrap_dim_slow( + std::move(dim), std::move(dim_post_expr), wrap_scalar); +} + +inline int64_t maybe_wrap_dim( + int64_t dim, + int64_t dim_post_expr, + bool wrap_scalar = true) { + return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar); +} + +inline c10::SymInt maybe_wrap_dim( + c10::SymInt dim, + c10::SymInt dim_post_expr, + bool wrap_scalar = true) { + return _maybe_wrap_dim(std::move(dim), std::move(dim_post_expr), wrap_scalar); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/alignment.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/alignment.h new file mode 100644 index 0000000000000000000000000000000000000000..4ef01f7bfa99c473ebb6612a83f0cdde53eeec6b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/alignment.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { + +#ifdef C10_MOBILE +// Use 16-byte alignment on mobile +// - ARM NEON AArch32 and AArch64 +// - x86[-64] < AVX +constexpr size_t gAlignment = 16; +#else +// Use 64-byte alignment should be enough for computation up to AVX512. +constexpr size_t gAlignment = 64; +#endif + +constexpr size_t gPagesize = 4096; +// since the default thp pagesize is 2MB, enable thp only +// for buffers of size 2MB or larger to avoid memory bloating +constexpr size_t gAlloc_threshold_thp = static_cast(2) * 1024 * 1024; + +// Cache line size used to avoid false sharing between threads. Falls back to 64 +// bytes if C++17 feature is unavailable. +#ifdef __cpp_lib_hardware_interference_size +using std::hardware_destructive_interference_size; +#else +constexpr std::size_t hardware_destructive_interference_size = 64; +#endif +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COW.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COW.h new file mode 100644 index 0000000000000000000000000000000000000000..1ef394e6e3536530af4a6427f16f0a383c39c5be --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COW.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { +struct StorageImpl; +class DataPtr; +} // namespace c10 + +namespace c10::impl::cow { + +// Creates a Copy-on-write (COW) clone of the given storage. This will also +// convert the given storage into a COW storage if it is not COW already. +// +// Converting the storage into a COW storage will not be successful if the +// storage's DataPtr has some context (`DataPtr::get_context()`) which is not +// equal to the data pointer (`DataPtr::get()`). In this case, a nullptr is +// returned. +C10_API c10::intrusive_ptr lazy_clone_storage( + StorageImpl& storage); + +// Check if a storage has a simple DataPtr with no abnormal context +C10_API bool has_simple_data_ptr(const c10::StorageImpl& storage); + +// Check if a DataPtr is COW +C10_API bool is_cow_data_ptr(const c10::DataPtr& data_ptr); + +// Eagerly copies a COW storage's data, turning it into a non-COW storage. +C10_API void materialize_cow_storage(StorageImpl& storage); + +} // namespace c10::impl::cow + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COWDeleter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COWDeleter.h new file mode 100644 index 0000000000000000000000000000000000000000..90a618003c995ce6fe949b8f0ea5110a8a47b74a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COWDeleter.h @@ -0,0 +1,71 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace c10::impl::cow { + +// A COWDeleterContext object is used as the `ctx` argument for DataPtr +// to implement a Copy-on-write (COW) DataPtr. +class C10_API COWDeleterContext { + public: + // Creates an instance, holding the pair of data and original + // deleter. + // + // Note that the deleter will only be called in our destructor if + // the last reference to this goes away without getting + // materialized. + explicit COWDeleterContext(std::unique_ptr data); + + // Increments the current refcount. + void increment_refcount(); + + // See README.md in this directory to understand the locking + // strategy. + + // Represents a reference to the context. + // + // This is returned by decrement_refcount to allow the caller to + // copy the data under the shared lock. + using NotLastReference = std::shared_lock; + + // Represents the last reference to the context. + // + // This will be returned by decrement_refcount when it is the last + // reference remaining and after any pending copies have completed. + using LastReference = std::unique_ptr; + + // Decrements the refcount, returning a handle indicating what to + // do with it. 
+ std::variant decrement_refcount(); + + private: + // The destructor is hidden, this should only ever be used within + // UniqueVoidPtr using cow::delete_context as the deleter. + ~COWDeleterContext(); + + std::shared_mutex mutex_; + std::unique_ptr data_; + std::atomic refcount_ = 1; +}; + +// `cow_deleter` is used as the `ctx_deleter` for DataPtr to implement a COW +// DataPtr. +// +// Warning: This should only be called on a pointer to a COWDeleterContext that +// was allocated on the heap with `new`, because when the refcount reaches 0, +// the context is deleted with `delete`. +C10_API void cow_deleter(void* ctx); + +} // namespace c10::impl::cow + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..f8b12a993a2a82c4b09b74e5c26ca48bcff3f4bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h @@ -0,0 +1,417 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +// Just for C10_ANONYMOUS_VARIABLE +#include +#include + +#include +#include + +namespace c10 { + +// Forward declaration +class DataPtr; + +/** + * Note [Flags defining the behavior of events] + * + * PYTORCH_DEFAULT and BACKEND_DEFAULT are valid for all backends. The + * BACKEND_DEFAULT is what a particular backend would select if no + * flags were given. PYTORCH_DEFAULT is the PyTorch's framework default + * choice for events on that backend, which may not be the same. 
+ * + * The mapping of PYTORCH_DEFAULT and BACKEND_DEFAULT is done by each + * backend implementation. + */ +enum class EventFlag { + // Disable timing + PYTORCH_DEFAULT, + // Enable timing + BACKEND_DEFAULT, + // FOR TESTING ONLY + INVALID +}; + +namespace impl { + +/** + * DeviceGuardImplInterface represents the virtual interface which provides + * functionality to provide an RAII class for device and stream switching, + * via DeviceGuard. Every distinct device type, e.g., CUDA and HIP, is + * expected to implement and register an implementation of this interface. + * All classes which inherit from DeviceGuardImplInterface should be declared + * 'final'. + * + * This class exists because we provide a unified interface for performing + * device guards via DeviceGuard, but we cannot assume that we have actually + * compiled against the, e.g., CUDA library, which actually implements + * this guard functionality. In this case, a dynamic dispatch is required + * to cross the library boundary. + * + * If possible, you should directly use implementations of this interface; + * those uses will be devirtualized. + */ +struct C10_API DeviceGuardImplInterface { + DeviceGuardImplInterface() = default; + DeviceGuardImplInterface(const DeviceGuardImplInterface&) = default; + DeviceGuardImplInterface& operator=(const DeviceGuardImplInterface&) = + default; + DeviceGuardImplInterface(DeviceGuardImplInterface&&) noexcept = default; + DeviceGuardImplInterface& operator=(DeviceGuardImplInterface&&) noexcept = + default; + + /** + * Return the type of device managed by this guard implementation. + */ + virtual DeviceType type() const = 0; + + /** + * Set the current device to Device, and return the previous Device. + */ + virtual Device exchangeDevice(Device) const = 0; + // NB: Implementations of exchangeDevice can be a bit boilerplatey. 
You might + // consider replacing exchangeDevice with a non-virtual function with a baked + // in implementation; however, note that this will triple the number of + // virtual calls (when you implement exchangeDevice in a final subclass, + // the compiler gets to devirtualize everything; it won't do that if you don't + // define it in the subclass!) A common way to solve this problem is to use + // some sort of CRTP; however, we can template DeviceGuardImplInterface since + // we really *do* need it to be virtual. A little boilerplate seems easiest + // to explain. (Another way around this problem is to provide inline + // functions that provide the default implementations, but this seems a little + // hard to explain. In any case, we're only going to have on order of ten + // implementations of this anyway.) + + /** + * Get the current device. + */ + virtual Device getDevice() const = 0; + + /** + * Set the current device to Device. + */ + virtual void setDevice(Device) const = 0; + + /** + * Set the current device to Device, without checking for errors + * (so, e.g., this can be called from a destructor). + */ + virtual void uncheckedSetDevice(Device) const noexcept = 0; + + /** + * Get the current stream for a given device. + */ + virtual Stream getStream(Device) const = 0; + + /** + * Get the default stream for a given device. + */ + virtual Stream getDefaultStream(Device /*unused*/) const { + TORCH_CHECK(false, "Backend doesn't support acquiring a default stream.") + } + + /** + * Get a stream from the global pool for a given device. + */ + virtual Stream getStreamFromGlobalPool( + Device /*unused*/, + bool isHighPriority = false) const { + (void)isHighPriority; // Suppress unused variable warning + TORCH_CHECK(false, "Backend doesn't support acquiring a stream from pool.") + } + + /** + * Return a new stream for a given device and priority. 
The stream will be + * copied and shared around, device backend should be able to correctly handle + * the lifetime of the stream. + */ + virtual Stream getNewStream(Device /*unused*/, int priority = 0) const { + (void)priority; + TORCH_CHECK(false, "Backend doesn't support create a new Stream.") + } + + /** + * Set a stream to be the thread local current stream for its device. + * Return the previous stream for that device. You are NOT required + * to set the current device to match the device of this stream. + */ + virtual Stream exchangeStream(Stream) const = 0; + + /** + * Destroys the given event. + */ + virtual void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) + const noexcept {} + + /** + * Increments the event's version and enqueues a job with this version + * in the stream's work queue. When the stream process that job + * it notifies all streams waiting on / blocked by that version of the + * event to continue and marks that version as recorded. + * */ + virtual void record( + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const c10::EventFlag /*flag*/) const { + TORCH_CHECK(false, "Backend doesn't support events."); + } + + /** + * Does nothing if the event has not been scheduled to be recorded. + * If the event was previously enqueued to be recorded, a command + * to wait for the version of the event that exists at the time of this call + * is inserted in the stream's work queue. + * When the stream reaches this command it will stop processing + * additional commands until that version of the event is marked as recorded. + */ + virtual void block(void* /*event*/, const Stream& /*stream*/) const { + TORCH_CHECK(false, "Backend doesn't support events."); + } + + /** + * Returns true if (and only if) + * (1) the event has never been scheduled to be recorded + * (2) the current version is marked as recorded. + * Returns false otherwise. 
+ */ + virtual bool queryEvent(void* /*event*/) const { + TORCH_CHECK(false, "Backend doesn't support events."); + } + + /** + * Get the number of devices. WARNING: This is REQUIRED to not raise + * an exception. If there is some sort of problem, e.g., driver error, + * you should report that there are zero available devices. + */ + virtual DeviceIndex deviceCount() const noexcept = 0; + + /** + * Get the following capabilities of the current device: + * (1) Data type support + * Returns DeviceCapability object. + */ + virtual DeviceCapability getDeviceCapability(Device /*unused*/) const { + TORCH_CHECK(false, "Backend doesn't support getting device capabilities."); + } + + /** + * Return true if all the work previously enqueued on the stream for + * asynchronous execution has completed running on the device. + */ + virtual bool queryStream(const Stream& /*stream*/) const { + TORCH_CHECK(false, "Backend doesn't support querying streams."); + } + + /** + * Wait (by blocking the calling thread) until all the work previously + * enqueued on the stream has completed running on the device. + */ + virtual void synchronizeStream(const Stream& /*stream*/) const { + TORCH_CHECK(false, "Backend doesn't support synchronizing streams."); + } + + /** + * Wait (by blocking the calling thread) until all the work previously + * recorded on the event has completed running on the device. + */ + virtual void synchronizeEvent(void* /*event*/) const { + TORCH_CHECK(false, "Backend doesn't support synchronizing events."); + } + + /** + * Wait (by blocking the calling thread) until all the work previously + * enqueued on the device has been completed. 
+ */ + virtual void synchronizeDevice(const DeviceIndex /*device_index*/) const { + TORCH_CHECK( + false, "Backend doesn't support synchronizing all streams on device."); + } + + /** + * Ensure the caching allocator (if any) is aware that the given DataPtr is + * being used on the given stream, and that it should thus avoid recycling the + * DataPtr until all work on that stream is done. + */ + virtual void recordDataPtrOnStream( + const c10::DataPtr& /*unused*/, + const Stream& /*unused*/) const {} + + /** + * Fetch the elapsed time between two recorded events. + */ + virtual double elapsedTime( + void* /*event1*/, + void* /*event2*/, + const DeviceIndex /*device_index*/) const { + TORCH_CHECK(false, "Backend doesn't support elapsedTime."); + } + + /** + * Intended use of this class is to leak the DeviceGuardImpl at program end. + * So you better not call the destructor, buster! + */ + virtual ~DeviceGuardImplInterface() = default; +}; + +// A no-op device guard impl that doesn't do anything interesting. Useful +// for devices that don't actually have a concept of device index. Prominent +// examples are CPU and Meta. 
+template +struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface { + NoOpDeviceGuardImpl() = default; + DeviceType type() const override { + return D; + } + Device exchangeDevice(Device /*unused*/) const override { + return Device(D, -1); // no-op + } + Device getDevice() const override { + return Device(D, -1); + } + void setDevice(Device /*unused*/) const override { + // no-op + } + void uncheckedSetDevice(Device /*unused*/) const noexcept override { + // no-op + } + Stream getStream(Device /*unused*/) const noexcept override { + // no-op + return Stream(Stream::DEFAULT, Device(D, -1)); + } + + Stream getNewStream(Device /*unused*/, int priority = 0) const override { + // no-op + (void)priority; + return Stream(Stream::DEFAULT, Device(D, -1)); + } + + // NB: These do NOT set the current device + Stream exchangeStream(Stream /*unused*/) const noexcept override { + // no-op + return Stream(Stream::DEFAULT, Device(D, -1)); + } + DeviceIndex deviceCount() const noexcept override { + return 1; + } + + DeviceCapability getDeviceCapability(Device /*unused*/) const override { + DeviceCapability cap; + if constexpr (D == DeviceType::Meta) { + cap.capability_data.capability_bits = 0; + // Meta only supports basic types for shape inference + // Byte, Char, Short, Int, Long, Float, Double, + // Bool, ComplexFloat, ComplexDouble + cap.capability_data.capability_bits = (1ULL << kIndex_Byte) | + (1ULL << kIndex_Char) | (1ULL << kIndex_Short) | + (1ULL << kIndex_Int) | (1ULL << kIndex_Long) | + (1ULL << kIndex_Float) | (1ULL << kIndex_Double) | + (1ULL << kIndex_ComplexFloat) | (1ULL << kIndex_ComplexDouble) | + (1ULL << kIndex_Bool); + } + return cap; + } + + // Event-related functions + void record( + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const EventFlag /*flag*/) const override { + TORCH_CHECK(false, D, " backend doesn't support events."); + } + void block(void* /*event*/, const Stream& /*stream*/) const override { + 
TORCH_CHECK(false, D, " backend doesn't support events.") + } + bool queryEvent(void* /*event*/) const override { + TORCH_CHECK(false, D, " backend doesn't support events.") + } + void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) + const noexcept override {} + + // Stream-related functions + bool queryStream(const Stream& /*stream*/) const override { + return true; + } + void synchronizeStream(const Stream& /*stream*/) const override { + // Don't wait for anything. + } +}; + +// The registry is NON-owning. Each stored pointer is std::atomic so +// that under all interleavings of registry calls the structure is +// race-free. This doesn't cost us anything on reads in X86. (An +// unsynchronized implementation probably is OK too, but I didn't want +// to prove that we never read from device_guard_impl_registry at the +// same time some registration is occurring. Shiver.) +// +// I'd like this registry to be valid even at program destruction time +// (in case someone uses a DeviceGuard in a destructor to do some cleanup +// in the CUDA API.) Since there are no direct accesses of the underlying +// owning objects which I can use to enforce initialization order (unlike +// in a Meyer singleton), it implies that you must *leak* objects when +// putting them in the registry. This is done by deleting the destructor +// on DeviceGuardImplInterface. +extern C10_API std::array< + std::atomic, + static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)> + device_guard_impl_registry; + +// I can't conveniently use c10/util/Registry.h for the following reason: +// c10/util/Registry.h gives me a slow way of Create'ing a object of some +// interface from the registry, but no way of quickly accessing an already +// created object. I'll be banging on getDeviceGuardImpl every time we do a +// DeviceGuard, so I really don't want to be doing an unordered_map lookup. 
+// Better if the registration mechanism directly drops its implementation +// into device_guard_impl_registry. + +class C10_API DeviceGuardImplRegistrar { + public: + DeviceGuardImplRegistrar( + DeviceType /*type*/, + const DeviceGuardImplInterface* /*impl*/); +}; + +#define C10_REGISTER_GUARD_IMPL(DevType, DeviceGuardImpl) \ + static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE( \ + g_##DeviceType)(::c10::DeviceType::DevType, new DeviceGuardImpl()); + +inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) { + // Two adjacent int16_t fields DeviceType and DeviceIndex has field access + // miscompiled on NVCC. To workaround this issue, we apply a mask to the + // DeviceType. First check if the DeviceType is 16-bit. + // FB employees can see + // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/ + // for more details + static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit"); + auto p = device_guard_impl_registry[static_cast(type) & 0xFF].load(); + + // This seems to be the first place where you make use of a device + // when you pass devices to factory functions. Give a nicer error + // message in this case. + TORCH_CHECK(p, "PyTorch is not linked with support for ", type, " devices"); + return p; +} + +void C10_API +registerDeviceGuard(DeviceType type, const DeviceGuardImplInterface* impl); + +inline bool hasDeviceGuardImpl(DeviceType type) { + return device_guard_impl_registry[static_cast(type)].load(); +} + +void C10_API ensureCUDADeviceGuardSet(); + +} // namespace impl +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..902a4d3febafc5d9ea5c5695c428d25be7c171c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h @@ -0,0 +1,107 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include + +namespace c10::impl { + +// FakeGuardImpl is hardcoded to have eight devices. Not for +// any good reason, just to simplify code. +constexpr DeviceIndex kFakeGuardImplMaxDevices = 8; + +/** + * A fake implementation of DeviceGuardImplInterface suitable for testing. + * The current device is modeled as a mutable field in the guard implementation + * class. See DeviceGuard_test.cpp for an example use. 
+ */ +template +struct FakeGuardImpl final : public DeviceGuardImplInterface { + static constexpr DeviceType static_type = T; + // Runtime device type is not used + FakeGuardImpl(DeviceType /*unused*/) {} + FakeGuardImpl() = default; + DeviceType type() const override { + return T; + } + Device exchangeDevice(Device d) const override { + AT_ASSERT(d.type() == type()); + AT_ASSERT(d.index() < kFakeGuardImplMaxDevices); + Device old_device = getDevice(); + if (old_device.index() != d.index()) { + current_device_ = d.index(); + } + return old_device; + } + Device getDevice() const override { + return Device(type(), current_device_); + } + void setDevice(Device d) const override { + AT_ASSERT(d.type() == type()); + AT_ASSERT(d.index() >= 0); + AT_ASSERT(d.index() < kFakeGuardImplMaxDevices); + current_device_ = d.index(); + } + void uncheckedSetDevice(Device d) const noexcept override { + current_device_ = d.index(); + } + Stream getStream(Device d) const noexcept override { + return Stream(Stream::UNSAFE, d, current_streams_[d.index()]); + } + Stream exchangeStream(Stream s) const noexcept override { + auto old_id = current_streams_[s.device_index()]; + current_streams_[s.device_index()] = s.id(); + return Stream(Stream::UNSAFE, s.device(), old_id); + } + DeviceIndex deviceCount() const noexcept override { + return kFakeGuardImplMaxDevices; + } + + // Event-related functions + void record( + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const EventFlag /*flag*/) const override {} + void block(void* /*event*/, const Stream& /*stream*/) const override {} + bool queryEvent(void* /*event*/) const override { + return true; + } + void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) + const noexcept override {} + + // Convenience methods for testing + static DeviceIndex getDeviceIndex() { + return current_device_; + } + static void setDeviceIndex(DeviceIndex i) { + AT_ASSERT(i >= 0); + AT_ASSERT(i < 
kFakeGuardImplMaxDevices); + current_device_ = i; + } + static StreamId getCurrentStreamIdFor(DeviceIndex i) { + return current_streams_.at(i); + } + static void resetStreams() { + current_streams_.fill(0); + } + + private: + thread_local static DeviceIndex current_device_; + thread_local static std::array + current_streams_; +}; + +template +thread_local DeviceIndex FakeGuardImpl::current_device_ = 0; + +template +thread_local std::array + FakeGuardImpl::current_streams_ = {0, 0, 0, 0, 0, 0, 0, 0}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/GPUTrace.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/GPUTrace.h new file mode 100644 index 0000000000000000000000000000000000000000..57761cff9bc254158816d43451ed5bc01f60411f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/GPUTrace.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::impl { + +struct C10_API GPUTrace { + // On the x86 architecture the atomic operations are lock-less. + static std::atomic gpuTraceState; + + // When PyTorch migrates to C++20, this should be changed to an atomic flag. + // Currently, the access to this variable is not synchronized, on the basis + // that it will only be flipped once and by the first interpreter that + // accesses it. + static bool haveState; + + // This function will only register the first interpreter that tries to invoke + // it. For all of the next ones it will be a no-op. 
+ static void set_trace(const PyInterpreter* /*trace*/); + + static const PyInterpreter* get_trace() { + if (!haveState) + return nullptr; + return gpuTraceState.load(std::memory_order_acquire); + } +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..032b90a20bd297b742711ada1d9d5ed1501a5e7e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::impl { + +// This TLS controls whether or not we permanently associate PyObject +// with Tensor the first time it is allocated. When hermetic PyObject +// TLS is enabled (state is true), we DO NOT save PyObjects to Tensor, +// meaning you get a distinct PyObject whenever you execute the code in +// question. +struct C10_API HermeticPyObjectTLS { + static void set_state(bool state); + static bool get_state() { + // Hypothetical fastpath if torchdeploy/multipy // codespell:ignore multipy + // isn't used. Per + // https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf + // this qualifies relaxed access because it is a single-location data + // structure (only the boolean here). + // + // Forgetting about data races for a moment, is there a logical race? + // + // - Boolean only ever transitions from false to true. 
So the + // critical situation is when one interpreter is already running + // when a second interpreter switches haveState from false to true. + // + // - The first interpreter is indifferent whether or not it sees + // hasState true/false; obviously false works (this is what the + // interpreter was previously using; more directly, the interpreter + // calls into itself as the handler, so being hermetic is not + // required), and true simply means serviced python operator calls will + // be hermetic; in these cases it is expected to be functionally + // equivalent. + // + // - The second interpreter MUST see hasState true (as its requests will + // be forwarded to the first interpreter), but it is assumed that there + // is a synchronization between the interpreter initialization, and + // when we actually perform operations, so it is guaranteed to see + // hasState true. + // + // QED. + // + // This fastpath is currently disabled so that we can more easily test that + // hermetic mode works correctly even on stock build of PyTorch. + if (false && !haveState_.load(std::memory_order_relaxed)) + return false; + return get_tls_state(); + } + // Call this from the multipy/torchdeploy // codespell:ignore multipy + // top level + static void init_state(); + + private: + // This only flipped once from false to true during + // torchdeploy/multipy initialization, // codespell:ignore multipy + // and never again. + static std::atomic haveState_; + static bool get_tls_state(); +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..34d6dff97654888cd12d52ce1f44441f30247e44 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h @@ -0,0 +1,438 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This file provides implementations of InlineDeviceGuard and +// InlineOptionalDeviceGuard. + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10::impl { + +/** + * A DeviceGuard is an RAII class that sets a device to some value + * on construction, and resets the device to its original value on + * destruction. + * + * InlineDeviceGuard is a helper class for implementing DeviceGuards. + * It is templated over a DeviceGuardImpl (anything that implements + * DeviceGuardImplInterface). There are two primary ways to instantiate + * InlineDeviceGuard: + * + * - With a concrete implementation of DeviceGuardImpl, e.g., CUDAGuardImpl. + * This is the best way to use InlineDeviceGuard, as all calls are + * devirtualized, giving you code as efficient as straight line + * calls to cudaGetDevice/cudaSetDevice. + * + * - With VirtualGuardImpl, which does a virtual dispatch to a DeviceGuardImpl + * retrieved from a DeviceType registry. We have explicitly instantiated + * InlineDeviceGuard this way as c10::DeviceGuard. 
+ * + * If you are in a hurry, you can use InlineDeviceGuard directly: + * + * using CUDAGuard = impl::InlineDeviceGuard; + * + * However, you can provide a better user experience if you explicitly write a + * wrapper class that itself contains the template instantiation: + * + * class CUDAGuard { + * public: + * // ... the API ... + * private: + * impl::InlineDeviceGuard guard_; + * } + * + * The wrapper class provides a good place to write documentation, and helps + * avoid weird template instantiation errors when a user incorrectly uses the + * class. + * + * If you need to test this class, consider instantiating it with FakeGuardImpl. + */ +template +class InlineDeviceGuard { + public: + // Note [Omitted default constructor from RAII] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // In principle, we could add a default constructor to + // DeviceGuard which reads the current device and promises to + // restore to that device on exit. However, most cases where you + // would have written this, you probably meant to actually just + // use DeviceGuard (since you don't actually need the + // restore to happen if you don't ever actually set the device). + // We remove the constructor here to encourage you to think about + // what you actually want to happen. + explicit InlineDeviceGuard() = delete; + + /// Set the current device to the passed Device. + explicit InlineDeviceGuard(Device device) + : impl_(device.type()), + original_device_( + device.index() == -1 ? impl_.getDevice() + : impl_.exchangeDevice(device)), + current_device_(device.index() == -1 ? original_device_ : device) {} + + /// Set the current device index to the passed DeviceIndex. (The + /// device type is inferred from the template parameter T). 
+ template < + typename U = T, + typename = + typename std::enable_if_t>> + explicit InlineDeviceGuard(DeviceIndex device_index) + : InlineDeviceGuard(Device(U::static_type, device_index)) {} + + /// Construct an InlineDeviceGuard using VirtualGuardImpl with an explicit + /// DeviceGuardImplInterface pointer. + template < + typename U = T, + typename = typename std::enable_if_t>> + explicit InlineDeviceGuard( + Device device, + const DeviceGuardImplInterface* impl) + : impl_( + VirtualGuardImpl(impl ? impl : getDeviceGuardImpl(device.type()))), + original_device_( + device.index() == -1 ? impl_.getDevice() + : impl_.exchangeDevice(device)), + current_device_(device.index() == -1 ? original_device_ : device) {} + + /// Copy is disallowed + InlineDeviceGuard(const InlineDeviceGuard&) = delete; + InlineDeviceGuard& operator=(const InlineDeviceGuard&) = delete; + + /// Move is disallowed, as DeviceGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + InlineDeviceGuard(InlineDeviceGuard&& other) = delete; + InlineDeviceGuard& operator=(InlineDeviceGuard&& other) = delete; + + ~InlineDeviceGuard() { + impl_.uncheckedSetDevice(original_device_); + } + + /// Sets the device to the given one. + template < + typename U = T, + typename std::enable_if_t, int> = 0> + void set_device(at::Device device) { + AT_ASSERT( + (U::static_type == DeviceType::HIP && device.is_cuda()) || + device.type() == U::static_type); + auto index = device.index(); + if (index == -1) + return; + impl_.setDevice(device); + current_device_ = device; + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device. This is effectively equivalent to + /// set_device when a guard supports only a single device type. 
+ template + typename std::enable_if_t> reset_device( + at::Device device) { + set_device(device); + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device (for a possibly different device + /// type). + /// + /// This method is named reset_device to highlight the fact that previous + /// device settings from this guard are NOT preserved, even if the device + /// has a different device type. For example: + /// + /// // CUDA device is 0 + /// DeviceGuard g(Device(kCUDA, 1)); + /// g.reset_device(Device(kHIP, 2)); + /// // CUDA device is 0 (!!) + /// + /// NOTE: this implementation may skip some device setting if it can prove + /// that it is unnecessary. + /// + /// Optional argument is for testing only. + template + typename std::enable_if_t> reset_device( + at::Device device, + const impl::DeviceGuardImplInterface* impl = nullptr) { + auto index = device.index(); + if (index == -1) + return; + if (device.type() == original_device_.type()) { + AT_ASSERT(impl == nullptr || impl->type() == device.type()); + impl_.setDevice(device); + current_device_ = device; + } else { + // Destruct and reconstruct the DeviceGuard in place + impl_.setDevice(original_device_); + impl_ = !impl ? VirtualGuardImpl(device.type()) : VirtualGuardImpl(impl); + original_device_ = impl_.exchangeDevice(device); + current_device_ = device; + } + } + + /// Sets the device index to the given one. The device type is inferred + /// from the original device type. + void set_index(DeviceIndex index) { + reset_device(Device(original_device_.type(), index)); + } + + /// Returns the device that was set at the time the most recent + /// reset_device(), or otherwise the device at construction time. + Device original_device() const { + return original_device_; + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. 
+ Device current_device() const { + return current_device_; + } + + protected: + T impl_; + + private: + Device original_device_; + Device current_device_; +}; + +/** + * A OptionalDeviceGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * + * InlineOptionalDeviceGuard is a helper class for implementing + * OptionalDeviceGuards. See guidance in InlineDeviceGuard on how to + * use this. See OptionalDeviceGuard for user-oriented usage notes. + */ +template +class InlineOptionalDeviceGuard { + public: + // Note [Explicit initialization of optional fields] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Explicit initialization of optional fields + // required to workaround an nvcc bug; see + // https://github.com/pytorch/pytorch/issues/12117 + + /// Creates an uninitialized OptionalDeviceGuard. + explicit InlineOptionalDeviceGuard() + : guard_() // See Note [Explicit initialization of optional fields] + {} + ~InlineOptionalDeviceGuard() = default; + + /// Set the current device to the passed Device, if it is not nullopt. + explicit InlineOptionalDeviceGuard(std::optional device_opt) + : guard_() { // See Note [Explicit initialization of optional fields] + if (device_opt.has_value()) { + guard_.emplace(device_opt.value()); + } + } + + /// Set the current device to the passed DeviceIndex, if it is not nullopt. + template < + typename U = T, + typename = + typename std::enable_if_t>> + explicit InlineOptionalDeviceGuard( + std::optional device_index_opt) + : guard_() { // See Note [Explicit initialization of optional fields] + if (device_index_opt.has_value()) { + guard_.emplace(device_index_opt.value()); + } + } + + /// All constructors of DeviceGuard are valid for OptionalDeviceGuard + /// and result in initialized OptionalDeviceGuard. + template + explicit InlineOptionalDeviceGuard(Args&&... args) + : guard_(std::in_place, std::forward(args)...) 
{} + + // TODO: Consider reading Tensor and TensorList constructors here, when + // Tensor moves to c10. (These are only valid on OptionalDeviceGuard, + // because a Tensor may be undefined, in which case we need an uninitialized + // tensor guard.) + + // Note [Move construction for RAII guards is tricky] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // In principle, move construction is useful for terminating + // the lifetime of a `OptionalDeviceGuard` early; for example: + // + // // current device is d0 + // OptionalDeviceGuard g1(d1); + // // current device is d1 + // { + // OptionalDeviceGuard g2(std::move(g1)); + // } + // // current device is d0!! + // + // However, it's difficult to implement the move constructor + // in a way that works in all situations. For example, consider + // the following example: + // + // OptionalDeviceGuard g1(d1); + // { + // OptionalDeviceGuard g2(d2); + // { + // OptionalDeviceGuard g3(std::move(g1)); // !!! + // } + // } + // + // What should the current device be while g3 in scope... and what + // should it be after it goes out of scope? What about g2? + // There don't seem to be satisfactory answers for these questions. + // + // It's in principle possible to raise an error when this occurs + // by doing some extra thread-local bookkeeping. But why bother? + // Just don't provide the constructor. + InlineOptionalDeviceGuard(const InlineOptionalDeviceGuard& other) = delete; + InlineOptionalDeviceGuard(InlineOptionalDeviceGuard&& other) = delete; + + // Note [Move assignment for RAII guards is tricky] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Move assignment is deleted, because you need to know which guard was + // defined "first", as that guard's original_device_ wins--with the current + // representation, we have no way of telling which is the case. (Move + // construction does not have this problem, as one guard is always + // uninitialized.) 
+ // + // We can make this clear by way of a pair of examples: + // + // Example 1: + // + // // initial device is n0 + // { + // CUDAGuard g1(n1); + // { + // CUDAGuard g2(n2); + // // current device should be n2 + // g1 = std::move(g2); + // // current device should still be n2 + // } + // // current device should still be n2 + // } + // // current device should be n0 + // + // Example 2 (flip the order of the two guards): + // + // // initial device is n0 + // { + // CUDAGuard g2(n2); + // { + // CUDAGuard g1(n1); + // // current device should be n1 + // g1 = std::move(g2); + // // current device should be n2 + // } + // // current device should be n0 (since g2 has been vacated) + // } + // + // In both examples, we need g1 to restore to n0 after move assignment. + // However, in example 1, this is determined by the restore value of g1 + // (prior to the move). In example 2, however, it is determined by the the + // restore value of g2(!!). We don't know which one should win, without having + // a way of telling which guard was allocated first. + // + // We could solve this with an extra thread-local variable. But no one is + // actually using move-assignment. So just get rid of it. + InlineOptionalDeviceGuard& operator=(const InlineOptionalDeviceGuard& other) = + delete; + InlineOptionalDeviceGuard& operator=(InlineOptionalDeviceGuard&& other) = + delete; + + /// Sets the device to the given one. Initializes OptionalDeviceGuard if it + /// is not already initialized. + template < + typename U = T, + typename = + typename std::enable_if_t>> + void set_device(at::Device device) { + if (!guard_.has_value()) { + guard_.emplace(device); + } else { + guard_->set_device(device); + } + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device (for a possibly different device + /// type). Initializes OptionalDeviceGuard if it is not already initialized. 
+ /// + /// See notes on why this is called reset_device on InlineDeviceGuard. + /// + /// Optional argument is for testing only. + template < + typename U = T, + typename = typename std::enable_if_t>> + void reset_device( + at::Device device, + const DeviceGuardImplInterface* impl = nullptr) { + if (!guard_.has_value()) { + guard_.emplace(device, impl); + } else { + guard_->reset_device(device, impl); + } + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device. Initializes the guard if it is + /// not already initialized. This is effectively equivalent to set_device + /// when a guard supports only a single device type. + template < + typename U = T, + typename = + typename std::enable_if_t>> + void reset_device(at::Device device) { + if (!guard_.has_value()) { + guard_.emplace(device); + } else { + guard_->reset_device(device); + } + } + + /// Sets the device index to the given one. The device type is statically + /// known. + template < + typename U = T, + typename = + typename std::enable_if_t>> + void set_index(DeviceIndex index) { + if (!guard_.has_value()) { + guard_.emplace(index); + } else { + guard_->set_index(index); + } + } + + /// Returns the device that was set immediately prior to initialization of + /// the, guard, or nullopt if the guard is uninitialized. + std::optional original_device() const { + return guard_.has_value() ? std::make_optional(guard_->original_device()) + : std::nullopt; + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + std::optional current_device() const { + return guard_.has_value() ? std::make_optional(guard_->current_device()) + : std::nullopt; + } + + /// Restore the original device, resetting this guard to uninitialized state. 
+ void reset() { + guard_.reset(); + } + + private: + std::optional> guard_; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineEvent.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..15d4083daab7439295a132ca3b157eae1ba6745d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineEvent.h @@ -0,0 +1,152 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace c10::impl { + +template +struct InlineEvent final { + InlineEvent() = delete; + InlineEvent( + const DeviceType _device_type, + const EventFlag _flag = EventFlag::PYTORCH_DEFAULT) + : backend_{_device_type}, device_type_{_device_type}, flag_{_flag} {} + + // Copy constructor and copy assignment operator (deleted) + InlineEvent(const InlineEvent&) = delete; + InlineEvent& operator=(const InlineEvent&) = delete; + + // Move constructor and move assignment operator + InlineEvent(InlineEvent&& other) noexcept + : event_(other.event_), + backend_(std::move(other.backend_)), + device_type_(other.device_type_), + device_index_(other.device_index_), + flag_(other.flag_), + was_marked_for_recording_(other.was_marked_for_recording_) { + other.event_ = nullptr; + } + InlineEvent& operator=(InlineEvent&& other) noexcept { + swap(other); + return *this; + } + + void swap(InlineEvent& other) noexcept { + std::swap(event_, other.event_); + std::swap(backend_, other.backend_); + std::swap(device_type_, other.device_type_); + std::swap(device_index_, other.device_index_); + std::swap(flag_, 
other.flag_); + std::swap(was_marked_for_recording_, other.was_marked_for_recording_); + } + + ~InlineEvent() noexcept { + if (event_) + backend_.destroyEvent(event_, device_index_); + } + + DeviceType device_type() const noexcept { + return device_type_; + } + DeviceIndex device_index() const noexcept { + return device_index_; + } + EventFlag flag() const noexcept { + return flag_; + } + bool was_marked_for_recording() const noexcept { + return was_marked_for_recording_; + } + + void recordOnce(const Stream& stream) { + if (!was_marked_for_recording_) + record(stream); + } + + void record(const Stream& stream) { + TORCH_CHECK( + stream.device_type() == device_type_, + "Event device type ", + DeviceTypeName(device_type_), + " does not match recording stream's device type ", + DeviceTypeName(stream.device_type()), + "."); + + backend_.record(&event_, stream, device_index_, flag_); + was_marked_for_recording_ = true; + device_index_ = stream.device_index(); + } + + void block(const Stream& stream) const { + if (!was_marked_for_recording_) + return; + + TORCH_CHECK( + stream.device_type() == device_type_, + "Event device type ", + DeviceTypeName(device_type_), + " does not match blocking stream's device type ", + DeviceTypeName(stream.device_type()), + "."); + + backend_.block(event_, stream); + } + + bool query() const { + if (!was_marked_for_recording_) + return true; + return backend_.queryEvent(event_); + } + + void* eventId() const { + return event_; + } + + double elapsedTime(const InlineEvent& other) const { + TORCH_CHECK( + other.device_type() == device_type_, + "Event device type ", + DeviceTypeName(device_type_), + " does not match other's device type ", + DeviceTypeName(other.device_type()), + "."); + TORCH_CHECK_VALUE( + (flag_ == EventFlag::BACKEND_DEFAULT) && + (other.flag_ == EventFlag::BACKEND_DEFAULT), + "Both events must be created with argument 'enable_timing=True'."); + TORCH_CHECK_VALUE( + was_marked_for_recording() && 
other.was_marked_for_recording(), + "Both events must be recorded before calculating elapsed time."); + // elapsedTime in MPS can wait event to be completed if event is not ready, + // which is a little different from CUDA + TORCH_CHECK( + (query() && other.query()) || device_type_ == DeviceType::MPS, + "Both events must be completed before calculating elapsed time."); + + return backend_.elapsedTime(event_, other.event_, device_index_); + } + + void synchronize() const { + if (!was_marked_for_recording_) + return; + backend_.synchronizeEvent(event_); + } + + private: + void* event_ = nullptr; + T backend_; + DeviceType device_type_; + DeviceIndex device_index_ = -1; + EventFlag flag_ = EventFlag::PYTORCH_DEFAULT; + bool was_marked_for_recording_ = false; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..7ce87a9a8eb55a30e8e6fb0ab6e5a38bc065dab9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h @@ -0,0 +1,265 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10::impl { + +/** + * A StreamGuard is an RAII class that changes the current device + * to the device corresponding to some stream, and changes the + * default stream on that device to be this stream. + * + * InlineStreamGuard is a helper class for implementing StreamGuards. + * See InlineDeviceGuard for guidance on how to use this class. 
+ */ +template +class InlineStreamGuard : private InlineDeviceGuard { + public: + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit InlineStreamGuard() = delete; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + explicit InlineStreamGuard(Stream stream) + : InlineDeviceGuard(stream.device()), + original_stream_of_original_device_( + this->impl_.getStream(original_device())), + original_stream_of_current_device_(this->impl_.exchangeStream(stream)), + current_stream_(stream) {} + + /// This constructor exists purely for testing + template < + typename U = T, + typename = typename std::enable_if_t>> + explicit InlineStreamGuard( + Stream stream, + const DeviceGuardImplInterface* impl) + : InlineDeviceGuard( + stream.device(), + impl ? impl : getDeviceGuardImpl(stream.device_type())), + original_stream_of_original_device_( + this->impl_.getStream(original_device())), + original_stream_of_current_device_(this->impl_.exchangeStream(stream)), + current_stream_(stream) {} + + /// Copy is disallowed + InlineStreamGuard(const InlineStreamGuard&) = delete; + InlineStreamGuard& operator=(const InlineStreamGuard&) = delete; + + /// Move is disallowed, as StreamGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + InlineStreamGuard(InlineStreamGuard&& other) = delete; + InlineStreamGuard& operator=(InlineStreamGuard&& other) = delete; + + ~InlineStreamGuard() { + this->impl_.exchangeStream(original_stream_of_current_device_); + } + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. 
+ /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// use MultiStreamGuard instead. + void reset_stream(Stream stream) { + // TODO: make a version that takes an impl argument. Unfortunately, + // that will require SFINAE because impl is only valid for the + // VirtualGuardImpl specialization. + if (stream.device() == this->current_device()) { + this->impl_.exchangeStream(stream); + current_stream_ = stream; + } else { + // Destruct and reconstruct the StreamGuard in-place + this->impl_.exchangeStream(original_stream_of_current_device_); + this->reset_device(stream.device()); + original_stream_of_current_device_ = this->impl_.exchangeStream(stream); + current_stream_ = stream; + } + } + + // It's not clear if set_device should also reset the current stream + // if the device is unchanged; therefore, we don't provide it. + // The situation is somewhat clearer with reset_device, but it's still + // a pretty weird thing to do, so haven't added this either. + + /// Returns the stream of the original device prior to this guard. Subtly, + /// the stream returned here is the original stream of the *original* + /// device; i.e., it's the stream that your computation *would* have + /// been put on, if it hadn't been for this meddling stream guard. + /// This is usually what you want. + Stream original_stream() const { + return original_stream_of_original_device_; + } + + /// Returns the most recent stream that was set using this device guard, + /// either from construction, or via set_stream. + Stream current_stream() const { + return current_stream_; + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. 
+ Device current_device() const { + return InlineDeviceGuard::current_device(); + } + + /// Returns the device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. + Device original_device() const { + return InlineDeviceGuard::original_device(); + } + + private: + Stream + original_stream_of_original_device_; // what the user probably cares about + Stream original_stream_of_current_device_; // what we need to restore + Stream current_stream_; +}; + +/** + * An OptionalStreamGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * See InlineOptionalDeviceGuard for more guidance on how to use this class. + */ +template +class InlineOptionalStreamGuard { + public: + /// Creates an uninitialized stream guard. + explicit InlineOptionalStreamGuard() + : guard_() // See Note [Explicit initialization of optional fields] + {} + ~InlineOptionalStreamGuard() = default; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. + explicit InlineOptionalStreamGuard(std::optional stream_opt) + : guard_() { + if (stream_opt.has_value()) { + guard_.emplace(stream_opt.value()); + } + } + + /// All constructors of StreamGuard are valid for OptionalStreamGuard + template + explicit InlineOptionalStreamGuard(Args&&... args) + : guard_(std::in_place, std::forward(args)...) 
{} + + InlineOptionalStreamGuard(const InlineOptionalStreamGuard& other) = delete; + InlineOptionalStreamGuard& operator=(const InlineOptionalStreamGuard& other) = + delete; + // See Note [Move construction for RAII guards is tricky] + InlineOptionalStreamGuard(InlineOptionalStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + InlineOptionalStreamGuard& operator=(InlineOptionalStreamGuard&& other) = + delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Initializes the OptionalStreamGuard if it was not previously initialized. + void reset_stream(Stream stream) { + if (guard_.has_value()) { + guard_->reset_stream(stream); + } else { + guard_.emplace(stream); + } + } + + /// Returns the stream that was set at the time the guard was most recently + /// initialized, or nullopt if the guard is uninitialized. + std::optional original_stream() const { + return guard_.has_value() ? std::make_optional(guard_->original_stream()) + : std::nullopt; + } + + /// Returns the most recent stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + std::optional current_stream() const { + return guard_.has_value() ? std::make_optional(guard_->current_stream()) + : std::nullopt; + } + + /// Restore the original device and stream, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + std::optional> guard_; +}; + +template +class InlineMultiStreamGuard { + public: + /// Calls `set_stream` on each of the streams in the list. + /// This may be useful if you need to set different streams + /// for different devices. 
+ explicit InlineMultiStreamGuard(ArrayRef streams) { + if (!streams.empty()) { + impl_.emplace(getDeviceTypeOfStreams(streams)); + original_streams_.reserve(streams.size()); + for (const Stream& s : streams) { + original_streams_.emplace_back(this->impl_->exchangeStream(s)); + } + } + } + + /// Copy is disallowed + InlineMultiStreamGuard(const InlineMultiStreamGuard&) = delete; + InlineMultiStreamGuard& operator=(const InlineMultiStreamGuard&) = delete; + + /// Move is disallowed, as StreamGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + InlineMultiStreamGuard(InlineMultiStreamGuard&& other) = delete; + InlineMultiStreamGuard& operator=(InlineMultiStreamGuard&& other) = delete; + + ~InlineMultiStreamGuard() noexcept { + if (this->impl_.has_value()) { + for (const Stream& s : original_streams_) { + this->impl_->exchangeStream(s); + } + } + } + + protected: + std::optional impl_; + + private: + /// The original streams that were active on all devices. + std::vector original_streams_; + + static DeviceType getDeviceTypeOfStreams(ArrayRef streams) { + TORCH_INTERNAL_ASSERT(!streams.empty()); + DeviceType type = streams[0].device_type(); + for (const auto idx : c10::irange(1, streams.size())) { + TORCH_CHECK_VALUE( + streams[idx].device_type() == type, + "Streams have a mix of device types: stream 0 is on ", + streams[0].device(), + " while stream ", + idx, + " is on device ", + streams[idx].device()); + } + return type; + } +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h new file mode 100644 index 0000000000000000000000000000000000000000..123a288a0834468abc2e8bc7dc90b6e775506621 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h @@ -0,0 +1,174 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +// TLS management for DispatchKeySet (the "local" DispatchKeySet(s)) +// +// This manages two thread-local DispatchKeySets: +// +// - The included type set, which adds a tensor type for consideration +// in dispatch. (For example, you might add Profiling to +// the included type set to turn on profiling on all tensor operations.) +// +// - The excluded type set, which disqualifies a tensor type from dispatch. +// (For example, after redispatching on variable, we disqualify +// Autograd so we don't attempt to handle variable again.) +// (Exclusion wins over inclusion.) +// +// NB: Originally, I implemented the excluded type set as storing the inverted +// set, but TLS is defined to be zero-initialized, so this doesn't actually work +// (if it's inverted, you want the set to be -1 initialized). + +namespace c10::impl { + +// POD version of LocalDispatchKeySet. Declared here just so that +// we can put it in the guards. +// This struct encapsulates special handling for TLS initialization +// in set_included()/included() API so that they reflect the truth. +// If you want to create PODLocalDispatchKeySet with non-zero state, +// use set_included() instead of default constructor. 
+struct C10_API PODLocalDispatchKeySet { + uint64_t included_; + uint64_t excluded_; + + // See Note [TLS Initialization] + DispatchKeySet included() const { + return DispatchKeySet(DispatchKeySet::RAW, included_) ^ + c10::default_included_set; + } + DispatchKeySet excluded() const { + return DispatchKeySet(DispatchKeySet::RAW, excluded_) ^ + c10::default_excluded_set; + } + + void set_included(DispatchKeySet x) { + included_ = (x ^ c10::default_included_set).raw_repr(); + } + void set_excluded(DispatchKeySet x) { + excluded_ = (x ^ c10::default_excluded_set).raw_repr(); + } +}; +static_assert( + std::is_trivial_v, + "PODLocalDispatchKeySet must be a POD type."); + +struct C10_API LocalDispatchKeySet { + /* implicit */ LocalDispatchKeySet(PODLocalDispatchKeySet x) + : included_(x.included()), excluded_(x.excluded()) {} + DispatchKeySet included_; + DispatchKeySet excluded_; +}; + +// thread_local variables cannot be C10_API on Windows. +// Inlining this seems to break AutoDispatchBelowAutograd on Android. +#if defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE) +C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +#else // defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE) +extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. + return raw_local_dispatch_key_set; +} +#endif // defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE) + +// Internal, use ThreadLocalStateGuard +C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); + +// RAII API for manipulating the thread-local dispatch state. 
+ +class C10_API IncludeDispatchKeyGuard { + public: + IncludeDispatchKeyGuard(DispatchKeySet /*include*/); + IncludeDispatchKeyGuard(DispatchKey k) + : IncludeDispatchKeyGuard(DispatchKeySet(k)) {} + IncludeDispatchKeyGuard(const IncludeDispatchKeyGuard&) = delete; + IncludeDispatchKeyGuard operator=(const IncludeDispatchKeyGuard&) = delete; + IncludeDispatchKeyGuard(IncludeDispatchKeyGuard&&) = delete; + IncludeDispatchKeyGuard operator=(IncludeDispatchKeyGuard&&) = delete; + ~IncludeDispatchKeyGuard(); + + private: + // A little micro-optimization to save us from tls_get_addr call + // on destruction + PODLocalDispatchKeySet* tls_; + DispatchKeySet include_; +}; + +class C10_API ExcludeDispatchKeyGuard { + public: + ExcludeDispatchKeyGuard(DispatchKeySet /*exclude*/); + ExcludeDispatchKeyGuard(DispatchKey k) + : ExcludeDispatchKeyGuard(DispatchKeySet(k)) {} + ExcludeDispatchKeyGuard(const ExcludeDispatchKeyGuard&) = delete; + ExcludeDispatchKeyGuard operator=(const ExcludeDispatchKeyGuard&) = delete; + ExcludeDispatchKeyGuard(ExcludeDispatchKeyGuard&&) = delete; + ExcludeDispatchKeyGuard operator=(ExcludeDispatchKeyGuard&&) = delete; + ~ExcludeDispatchKeyGuard(); + + private: + // A little micro-optimization to save us from tls_get_addr call + // on destruction + PODLocalDispatchKeySet* tls_; + DispatchKeySet exclude_; +}; + +struct C10_API ForceDispatchKeyGuard { + public: + ForceDispatchKeyGuard() + : saved_keyset_(c10::impl::tls_local_dispatch_key_set()) {} + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) + : ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(key_set); + } + ForceDispatchKeyGuard( + c10::DispatchKeySet include, + c10::DispatchKeySet exclude) + : ForceDispatchKeyGuard() { + auto updated_set = saved_keyset_; + updated_set.included_ = include; + updated_set.excluded_ = exclude; + c10::impl::_force_tls_local_dispatch_key_set(updated_set); + } + + ForceDispatchKeyGuard(ForceDispatchKeyGuard&&) noexcept = 
delete; + ForceDispatchKeyGuard(const ForceDispatchKeyGuard&) = delete; + ForceDispatchKeyGuard& operator=(const ForceDispatchKeyGuard&) = delete; + ForceDispatchKeyGuard& operator=(ForceDispatchKeyGuard&&) = delete; + ~ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); + } + + private: + c10::impl::LocalDispatchKeySet saved_keyset_; +}; + +// Non-RAII API for manipulating the thread-local dispatch state. +// Please prefer the RAII API. The non-RAII API may be useful when +// the included/excluded state of a given DispatchKey must span +// many calls from the Python to the C++, so you cannot conveniently +// use an RAII guard. +// +// Example use case: a Python context manager that includes a certain +// DispatchKey, to ensure ops running under the context manager dispatch +// through that DispatchKey's registered overrides. +// +// The non-RAII API is less efficient than the RAII guards because both the +// getter and setter will do a tls_getaddr lookup (the RAII struct only needs +// one!) + +C10_API bool tls_is_dispatch_key_excluded(DispatchKey x); +C10_API void tls_set_dispatch_key_excluded(DispatchKey x, bool desired_state); +C10_API bool tls_is_dispatch_key_included(DispatchKey x); +C10_API void tls_set_dispatch_key_included(DispatchKey x, bool desired_state); +C10_API bool tls_is_dispatch_keyset_excluded(DispatchKeySet ks); +C10_API bool tls_is_dispatch_keyset_included(DispatchKeySet ks); + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..ce74e9b9050b3db0db196ff4ef9f3cad198c9beb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreter.h @@ -0,0 +1,257 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declarations + +namespace c10 { +struct IValue; +class OperatorHandle; +struct TensorImpl; +namespace impl { +struct PyObjectSlot; +} // namespace impl +} // namespace c10 + +namespace torch::jit { +using Stack = std::vector; +} + +// Actual implementation + +namespace c10::impl { + +struct C10_API PyInterpreter; + +// Note [Python interpreter tag] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Traditionally, PyTorch is layered such that our Python library +// (libtorch_python) references our pure C++ library (libtorch) as the +// natural order of things. However, sometimes this natural order is +// subverted: C++ objects refer to Python objects (for example, we +// store a PyObject* pointer on TensorImpl so that converting from a +// C++ Tensor to a Python Tensor is just a memory dereference). +// +// These unusual orderings must be treated with care. To start, you need to +// virtualize the destructor so that the PyObject can be decref'ed on +// destruction (because the C++ object itself doesn't know anything about +// Python--remember, layering!). This process itself is fraught, since +// acquiring the GIL could lead to deadlocks if someone is blocking on you +// while holding the GIL. 
Furthermore, if the C++ objects outlive the +// interpreter (which can happen if you stash them in a static global +// variable defined in libtorch), you may attempt to decref the object when +// the Python interpreter has already been shutdown. +// +// BUT WAIT, IT GETS WORSE. With torchdeploy, there may be multiple Python +// interpreters in a single process. If a C++ object is accessible from +// multiple interpreters, we must take care not to accidentally pass a +// PyObject from one interpreter with another interpreter. +// +// To prevent these mixups, we introduce a PyInterpreter "tag" (object with +// a vtable), which specifies a specific Python interpreter. +// +// - Any given object can be associated with AT MOST one Python interpreter. +// We represent the interpreter tag as a memory address to an instance of +// a virtual class that is allocated once per interpreter (this is so that +// we can request the interpreter to perform operations for us, if +// necessary). +// +// - It can be recorded with a PyObject (PyInterpreterObject) so that +// we know what interpreter the object is associated with, and we can +// raise an error if you try to use the PyObject from the wrong +// interpreter context. +// +// - It contains a vtable that can be used to perform various Python +// operations from ordinary C++ code that ordinarily wouldn't be accessible +// from libtorch. +// +// A simple use case is when a C++ object must be associated with a PyObject. +// However, for TensorImpl, we lazily allocate a PyObject the first time the +// object passes into Python. The invariants for this situation are more +// subtle: +// +// - A given TensorImpl's interpreter tag can only go from uninitialized to +// tagged; once tagged, this is a quiescent state (once tagged to an +// interpreter, ALWAYS tagged to that interpreter) +// +// - A thread may mutate the PyObject field of a TensorImpl if and only if it +// holds the GIL for the interpreter tagged on the TensorImpl. 
(If the +// TensorImpl is not tagged, it must first atomically claim its tag before it +// can validly write) +// +// WARNING: This class has to be written very carefully, because it may be +// possible for a Tensor to have a reference an interpreter corresponding to +// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling +// virtual methods very dangerous, because the vtable may be garbage at that +// point (on a good day, you might get "pure virtual method called"). +// +// The idea to solve this problem is we always leak PyInterpreters (so they +// always stay live even after dlclose), and make sure we can disarm their +// virtual methods by indirecting through a separate PyInterpreterVTable +// object. This can be replaced with a no-op vtable from libc10.so, which +// is guaranteed to stick around until the bitter end. +// +// NB: The downside with representing PyInterpreter tags as full objects is that +// it takes an extra word on TensorImpl. If tags were instead just integer +// indices, on 64-bit architectures we could pack the tag and PyObject together +// into a single atomic word. On 32-bit architectures we could simply say that +// only one Python interpreter is supported (erroring if a nontrivial +// interpreter tag is attempted to be set). +// +// The difficulty with this scheme is we need to maintain an out-of-line table +// to get at the PyInterpreters so that we can do virtual method calls on them, +// and registration/deregistration to this table must be done in a thread safe +// manner. This can be easily done if the number of possible PyInterpreters is +// small enough (e.g., 8-bit integer) by simply preallocating an array of +// sufficient size to hold all possible interpreters. Surely 128 threads is +// more than enough for anyone! 
+// +// I didn't decide to do this technique at the moment, because the extra word +// added by the PyInterpreter tag takes us to 24 words, which means that we +// still fit inside three eight word cache lines. If you need to penny pinch +// another word consider doing this! + +struct C10_API PyInterpreterVTable { + virtual ~PyInterpreterVTable() = default; + + // Report the name of this interpreter + virtual std::string name() const = 0; + + // Run Py_INCREF on a PyObject. + virtual void incref(PyObject* pyobj) const = 0; + // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call. + virtual void decref(PyObject* pyobj) const = 0; + // Run PyUnstable_TryIncRef on a PyObject if it's not NULL. + virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0; + // Run Py_REFCNT on a PyObject. + virtual size_t refcnt(PyObject* pyobj) const = 0; + + // Perform a detach by deferring to the __torch_dispatch__ implementation of + // detach, which will also arrange for the PyObject to get copied in this + // situation + virtual c10::intrusive_ptr detach( + const TensorImpl* self) const = 0; + + // Invoke the Python boxed fallback dispatch to go back into Python + virtual void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack) + const = 0; + + virtual void reportErrorCallback(PyObject* callback, DispatchKey key) + const = 0; + + // This is only invoked in the multipy/torchdeploy // codespell:ignore multipy + // situation from pythonOpRegistrationTrampoline; this lets us get to the + // Python interpreter to actually find the appropriate Python op registration + // entry to call. 
+ virtual void python_op_registration_trampoline( + const c10::OperatorHandle& op, + c10::DispatchKey, + c10::DispatchKeySet keyset, + torch::jit::Stack* stack, + bool with_keyset, + bool with_op) const = 0; + + virtual void throw_abstract_impl_not_imported_error( + std::string opname, + const char* pymodule, + const char* context) const = 0; + + // Invoke the Python dispatcher to handle this call + virtual void python_dispatcher( + const c10::OperatorHandle& op, + c10::DispatchKeySet, + torch::jit::Stack* stack) const = 0; + + virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat) + const = 0; + virtual c10::SymBool sym_is_contiguous( + const TensorImpl* self, + at::MemoryFormat) const = 0; + virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat) + const = 0; + virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0; + virtual c10::Device device(const TensorImpl* self) const = 0; + virtual int64_t dim(const TensorImpl* self) const = 0; + virtual c10::IntArrayRef strides(const TensorImpl* self) const = 0; + virtual c10::IntArrayRef sizes(const TensorImpl* self) const = 0; + virtual c10::SymIntArrayRef sym_sizes(const TensorImpl* self) const = 0; + virtual c10::Layout layout(const TensorImpl* self) const = 0; + virtual int64_t numel(const TensorImpl* self) const = 0; + virtual c10::SymInt sym_numel(const TensorImpl* self) const = 0; + virtual c10::SymIntArrayRef sym_strides(const TensorImpl* self) const = 0; + virtual c10::SymInt sym_storage_offset(const TensorImpl* self) const = 0; + + virtual void trace_gpu_event_creation( + c10::DeviceType device_type, + uintptr_t event) const = 0; + virtual void trace_gpu_event_deletion( + c10::DeviceType device_type, + uintptr_t event) const = 0; + virtual void trace_gpu_event_record( + c10::DeviceType device_type, + uintptr_t event, + uintptr_t stream) const = 0; + virtual void trace_gpu_event_wait( + c10::DeviceType device_type, + uintptr_t event, + uintptr_t stream) const = 
0; + virtual void trace_gpu_memory_allocation( + c10::DeviceType device_type, + uintptr_t ptr) const = 0; + virtual void trace_gpu_memory_deallocation( + c10::DeviceType device_type, + uintptr_t ptr) const = 0; + virtual void trace_gpu_stream_creation( + c10::DeviceType device_type, + uintptr_t stream) const = 0; + virtual void trace_gpu_device_synchronization( + c10::DeviceType device_type) const = 0; + virtual void trace_gpu_stream_synchronization( + c10::DeviceType device_type, + uintptr_t stream) const = 0; + virtual void trace_gpu_event_synchronization( + c10::DeviceType device_type, + uintptr_t event) const = 0; + + virtual void reset_backward_hooks(const TensorImpl* self) const = 0; +}; + +struct C10_API PyInterpreter { + const PyInterpreterVTable* vtable_; + + PyInterpreter(const PyInterpreterVTable* vtable) : vtable_(vtable) {} + + const PyInterpreterVTable& operator*() const noexcept { + return *vtable_; + } + const PyInterpreterVTable* operator->() const noexcept { + return vtable_; + } + + // Disarm this PyInterpreter, making all of its methods noops. + // The vtable pointer is not an atomic at the moment, which means + // a disarm() invocation that is concurrent with active destructors + // is not thread safe and will trigger TSAN. My hope is that this + // situations doesn't ever actually happen; tensor destruction should + // quiesce when a dlclose happens, and any long lived tensors whose + // destructors would be disarmed here only begin the destruction process + // on process shutdown (long after the dlclose has occurred). + void disarm() noexcept; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreterHooks.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreterHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..acd2003569302cffcce5a907bd7fd506ac984a7b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreterHooks.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace c10::impl { + +// Minimal interface for PyInterpreter hooks +struct C10_API PyInterpreterHooksInterface { + virtual ~PyInterpreterHooksInterface() = default; + + // Get the PyInterpreter instance + // Stub implementation throws error when Python is not available + virtual PyInterpreter* getPyInterpreter() const { + TORCH_CHECK( + false, + "PyTorch was compiled without Python support. " + "Cannot access Python interpreter from C++."); + } +}; + +struct C10_API PyInterpreterHooksArgs{}; + +C10_DECLARE_REGISTRY( + PyInterpreterHooksRegistry, + PyInterpreterHooksInterface, + PyInterpreterHooksArgs); + +#define REGISTER_PYTHON_HOOKS(clsname) \ + C10_REGISTER_CLASS(PyInterpreterHooksRegistry, clsname, clsname) + +// Get the global PyInterpreter hooks instance +C10_API const PyInterpreterHooksInterface& getPyInterpreterHooks(); + +// Helper function to get the global interpreter +C10_API PyInterpreter* getGlobalPyInterpreter(); + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyObjectSlot.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyObjectSlot.h new file mode 100644 index 0000000000000000000000000000000000000000..8ba0688f66e597d4398d4a7d0407b2683ceb30aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyObjectSlot.h @@ -0,0 +1,70 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace torch::utils { +class PyObjectPreservation; +} + +namespace c10::impl { + +struct C10_API PyObjectSlot { + public: + PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} + + // Query the PyObject interpreter. This may return null if there is no + // interpreter. + PyInterpreter* pyobj_interpreter() const { + return pyobj_interpreter_.load(std::memory_order_acquire); + } + + PyInterpreter& load_pyobj_interpreter() const { + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + TORCH_INTERNAL_ASSERT( + interpreter, "cannot access PyObject for Tensor - no interpreter set"); + return *interpreter; + } + + PyObject* load_pyobj() const { + return pyobj_.load(std::memory_order_acquire); + } + + void store_pyobj(PyObject* obj) { + pyobj_.store(obj, std::memory_order_release); + } + + bool has_unique_reference() const { + PyObject* pyobj = load_pyobj(); + return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1; + } + + void clear() { + pyobj_.store(nullptr, std::memory_order_relaxed); + pyobj_interpreter_.store(nullptr, std::memory_order_relaxed); + } + + private: + // This is now always the global interpreter if the PyObject is set. + // Maybe we can remove this field some day... 
+ std::atomic pyobj_interpreter_; + + // The PyObject representing this Tensor or nullptr. Ownership is managed + // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this + // reference is already dead. + std::atomic pyobj_; + + friend class torch::utils::PyObjectPreservation; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..cffb7fc31e3d18b4544027b261b98c686f81274a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::impl { + +struct C10_API PythonDispatcherTLS { + static void set_state(PyInterpreter* state); + static PyInterpreter* get_state(); + static void reset_state(); +}; + +struct C10_API DisablePythonDispatcher { + DisablePythonDispatcher() : old_(PythonDispatcherTLS::get_state()) { + PythonDispatcherTLS::set_state({}); + } + + DisablePythonDispatcher(DisablePythonDispatcher&& other) = delete; + DisablePythonDispatcher(const DisablePythonDispatcher&) = delete; + DisablePythonDispatcher& operator=(const DisablePythonDispatcher&) = delete; + DisablePythonDispatcher& operator=(DisablePythonDispatcher&&) = delete; + ~DisablePythonDispatcher() { + PythonDispatcherTLS::set_state(old_); + } + PyInterpreter* old_; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/SizesAndStrides.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/SizesAndStrides.h new file mode 100644 index 0000000000000000000000000000000000000000..da3a9a0c4abacf6165ca946e62257771cf2790ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/SizesAndStrides.h @@ -0,0 +1,336 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include + +#define C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE 5 + +namespace c10::impl { + +// Packed container for TensorImpl sizes and strides. +// This design improves on the previous approach of using a pair of +// c10::SmallVector by specializing for the operations we +// actually use and enforcing that the number of sizes is the same as +// the number of strides. The memory layout is as follows: +// +// 1 size_t for the size +// 5 eightbytes of inline sizes and 5 eightbytes of inline strides, OR pointer +// to out-of-line array +class C10_API SizesAndStrides { + public: + // TODO: different iterator types for sizes & strides to prevent + // mixing the two accidentally. 
+ using sizes_iterator = int64_t*; + using sizes_const_iterator = const int64_t*; + using strides_iterator = int64_t*; + using strides_const_iterator = const int64_t*; + + SizesAndStrides() { + size_at_unchecked(0) = 0; + stride_at_unchecked(0) = 1; + } + + ~SizesAndStrides() { + if (C10_UNLIKELY(!isInline())) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + } + + SizesAndStrides(const SizesAndStrides& rhs) : size_(rhs.size_) { + if (C10_LIKELY(rhs.isInline())) { + copyDataInline(rhs); + } else { + allocateOutOfLineStorage(size_); + copyDataOutline(rhs); + } + } + + bool operator==(const SizesAndStrides& other) const { + if (size_ != other.size_) { + return false; + } + return !( + isInline() + ? std::memcmp( + inlineStorage_, other.inlineStorage_, sizeof(inlineStorage_)) + : std::memcmp( + outOfLineStorage_, + other.outOfLineStorage_, + storageBytes(size_))); + } + + bool operator!=(const SizesAndStrides& other) const { + return !(*this == other); + } + + SizesAndStrides& operator=(const SizesAndStrides& rhs) { + if (this == &rhs) { + return *this; + } + if (C10_LIKELY(rhs.isInline())) { + if (C10_UNLIKELY(!isInline())) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + copyDataInline(rhs); + } else { + if (isInline()) { + allocateOutOfLineStorage(rhs.size_); + } else { + resizeOutOfLineStorage(rhs.size_); + } + copyDataOutline(rhs); + } + size_ = rhs.size_; + return *this; + } + + // Move from rhs. rhs.size() == 0 afterwards. + SizesAndStrides(SizesAndStrides&& rhs) noexcept : size_(rhs.size_) { + if (C10_LIKELY(isInline())) { + memcpy(inlineStorage_, rhs.inlineStorage_, sizeof(inlineStorage_)); + } else { + outOfLineStorage_ = rhs.outOfLineStorage_; + rhs.outOfLineStorage_ = nullptr; + } + + rhs.size_ = 0; + } + + // Move from rhs. rhs.size() == 0 afterwards. 
+ SizesAndStrides& operator=(SizesAndStrides&& rhs) noexcept { + if (this == &rhs) { + return *this; + } + if (C10_LIKELY(rhs.isInline())) { + if (C10_UNLIKELY(!isInline())) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + copyDataInline(rhs); + } else { + // They're outline. We're going to steal their vector. + if (!isInline()) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + outOfLineStorage_ = rhs.outOfLineStorage_; + rhs.outOfLineStorage_ = nullptr; + } + size_ = rhs.size_; + rhs.size_ = 0; + + return *this; + } + + size_t size() const noexcept { + return size_; + } + + const int64_t* sizes_data() const noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[0]; + } else { + return &outOfLineStorage_[0]; + } + } + + int64_t* sizes_data() noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[0]; + } else { + return &outOfLineStorage_[0]; + } + } + + sizes_const_iterator sizes_begin() const noexcept { + return sizes_data(); + } + + sizes_iterator sizes_begin() noexcept { + return sizes_data(); + } + + sizes_const_iterator sizes_end() const noexcept { + return sizes_begin() + size(); + } + + sizes_iterator sizes_end() noexcept { + return sizes_begin() + size(); + } + + IntArrayRef sizes_arrayref() const noexcept { + return IntArrayRef{sizes_data(), size()}; + } + + void set_sizes(IntArrayRef newSizes) { + resize(newSizes.size()); + std::copy(newSizes.begin(), newSizes.end(), sizes_begin()); + } + + void set_strides(IntArrayRef strides) { + TORCH_INTERNAL_ASSERT(strides.size() == size()); + std::copy(strides.begin(), strides.end(), strides_begin()); + } + + const int64_t* strides_data() const noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + int64_t* strides_data() noexcept { + if (C10_LIKELY(isInline())) { + return 
&inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + strides_const_iterator strides_begin() const noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + strides_iterator strides_begin() noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + strides_const_iterator strides_end() const noexcept { + return strides_begin() + size(); + } + + strides_iterator strides_end() noexcept { + return strides_begin() + size(); + } + + IntArrayRef strides_arrayref() const noexcept { + return IntArrayRef{strides_data(), size()}; + } + + // Size accessors. + int64_t size_at(size_t idx) const noexcept { + assert(idx < size()); + return sizes_data()[idx]; + } + + int64_t& size_at(size_t idx) noexcept { + assert(idx < size()); + return sizes_data()[idx]; + } + + int64_t size_at_unchecked(size_t idx) const noexcept { + return sizes_data()[idx]; + } + + int64_t& size_at_unchecked(size_t idx) noexcept { + return sizes_data()[idx]; + } + + // Size accessors. 
+ int64_t stride_at(size_t idx) const noexcept { + assert(idx < size()); + return strides_data()[idx]; + } + + int64_t& stride_at(size_t idx) noexcept { + assert(idx < size()); + return strides_data()[idx]; + } + + int64_t stride_at_unchecked(size_t idx) const noexcept { + return strides_data()[idx]; + } + + int64_t& stride_at_unchecked(size_t idx) noexcept { + return strides_data()[idx]; + } + + void resize(size_t newSize) { + const auto oldSize = size(); + if (newSize == oldSize) { + return; + } + if (C10_LIKELY( + newSize <= C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE && isInline())) { + if (oldSize < newSize) { + const auto bytesToZero = + (newSize - oldSize) * sizeof(inlineStorage_[0]); + memset(&inlineStorage_[oldSize], 0, bytesToZero); + memset( + &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE + oldSize], + 0, + bytesToZero); + } + size_ = newSize; + } else { + resizeSlowPath(newSize, oldSize); + } + } + + void resizeSlowPath(size_t newSize, size_t oldSize); + + private: + bool isInline() const noexcept { + return size_ <= C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE; + } + + void copyDataInline(const SizesAndStrides& rhs) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.isInline()); + memcpy(inlineStorage_, rhs.inlineStorage_, sizeof(inlineStorage_)); + } + + static size_t storageBytes(size_t size) noexcept { + return size * 2 * sizeof(int64_t); + } + + void allocateOutOfLineStorage(size_t size) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + outOfLineStorage_ = static_cast(malloc(storageBytes(size))); + TORCH_CHECK( + outOfLineStorage_, + "Could not allocate memory for Tensor SizesAndStrides!"); + } + + void resizeOutOfLineStorage(size_t newSize) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isInline()); + outOfLineStorage_ = static_cast( + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + realloc(outOfLineStorage_, storageBytes(newSize))); + TORCH_CHECK( + outOfLineStorage_, + "Could not allocate memory for Tensor SizesAndStrides!"); + } + + void copyDataOutline(const 
SizesAndStrides& rhs) noexcept { + memcpy(outOfLineStorage_, rhs.outOfLineStorage_, storageBytes(rhs.size_)); + } + + size_t size_{1}; + union { + int64_t* outOfLineStorage_; + // NOLINTNEXTLINE(*c-array*) + int64_t inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE * 2]{}; + }; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..002bf4283806448b0cf9470116758b21fa5499e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::impl { + +enum class TorchDispatchModeKey : int8_t { + FAKE, + PROXY, + FUNCTIONAL, + NUM_MODE_KEYS +}; + +using PyObject_TorchDispatchMode = SafePyObjectT; + +struct C10_API TorchDispatchModeTLS { + // This API is NOT invariant safe. + // It must not take in an infra mode that uses TorchDispatchModeKey + // If you're pushing an infra mode onto the stack, we expect + // you to use set_mode + static void push_non_infra_mode_onto_stack( + std::shared_ptr mode); + // Pops the top mode of the stack, + // giving precedence to user modes before attempting to pop + // any infra modes + static const std::shared_ptr pop_stack(); + // Returns the highest-priority infra mode on the stack, + // along with its mode key. 
+ static const std:: + tuple, TorchDispatchModeKey> + pop_highest_infra_mode(); + + static const std::shared_ptr& get_stack_at( + int64_t idx); + static int64_t stack_len(); + + static const std::optional> + get_mode(TorchDispatchModeKey mode_key); + static const std::optional> + unset_mode(TorchDispatchModeKey mode_key); + static void set_mode( + const std::shared_ptr& mode, + TorchDispatchModeKey mode_key); + + static const TorchDispatchModeTLS& get_state(); + static void set_state(TorchDispatchModeTLS state); + + static bool any_modes_set(bool skip_infra_modes = false); + + private: + std::vector> stack_; + // Users are allowed to push multiple ProxyTorchDispatchMode objects onto the + // stack + // However, we only allow a single FakeTensorMode onto the stack at a time + // (Pushing additional FakeTensorModes onto the stack is a no-op) + std::array< + std::optional>, + static_cast(TorchDispatchModeKey::NUM_MODE_KEYS)> + infra_modes_; +}; + +C10_API bool dispatch_mode_enabled(); + +C10_API std::string to_string(TorchDispatchModeKey mode_key); + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..16b1970bfa1bbc7d6dc9c1a0463d17f3cb08b9fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h @@ -0,0 +1,117 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::impl { + +/** + * An implementation of DeviceGuardImplInterface which delegates + * to virtual dispatch on the DeviceGuardImpl registry. + */ +class VirtualGuardImpl final : public DeviceGuardImplInterface { + public: + VirtualGuardImpl(DeviceType device_type) + : impl_(getDeviceGuardImpl(device_type)) {} + // This constructor exists purely for testing + VirtualGuardImpl(const DeviceGuardImplInterface* impl) : impl_(impl) {} + + // Copying and moving is OK! 
+ VirtualGuardImpl(const VirtualGuardImpl&) = default; + VirtualGuardImpl& operator=(const VirtualGuardImpl&) = default; + VirtualGuardImpl(VirtualGuardImpl&&) noexcept = default; + VirtualGuardImpl& operator=(VirtualGuardImpl&&) noexcept = default; + ~VirtualGuardImpl() override = default; + + DeviceType type() const override { + return impl_->type(); + } + Device exchangeDevice(Device d) const override { + return impl_->exchangeDevice(d); + } + Device getDevice() const override { + return impl_->getDevice(); + } + void setDevice(Device d) const override { + impl_->setDevice(d); + } + void uncheckedSetDevice(Device d) const noexcept override { + impl_->uncheckedSetDevice(d); + } + Stream getStream(Device d) const override { + return impl_->getStream(d); + } + Stream getNewStream(Device d, int priority = 0) const override { + return impl_->getNewStream(d, priority); + } + Stream getDefaultStream(Device d) const override { + return impl_->getDefaultStream(d); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) + const override { + return impl_->getStreamFromGlobalPool(d, isHighPriority); + } + Stream exchangeStream(Stream s) const override { + return impl_->exchangeStream(s); + } + DeviceIndex deviceCount() const noexcept override { + return impl_->deviceCount(); + } + + DeviceCapability getDeviceCapability(Device d) const override { + return impl_->getDeviceCapability(d); + } + + // Event functions + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + impl_->record(event, stream, device_index, flag); + } + void block(void* event, const Stream& stream) const override { + impl_->block(event, stream); + } + bool queryEvent(void* event) const override { + return impl_->queryEvent(event); + } + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override { + impl_->destroyEvent(event, device_index); + } + + bool queryStream(const Stream& 
stream) const override { + return impl_->queryStream(stream); + } + void synchronizeStream(const Stream& stream) const override { + impl_->synchronizeStream(stream); + } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) + const override { + impl_->recordDataPtrOnStream(data_ptr, stream); + } + + double elapsedTime(void* event1, void* event2, const DeviceIndex device_index) + const override { + return impl_->elapsedTime(event1, event2, device_index); + } + + void synchronizeEvent(void* event) const override { + impl_->synchronizeEvent(event); + } + + void synchronizeDevice(const DeviceIndex device_index) const override { + impl_->synchronizeDevice(device_index); + } + + private: + const DeviceGuardImplInterface* impl_ = nullptr; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/alloc_cpu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/alloc_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..ef28ed469f010d3aedeb5d68ad5405c2ffdaa055 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/alloc_cpu.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include + +namespace c10 { + +C10_API void* alloc_cpu(size_t nbytes); +C10_API void free_cpu(void* data); + +#if defined(__linux__) && !defined(__ANDROID__) +C10_API size_t c10_compute_alignment(size_t nbytes); +#endif + +#ifdef USE_MIMALLOC_ON_MKL +namespace mi_malloc_wrapper { +C10_API void* c10_mi_malloc(size_t size); +C10_API void* c10_mi_calloc(size_t count, size_t size); +C10_API void* c10_mi_realloc(void* p, size_t newsize); 
+C10_API void* c10_mi_malloc_aligned(size_t size, size_t alignment); +C10_API void c10_mi_free(void* p); +} // namespace mi_malloc_wrapper +#endif + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/thread_pool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/thread_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..85b9a73d6bfa7bdf5a815c6e659f0c4af6bd8ef8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/thread_pool.h @@ -0,0 +1,125 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +class C10_API TaskThreadPoolBase { + public: + virtual void run(std::function func) = 0; + + virtual size_t size() const = 0; + + /** + * The number of available (i.e. idle) threads in this thread pool. + */ + virtual size_t numAvailable() const = 0; + + /** + * Check if the current thread is from the thread pool. 
+ */ + virtual bool inThreadPool() const = 0; + + virtual ~TaskThreadPoolBase() noexcept = default; + + static size_t defaultNumThreads(); +}; + +class C10_API ThreadPool : public c10::TaskThreadPoolBase { + protected: + struct task_element_t { + bool run_with_id; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const std::function no_id; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const std::function with_id; + + explicit task_element_t(std::function f) + : run_with_id(false), no_id(std::move(f)), with_id(nullptr) {} + explicit task_element_t(std::function f) + : run_with_id(true), no_id(nullptr), with_id(std::move(f)) {} + }; + + std::queue tasks_; + std::vector threads_; + mutable std::mutex mutex_; + std::condition_variable condition_; + std::condition_variable completed_; + std::atomic_bool running_; + bool complete_; + std::size_t available_; + std::size_t total_; + int numa_node_id_; + + public: + ThreadPool() = delete; + + explicit ThreadPool( + int pool_size, + int numa_node_id = -1, + const std::function& init_thread = nullptr); + + ~ThreadPool() override; + + size_t size() const override; + + size_t numAvailable() const override; + + bool inThreadPool() const override; + + void run(std::function func) override; + + template + void runTaskWithID(Task task) { + std::unique_lock lock(mutex_); + + // Set task and signal condition variable so that a worker thread will + // wake up and use the task. + tasks_.emplace(static_cast>(task)); + complete_ = false; + condition_.notify_one(); + } + + /// @brief Wait for queue to be empty + void waitWorkComplete(); + + private: + // @brief Entry point for pool threads. 
+ void main_loop(std::size_t index); +}; + +class C10_API TaskThreadPool : public c10::ThreadPool { + public: + explicit TaskThreadPool(int pool_size, int numa_node_id = -1) + : ThreadPool(pool_size, numa_node_id, [numa_node_id]() { + setThreadName("CaffeTaskThread"); + NUMABind(numa_node_id); + }) {} +}; + +C10_DECLARE_SHARED_REGISTRY( + ThreadPoolRegistry, + TaskThreadPoolBase, + int, + int, + bool); + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h new file mode 100644 index 0000000000000000000000000000000000000000..62995e142a3e84bf83e2e7143cdc6bc8eb67f91f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS +#include +#include +#include +#include +#endif +namespace c10::cuda { +#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS +template +__forceinline__ __device__ Iter +lower_bound(Iter start, Iter end, Scalar value) { + return thrust::lower_bound(thrust::device, start, end, value); +} +#else +// thrust::lower_bound is broken on device, see +// https://github.com/NVIDIA/thrust/issues/1734 Implementation inspired by +// https://github.com/pytorch/pytorch/blob/805120ab572efef66425c9f595d9c6c464383336/aten/src/ATen/native/cuda/Bucketization.cu#L28 +template +__device__ Iter lower_bound(Iter start, Iter end, Scalar value) { + while (start < end) { + auto mid = start + ((end - start) >> 1); + if (*mid < value) { + start = mid + 1; + } else { + end = mid; + } + } + return end; +} +#endif // THRUST_DEVICE_LOWER_BOUND_WORKS +} 
// namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h new file mode 100644 index 0000000000000000000000000000000000000000..286eb3daecb5aa73711392c839776ab5e0444275 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h @@ -0,0 +1,211 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10::cuda::CUDACachingAllocator { + +enum class Expandable_Segments_Handle_Type : int { + UNSPECIFIED = 0, + POSIX_FD = 1, + FABRIC_HANDLE = 2, +}; + +// Environment config parser +class C10_CUDA_API CUDAAllocatorConfig { + public: + C10_DEPRECATED_MESSAGE( + "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.") + static size_t max_split_size() { + return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size(); + } + + C10_DEPRECATED_MESSAGE( + "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. 
Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.") + static double garbage_collection_threshold() { + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + garbage_collection_threshold(); + } + + static bool expandable_segments() { + bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig:: + use_expandable_segments(); +#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED + if (enabled) { + TORCH_WARN_ONCE("expandable_segments not supported on this platform") + } + return false; +#else + return enabled; +#endif + } + + static Expandable_Segments_Handle_Type expandable_segments_handle_type() { + return instance().m_expandable_segments_handle_type; + } + + static void set_expandable_segments_handle_type( + Expandable_Segments_Handle_Type handle_type) { + instance().m_expandable_segments_handle_type = handle_type; + } + + static bool release_lock_on_cudamalloc() { + return instance().m_release_lock_on_cudamalloc; + } + + static bool graph_capture_record_stream_reuse() { + return instance().m_graph_capture_record_stream_reuse; + } + + static double per_process_memory_fraction() { + return instance().m_per_process_memory_fraction; + } + + /** Pinned memory allocator settings */ + static bool pinned_use_cuda_host_register() { + return instance().m_pinned_use_cuda_host_register; + } + + static size_t pinned_num_register_threads() { + return instance().m_pinned_num_register_threads; + } + + C10_DEPRECATED_MESSAGE( + "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. 
Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.") + static bool pinned_use_background_threads() { + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + pinned_use_background_threads(); + } + + static size_t pinned_reserve_segment_size_mb() { + return instance().m_pinned_reserve_segment_size_mb; + } + + static size_t pinned_max_register_threads() { + // Based on the benchmark results, we see better allocation performance + // with 8 threads. However on future systems, we may need more threads + // and limiting this to 128 threads. + return 128; + } + + C10_DEPRECATED_MESSAGE( + "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") + static size_t roundup_power2_divisions(size_t size) { + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + roundup_power2_divisions(size); + } + + C10_DEPRECATED_MESSAGE( + "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") + static std::vector roundup_power2_divisions() { + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + roundup_power2_divisions(); + } + + static size_t max_non_split_rounding_size() { + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + max_non_split_rounding_size(); + } + + C10_DEPRECATED_MESSAGE( + "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. 
Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.") + static std::string last_allocator_settings() { + return c10::CachingAllocator::getAllocatorSettings(); + } + + static CUDAAllocatorConfig& instance() { + static CUDAAllocatorConfig* s_instance = ([]() { + auto inst = new CUDAAllocatorConfig(); + auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); +#ifdef USE_ROCM + // convenience for ROCm users, allow alternative HIP token + if (!env.has_value()) { + env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF"); + } +#endif + // Note: keep the parsing order and logic stable to avoid potential + // performance regressions in internal tests. + if (!env.has_value()) { + env = c10::utils::get_env("PYTORCH_ALLOC_CONF"); + } + if (env.has_value()) { + inst->parseArgs(env.value()); + } + return inst; + })(); + return *s_instance; + } + + // Use `Construct On First Use Idiom` to avoid `Static Initialization Order` + // issue. + static const std::unordered_set& getKeys() { + static std::unordered_set keys{ + "backend", + // keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues + // NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors) + "release_lock_on_cud" + "amalloc", + "pinned_use_cud" + "a_host_register", + // NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors) + "release_lock_on_hipmalloc", + "pinned_use_hip_host_register", + "graph_capture_record_stream_reuse", + "pinned_reserve_segment_size_mb", + "pinned_num_register_threads", + "per_process_memory_fraction"}; + return keys; + } + + void parseArgs(const std::string& env); + + private: + CUDAAllocatorConfig() = default; + + size_t parseAllocatorConfig( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i, + bool& used_cudaMallocAsync); + size_t parsePinnedUseCudaHostRegister( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); + size_t parsePinnedNumRegisterThreads( + const 
c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); + size_t parsePinnedReserveSegmentSize( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); + size_t parseGraphCaptureRecordStreamReuse( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); + double parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); + + std::atomic m_pinned_num_register_threads{1}; + std::atomic m_pinned_reserve_segment_size_mb{0}; + std::atomic m_expandable_segments_handle_type +#if CUDA_VERSION >= 12030 + {Expandable_Segments_Handle_Type::UNSPECIFIED}; +#else + {Expandable_Segments_Handle_Type::POSIX_FD}; +#endif + std::atomic m_release_lock_on_cudamalloc{false}; + std::atomic m_pinned_use_cuda_host_register{false}; + std::atomic m_graph_capture_record_stream_reuse{false}; + std::atomic m_per_process_memory_fraction{1.0}; +}; + +// Keep this for backwards compatibility +using c10::CachingAllocator::setAllocatorSettings; + +} // namespace c10::cuda::CUDACachingAllocator + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..b425157814aa15296d38633501e47035e2804130 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h @@ -0,0 +1,582 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Caching allocator will execute every registered callback if it unable to find +// block inside of already allocated area. +class C10_CUDA_API FreeMemoryCallback { + public: + virtual ~FreeMemoryCallback() = default; + virtual bool Execute() = 0; +}; + +C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); +#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \ + C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__) +} // namespace c10 + // +// TODO: Turn this into an honest to goodness class. I briefly attempted to do +// this, but it was a bit irritating to figure out how to also correctly +// apply pimpl pattern so I didn't have to leak any internal implementation +// details in the header (CUDACachingAllocator could be made a pimpl, but +// you also need to appropriately define a class which is a subclass +// of Allocator. Not impossible, but required a bit more surgery than +// I wanted to do at the time.) +// +// Why is this using a namespace rather than old-style THCCachingAllocator_ +// prefix? 
Mostly because it made the HIPify rules easier to write; _ is +// not counted as a word boundary, so you would otherwise have to list each +// of these functions. + +namespace c10::cuda::CUDACachingAllocator { + +// Preserved only for BC reasons +// NOLINTNEXTLINE(misc-unused-using-decls) +using c10::CachingAllocator::kLargeBuffer; +using c10::CachingDeviceAllocator::DeviceStats; + +typedef std::shared_ptr (*CreateContextFn)(); + +// Struct containing info of an allocation block (i.e. a fractional part of a +// cudaMalloc).. +struct BlockInfo { + size_t size = 0; + size_t requested_size = 0; + int32_t gc_counter = 0; + bool allocated = false; + bool active = false; + std::shared_ptr + context_when_allocated; // per-watcher context +}; + +// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). +struct SegmentInfo { + c10::DeviceIndex device = 0; + size_t address = 0; + size_t total_size = 0; + size_t requested_size = 0; // unrounded, actually requested size + size_t allocated_size = 0; + size_t active_size = 0; + cudaStream_t stream = nullptr; + bool is_large = false; + bool is_expandable = false; + MempoolId_t owner_private_pool_id = {0, 0}; + std::vector blocks; + std::shared_ptr context_when_allocated; +}; + +struct AllocatorState { + virtual ~AllocatorState() = default; +}; + +union trace_time_ { + time_t t_; + approx_time_t approx_t_; +}; + +struct TraceEntry { + enum Action { + ALLOC, // API made to the caching allocator for new memory + FREE_REQUESTED, // API call made to the caching allocator to free memory + FREE_COMPLETED, // The allocator might have to delay a free because + // it is still in use on another stream via record_stream + // This event is generated when a free actually completes. + SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS + SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. 
to + // defragment or empty_caches) + SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments) + SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments) + SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace + // events + OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free + // bytes reported by cuda) + }; + TraceEntry( + Action action, + c10::DeviceIndex device, + size_t addr, + size_t size, + cudaStream_t stream, + MempoolId_t mempool, + approx_time_t time, + std::shared_ptr context = nullptr, + std::string compile_context = "", + std::string user_metadata = "") + : action_(action), + device_(device), + addr_(addr), + context_(std::move(context)), + stream_(stream), + size_(size), + mempool_(std::move(mempool)), + compile_context_(std::move(compile_context)), + user_metadata_(std::move(user_metadata)) { + time_.approx_t_ = time; + } + Action action_; + c10::DeviceIndex device_; + size_t addr_; // for OOM, this is the amount of free bytes reported by cuda + std::shared_ptr context_; + cudaStream_t stream_{}; + size_t size_; + MempoolId_t mempool_; + trace_time_ time_{}; + std::string compile_context_; + std::string user_metadata_; +}; + +// Calls made by record_function will save annotations +struct AnnotationEntry { + AnnotationEntry(c10::DeviceIndex device, approx_time_t time) + : device_(device) { + time_.approx_t_ = time; + } + + void recordUserMetadata(const std::string& name, std::string value) { + metadata_[name] = std::move(value); + } + + c10::DeviceIndex device_; + trace_time_ time_{}; + std::unordered_map metadata_; +}; + +struct AllocatorConfigInfo { + double garbage_collection_threshold; + size_t max_split_size; + size_t pinned_num_register_threads; + bool expandable_segments; + bool release_lock_on_malloc; + bool pinned_use_host_register; + bool graph_capture_record_stream_reuse; + std::string last_allocator_settings; + std::vector roundup_power2_divisions; +}; + +struct 
SnapshotInfo { + std::vector segments; + std::vector> device_traces; + std::vector external_annotations; + AllocatorConfigInfo config_metadata; +}; + +// returns the pointers freed in the pool +// and the pointers allocated. Note: a pointer +// may appear in both freed and allocated +struct CheckpointDelta { + std::vector ptrs_freed; + std::vector dataptrs_allocd; +}; + +enum struct RecordContext { + NEVER = 0, + STATE = 1, // only keep stacks for active allocations + ALLOC = 2, // additionally keep stacks for allocations in the trace history + ALL = 3, // additionally record stacks for when something is freed +}; + +using OutOfMemoryObserver = std::function; + +using AllocatorTraceTracker = std::function; + +struct ShareableHandle { + ptrdiff_t offset; + std::string handle; +}; + +struct StreamSegmentSize { + StreamSegmentSize(cudaStream_t s, bool small, size_t sz) + : stream(s), is_small_pool(small), total_size(sz) {} + cudaStream_t stream; + bool is_small_pool; + size_t total_size; +}; + +class CUDAAllocator : public DeviceAllocator { + public: + virtual void* raw_alloc(size_t nbytes) = 0; + virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0; + virtual void raw_delete(void* ptr) = 0; + virtual void init(int device_count) = 0; + virtual double getMemoryFraction(c10::DeviceIndex device) = 0; + virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0; + virtual std::vector getExpandableSegmentSizes( + c10::DeviceIndex device) = 0; + virtual void enable(bool value) = 0; + virtual bool isEnabled() const = 0; + virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0; + virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; + // Keep for BC only + virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0; + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + CUDAStream cuda_stream = CUDAStream(stream); + recordStream(ptr, cuda_stream); + } + virtual SnapshotInfo 
snapshot(MempoolId_t mempool_id = {0, 0}) = 0; + virtual void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) = 0; + virtual void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) = 0; + virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0; + virtual int getPoolUseCount( + c10::DeviceIndex /*device*/, + MempoolId_t /*mempool_id*/) { + TORCH_CHECK( + false, + name(), + " does not yet support getPoolUseCount. " + "If you need it, please file an issue describing your use case."); + } + virtual void createOrIncrefPool( + c10::DeviceIndex /*device*/, + MempoolId_t /*mempool_id*/, + CUDAAllocator* allocator = nullptr) { + TORCH_CHECK( + false, + name(), + " does not yet support createOrIncrefPool. " + "If you need it, please file an issue describing your use case."); + } + virtual void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + TORCH_CHECK( + false, + name(), + " does not yet support setUseOnOOM. " + "If you need it, please file an issue describing your use case."); + } + virtual void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) { + TORCH_CHECK( + false, + name(), + " does not yet support setNoSplit. " + "If you need it, please file an issue describing your use case."); + } + + // returns true if the allocated blocks are equal to expected live allocations + virtual bool checkPoolLiveAllocations( + c10::DeviceIndex /*device*/, + MempoolId_t /*mempool_id*/, + const std::unordered_set& /*expected_live_allocations*/) { + TORCH_CHECK( + false, + name(), + " does not yet support checkPoolLiveAllocations. " + "If you need it, please file an issue describing your use case."); + } + virtual ShareableHandle shareIpcHandle(void* ptr) = 0; + virtual std::shared_ptr getIpcDevPtr(std::string handle) = 0; + virtual bool isHistoryEnabled() { + TORCH_CHECK( + false, + name(), + " does not yet support recordHistory. 
" + "If you need it, please file an issue describing your use case."); + } + virtual void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when, + bool clearHistory) = 0; + virtual void recordAnnotation( + const std::vector>& /*md*/) {} + virtual void pushCompileContext(std::string& md) {} + virtual void popCompileContext() {} + virtual void setUserMetadata(const std::string& metadata) {} + virtual std::string getUserMetadata() { + return ""; + } + virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; + + // Attached AllocatorTraceTracker callbacks will be called while the + // per-device allocator lock is held. Any additional locks taken from within + // the callback must be proven to always have the lock order that never + // triggers a deadlock. In particular, Python's GIL may be held when + // calling the allocator so it is unsafe to try to acquire the GIL in this + // callback. + virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0; + + virtual void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) = 0; + + // memory not allocated from cudaMalloc cannot be copied + // across devices using cudaMemcpyAsync if peer to peer access is disabled. + // instead it requires cudaMemcpyAsyncPeer + // with P2P Enabled, all combinations work + // with P2P Disabled: + // cudaMalloc cudaMallocAsync/cuMemMap + // cudaMemcpyAsyncPeer works works + // cudaMemcpyAsync works error + + // This function performs chooses to use the Peer version of + // memcpy if required based on where the allocated put dst/src. 
+ virtual cudaError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + cudaStream_t stream, + bool p2p_enabled) = 0; + virtual std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) = 0; + virtual CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) = 0; + virtual std::string name() = 0; + std::pair getMemoryInfo(c10::DeviceIndex device) override { + c10::DeviceGuard device_guard({at::kCUDA, device}); + size_t free = 0; + size_t total = 0; + C10_CUDA_CHECK(cudaMemGetInfo(&free, &total)); + return {free, total}; + } +}; + +// Allocator object, statically initialized +// See BackendInitializer in CUDACachingAllocator.cpp. +// Atomic loads on x86 are just normal loads, +// (atomic stores are different), so reading this value +// is no different than loading a pointer. +C10_CUDA_API extern std::atomic allocator; + +inline CUDAAllocator* get() { + return allocator.load(); +} + +// Called directly by clients. 
+inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + get()->raw_delete(ptr); +} + +inline void init(int device_count) { + get()->init(device_count); +} + +inline double getMemoryFraction(c10::DeviceIndex device) { + return get()->getMemoryFraction(device); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + get()->setMemoryFraction(fraction, device); +} + +inline std::vector getExpandableSegmentSizes( + c10::DeviceIndex device) { + return get()->getExpandableSegmentSizes(device); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + get()->emptyCache(mempool_id); +} + +inline void enable(bool value) { + get()->enable(value); +} + +inline bool isEnabled() { + return get()->isEnabled(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) { + get()->recordStream(dataPtr, stream); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + get()->resetPeakStats(device); +} + +inline SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return 
get()->setCheckpointPoolState(device, std::move(pps)); +} + +// CUDAGraph interactions +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when, + bool clearHistory) { + get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +} + +inline void recordAnnotation( + const std::vector>& md) { + get()->recordAnnotation(md); +} + +inline void pushCompileContext(std::string& md) { + get()->pushCompileContext(md); +} + +inline void popCompileContext() { + get()->popCompileContext(); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) { + get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) { + get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->releasePool(device, mempool_id); +} +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + CUDAAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +} +inline void setNoSplit(c10::DeviceIndex 
device, MempoolId_t mempool_id) { + get()->setNoSplit(device, mempool_id); +} +inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->getPoolUseCount(device, mempool_id); +} + +// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline ShareableHandle shareIpcHandle(void* ptr) { + return get()->shareIpcHandle(ptr); +} + +inline std::string name() { + return get()->name(); +} + +inline cudaError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + cudaStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + get()->enablePeerAccess(dev, dev_to_access); +} + +inline void setUserMetadata(const std::string& metadata) { + get()->setUserMetadata(metadata); +} + +inline std::string getUserMetadata() { + return get()->getUserMetadata(); +} + +} // namespace c10::cuda::CUDACachingAllocator + +namespace c10::cuda { +// Keep BC only +using c10::CaptureId_t; +using c10::MempoolId_t; +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h new file mode 100644 index 0000000000000000000000000000000000000000..294734601cb78d68aff50da939b3452c948adb80 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::cuda { + +#ifdef TORCH_USE_CUDA_DSA +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function") +// Copy string from `src` to `dst` +static __device__ void dstrcpy(char* dst, const char* src) { + int i = 0; + // Copy string from source to destination, ensuring that it + // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1` + while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) { + *dst++ = *src++; + } + *dst = '\0'; +} + +static __device__ void dsa_add_new_assertion_failure( + DeviceAssertionsData* assertions_data, + const char* assertion_msg, + const char* filename, + const char* function_name, + const int line_number, + const uint32_t caller, + const dim3 block_id, + const dim3 thread_id) { + // `assertions_data` may be nullptr if device-side assertion checking + // is disabled at run-time. If it is disabled at compile time this + // function will never be called + if (!assertions_data) { + return; + } + + // Atomically increment so other threads can fail at the same time + // Note that incrementing this means that the CPU can observe that + // a failure has happened and can begin to respond before we've + // written information about that failure out to the buffer. 
+ const auto nid = atomicAdd(&(assertions_data->assertion_count), 1); + + if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) { + // At this point we're ran out of assertion buffer space. + // We could print a message about this, but that'd get + // spammy if a lot of threads did it, so we just silently + // ignore any other assertion failures. In most cases the + // failures will all probably be analogous anyway. + return; + } + + // Write information about the assertion failure to memory. + // Note that this occurs only after the `assertion_count` + // increment broadcasts that there's been a problem. + auto& self = assertions_data->assertions[nid]; + dstrcpy(self.assertion_msg, assertion_msg); + dstrcpy(self.filename, filename); + dstrcpy(self.function_name, function_name); + self.line_number = line_number; + self.caller = caller; + self.block_id[0] = block_id.x; + self.block_id[1] = block_id.y; + self.block_id[2] = block_id.z; + self.thread_id[0] = thread_id.x; + self.thread_id[1] = thread_id.y; + self.thread_id[2] = thread_id.z; +} +C10_CLANG_DIAGNOSTIC_POP() + +// Emulates a kernel assertion. The assertion won't stop the kernel's progress, +// so you should assume everything the kernel produces is garbage if there's an +// assertion failure. +// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are +// arguments of the kernel and therefore accessible. 
+#define CUDA_KERNEL_ASSERT2(condition) \ + do { \ + if (C10_UNLIKELY(!(condition))) { \ + /* Has an atomic element so threads can fail at the same time */ \ + c10::cuda::dsa_add_new_assertion_failure( \ + assertions_data, \ + C10_STRINGIZE(condition), \ + __FILE__, \ + __FUNCTION__, \ + __LINE__, \ + assertion_caller_id, \ + blockIdx, \ + threadIdx); \ + /* Now that the kernel has failed we early exit the kernel, but */ \ + /* otherwise keep going and rely on the host to check UVM and */ \ + /* determine we've had a problem */ \ + return; \ + } \ + } while (false) +#else +#define CUDA_KERNEL_ASSERT2(condition) assert(condition) +#endif + +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h new file mode 100644 index 0000000000000000000000000000000000000000..2d4921a100a1c73e2fd5a69284cd92435b7f70f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h @@ -0,0 +1,169 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#ifdef USE_CUDA +#define TORCH_USE_CUDA_DSA +#endif + +/// Number of assertion failure messages we can store. If this is too small +/// threads will fail silently. +constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10; +constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512; + +namespace c10::cuda { + +/// Holds information about any device-side assertions that fail. +/// Held in managed memory and access by both the CPU and the GPU. 
+struct DeviceAssertionData { + /// Stringification of the assertion + // NOLINTNEXTLINE(*-c-arrays) + char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{}; + /// File the assertion was in + // NOLINTNEXTLINE(*-c-arrays) + char filename[C10_CUDA_DSA_MAX_STR_LEN]{}; + /// Name of the function the assertion was in + // NOLINTNEXTLINE(*-c-arrays) + char function_name[C10_CUDA_DSA_MAX_STR_LEN]{}; + /// Line number the assertion was at + int line_number{}; + /// Number uniquely identifying the kernel launch that triggered the assertion + uint32_t caller{}; + /// block_id of the thread that failed the assertion + // NOLINTNEXTLINE(*-c-arrays) + int32_t block_id[3]{}; + /// third_id of the thread that failed the assertion + // NOLINTNEXTLINE(*-c-arrays) + int32_t thread_id[3]{}; +}; + +/// Used to hold assertions generated by the device +/// Held in managed memory and access by both the CPU and the GPU. +struct DeviceAssertionsData { + /// Total number of assertions found; a subset of these will be recorded + /// in `assertions` + int32_t assertion_count{}; + /// An array of assertions that will be written to in a race-free manner + // NOLINTNEXTLINE(*-c-arrays) + DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{}; +}; + +/// Use to hold info about kernel launches so that we can run kernels +/// asynchronously and still associate launches with device-side +/// assertion failures +struct CUDAKernelLaunchInfo { + /// Filename of the code where the kernel was launched from + const char* launch_filename; + /// Function from which the kernel was launched + const char* launch_function; + /// Line number of where the code was launched from + uint32_t launch_linenum; + /// Backtrace of where the kernel was launched from, only populated if + /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True + std::string launch_stacktrace; + /// Kernel that was launched + const char* kernel_name; + /// Device the kernel was launched on + int device; + /// Stream the kernel was 
launched on + int32_t stream; + /// A number that uniquely identifies the kernel launch + uint64_t generation_number; +}; + +/// Circular buffer used to hold information about kernel launches +/// this is later used to reconstruct how a device-side kernel assertion failure +/// occurred CUDAKernelLaunchRegistry is used as a singleton +class C10_CUDA_API CUDAKernelLaunchRegistry { + private: + /// Assume that this is the max number of kernel launches that might ever be + /// enqueued across all streams on a single device + static constexpr int max_kernel_launches = 1024; + /// How many kernel launch infos we've inserted. Used to ensure that circular + /// queue doesn't provide false information by always increasing, but also to + /// mark where we are inserting into the queue +#ifdef TORCH_USE_CUDA_DSA + uint64_t generation_number = 0; +#endif + /// Shared mutex between writer and accessor to ensure multi-threaded safety. + mutable std::mutex read_write_mutex; + /// Used to ensure prevent race conditions in GPU memory allocation + mutable std::mutex gpu_alloc_mutex; + /// Pointer to managed memory keeping track of device-side assertions. There + /// is one entry for each possible device the process might work with. Unused + /// entries are nullptrs. We could also use an unordered_set here, but this + /// vector design will be faster and the wasted memory is small since we + /// expect the number of GPUs per node will always be small + std::vector< + std::unique_ptr> + uvm_assertions; + /// A single circular buffer holds information about every kernel launch the + /// process makes across all devices. 
+ std::vector kernel_launches; + bool check_env_for_enable_launch_stacktracing() const; + bool check_env_for_dsa_enabled() const; + + public: + CUDAKernelLaunchRegistry(); + /// Register a new kernel launch and obtain a generation number back to be + /// passed to the kernel + uint32_t insert( + const char* launch_filename, + const char* launch_function, + const uint32_t launch_linenum, + const char* kernel_name, + const int32_t stream_id); + /// Get copies of the kernel launch registry and each device's assertion + /// failure buffer so they can be inspected without raising race conditions + std:: + pair, std::vector> + snapshot() const; + /// Get a pointer to the current device's assertion failure buffer. If no such + /// buffer exists then one is created. This means that the first kernel launch + /// made on each device will be slightly slower because memory allocations are + /// required + DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device(); + /// Gets the global singleton of the registry + static CUDAKernelLaunchRegistry& get_singleton_ref(); + /// If not all devices support DSA, we disable it + const bool do_all_devices_support_managed_memory = false; + /// Whether or not to gather stack traces when launching kernels + bool gather_launch_stacktrace = false; + /// Whether or not host-side DSA is enabled or disabled at run-time + /// Note: Device-side code cannot be enabled/disabled at run-time + bool enabled_at_runtime = false; + /// Whether or not a device has indicated a failure + bool has_failed() const; +#ifdef TORCH_USE_CUDA_DSA + const bool enabled_at_compile_time = true; +#else + const bool enabled_at_compile_time = false; +#endif +}; + +C10_CUDA_API std::string c10_retrieve_device_side_assertion_info(); + +} // namespace c10::cuda + +// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH +// requires the same input arguments. We introduce the following macro to +// standardize these. 
+#define TORCH_DSA_KERNEL_ARGS \ + [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \ + [[maybe_unused]] uint32_t assertion_caller_id + +// This macro can be used to pass the DSA arguments onward to another +// function +#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAException.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAException.h new file mode 100644 index 0000000000000000000000000000000000000000..71a5a9b86d8833ca28adad37f36061b201b2d5d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAException.h @@ -0,0 +1,102 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// Note [CHECK macro] +// ~~~~~~~~~~~~~~~~~~ +// This is a macro so that AT_ERROR can get accurate __LINE__ +// and __FILE__ information. We could split this into a short +// macro and a function implementation if we pass along __LINE__ +// and __FILE__, but no one has found this worth doing. + +// Used to denote errors from CUDA framework. +// This needs to be declared here instead util/Exception.h for proper conversion +// during hipify. 
+namespace c10 { +class C10_CUDA_API CUDAError : public c10::Error { + using Error::Error; +}; +} // namespace c10 + +#define C10_CUDA_CHECK(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + c10::cuda::c10_cuda_check_implementation( \ + static_cast(__err), \ + __FILE__, \ + __func__, /* Line number data type not well-defined between \ + compilers, so we perform an explicit cast */ \ + static_cast(__LINE__), \ + true); \ + } while (0) + +#define C10_CUDA_CHECK_WARN(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + if (C10_UNLIKELY(__err != cudaSuccess)) { \ + [[maybe_unused]] auto error_unused = cudaGetLastError(); \ + TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \ + } \ + } while (0) + +// Indicates that a CUDA error is handled in a non-standard way +#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR + +// Intentionally ignore a CUDA error +#define C10_CUDA_IGNORE_ERROR(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + if (C10_UNLIKELY(__err != cudaSuccess)) { \ + [[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \ + } \ + } while (0) + +// Clear the last CUDA error +#define C10_CUDA_CLEAR_ERROR() \ + do { \ + [[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \ + } while (0) + +// This should be used directly after every kernel launch to ensure +// the launch happened correctly and provide an early, close-to-source +// diagnostic if it didn't. +#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) + +/// Launches a CUDA kernel appending to it all the information need to handle +/// device-side assertion failures. Checks that the launch was successful. +#define TORCH_DSA_KERNEL_LAUNCH( \ + kernel, blocks, threads, shared_mem, stream, ...) 
\ + do { \ + auto& launch_registry = \ + c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref(); \ + kernel<<>>( \ + __VA_ARGS__, \ + launch_registry.get_uvm_assertions_ptr_for_current_device(), \ + launch_registry.insert( \ + __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + } while (0) + +namespace c10::cuda { + +/// In the event of a CUDA failure, formats a nice error message about that +/// failure and also checks for device-side assertion failures +C10_CUDA_API void c10_cuda_check_implementation( + const int32_t err, + const char* filename, + const char* function_name, + const uint32_t line_number, + const bool include_device_assertions); + +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..a97b3d89401a64afc834bbb3c573a4f1b2f21c22 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAFunctions.h @@ -0,0 +1,131 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This header provides C++ wrappers around commonly used CUDA API functions. +// The benefit of using C++ here is that we can raise an exception in the +// event of an error, rather than explicitly pass around error codes. This +// leads to more natural APIs. 
+// +// The naming convention used here matches the naming convention of torch.cuda + +#include +#include +#include +#include +#include +namespace c10::cuda { + +// NB: In the past, we were inconsistent about whether or not this reported +// an error if there were driver problems are not. Based on experience +// interacting with users, it seems that people basically ~never want this +// function to fail; it should just return zero if things are not working. +// Oblige them. +// It still might log a warning for user first time it's invoked +C10_CUDA_API DeviceIndex device_count() noexcept; + +// Version of device_count that throws is no devices are detected +C10_CUDA_API DeviceIndex device_count_ensure_non_zero(); + +C10_CUDA_API DeviceIndex current_device(); + +C10_CUDA_API void set_device(DeviceIndex device, const bool force = false); + +C10_CUDA_API void device_synchronize(); + +C10_CUDA_API void warn_or_error_on_sync(); + +// Raw CUDA device management functions +C10_CUDA_API cudaError_t GetDeviceCount(int* dev_count); + +C10_CUDA_API cudaError_t GetDevice(DeviceIndex* device); + +C10_CUDA_API cudaError_t +SetDevice(DeviceIndex device, const bool force = false); + +C10_CUDA_API cudaError_t MaybeSetDevice(DeviceIndex device); + +C10_CUDA_API DeviceIndex ExchangeDevice(DeviceIndex device); + +C10_CUDA_API DeviceIndex MaybeExchangeDevice(DeviceIndex device); + +C10_CUDA_API void SetTargetDevice(); + +enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; + +// this is a holder for c10 global state (similar to at GlobalContext) +// currently it's used to store cuda synchronization warning state, +// but can be expanded to hold other related global state, e.g. 
to +// record stream usage +class WarningState { + public: + void set_sync_debug_mode(SyncDebugMode l) { + sync_debug_mode = l; + } + + SyncDebugMode get_sync_debug_mode() { + return sync_debug_mode; + } + + private: + SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED; +}; + +C10_CUDA_API __inline__ WarningState& warning_state() { + static WarningState warning_state_; + return warning_state_; +} +// the subsequent functions are defined in the header because for performance +// reasons we want them to be inline +C10_CUDA_API void __inline__ memcpy_and_sync( + void* dst, + const void* src, + int64_t nbytes, + cudaMemcpyKind kind, + cudaStream_t stream) { + if (C10_UNLIKELY( + warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + c10::kCUDA, reinterpret_cast(stream)); + } +#if defined(USE_ROCM) && USE_ROCM + // As of ROCm 6.4.1, HIP runtime does not raise an error during capture of + // hipMemcpyWithStream which is a synchronous call. Thus, we add a check + // here explicitly. 
+ hipStreamCaptureStatus captureStatus; + C10_CUDA_CHECK(hipStreamGetCaptureInfo(stream, &captureStatus, nullptr)); + if (C10_LIKELY(captureStatus == hipStreamCaptureStatusNone)) { + C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream)); + } else { + C10_CUDA_CHECK(hipErrorStreamCaptureUnsupported); + } +#else + C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream)); + C10_CUDA_CHECK(cudaStreamSynchronize(stream)); +#endif +} + +C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) { + if (C10_UNLIKELY( + warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + c10::kCUDA, reinterpret_cast(stream)); + } + C10_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index); +C10_CUDA_API std::optional getDeviceIndexWithPrimaryContext(); + +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..176c9290c3906815228faf0bdb502c50260eb1e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h @@ -0,0 +1,81 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +// CUDA Graphs utils used by c10 and aten. +// aten/cuda/CUDAGraphsUtils.cuh adds utils used by aten only. 
+ +namespace c10::cuda { + +// RAII guard for "cudaStreamCaptureMode", a thread-local value +// that controls the error-checking strictness of a capture. +struct C10_CUDA_API CUDAStreamCaptureModeGuard { + CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) + : strictness_(desired) { + C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_)); + } + CUDAStreamCaptureModeGuard(const CUDAStreamCaptureModeGuard&) = delete; + CUDAStreamCaptureModeGuard(CUDAStreamCaptureModeGuard&&) = delete; + CUDAStreamCaptureModeGuard& operator=(const CUDAStreamCaptureModeGuard&) = + delete; + CUDAStreamCaptureModeGuard& operator=(CUDAStreamCaptureModeGuard&&) = delete; + ~CUDAStreamCaptureModeGuard() { + C10_CUDA_CHECK_WARN(cudaThreadExchangeStreamCaptureMode(&strictness_)); + } + + private: + cudaStreamCaptureMode strictness_; +}; + +// Protects against enum cudaStreamCaptureStatus implementation changes. +// Some compilers seem not to like static_assert without the messages. +static_assert( + int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) == 0, + "unexpected int(cudaStreamCaptureStatusNone) value"); +static_assert( + int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive) == 1, + "unexpected int(cudaStreamCaptureStatusActive) value"); +static_assert( + int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2, + "unexpected int(cudaStreamCaptureStatusInvalidated) value"); + +enum class CaptureStatus : int { + None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone), + Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive), + Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) +}; + +inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) { + switch (status) { + case CaptureStatus::None: + os << "cudaStreamCaptureStatusNone"; + break; + case CaptureStatus::Active: + os << "cudaStreamCaptureStatusActive"; + break; + case CaptureStatus::Invalidated: + os << 
"cudaStreamCaptureStatusInvalidated"; + break; + default: + TORCH_INTERNAL_ASSERT( + false, "Unknown CUDA graph CaptureStatus", int(status)); + } + return os; +} + +// Use this version where you're sure a CUDA context exists already. +inline CaptureStatus currentStreamCaptureStatusMayInitCtx() { + cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone}; + C10_CUDA_CHECK( + cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing)); + return CaptureStatus(is_capturing); +} + +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..6cf6ce4be26c07d3869fb4c7d7242fc220128fe8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGuard.h @@ -0,0 +1,311 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10::cuda { + +// This code is kind of boilerplatey. See Note [Whither the DeviceGuard +// boilerplate] + +/// A variant of DeviceGuard that is specialized for CUDA. It accepts +/// integer indices (interpreting them as CUDA devices) and is a little +/// more efficient than DeviceGuard (it compiles to straight line +/// cudaSetDevice/cudaGetDevice calls); however, it can only be used +/// from code that links against CUDA directly. +struct CUDAGuard { + /// No default constructor; see Note [Omitted default constructor from RAII] + explicit CUDAGuard() = delete; + + /// Set the current CUDA device to the passed device index. 
+ explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {} + + /// Sets the current CUDA device to the passed device. Errors if the passed + /// device is not a CUDA device. + explicit CUDAGuard(Device device) : guard_(device) {} + + // Copy is not allowed + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + // Move is not allowed (there is no uninitialized state) + CUDAGuard(CUDAGuard&& other) = delete; + CUDAGuard& operator=(CUDAGuard&& other) = delete; + ~CUDAGuard() = default; + + /// Sets the CUDA device to the given device. Errors if the given device + /// is not a CUDA device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the CUDA device to the given device. Errors if the given device + /// is not a CUDA device. (This method is provided for uniformity with + /// DeviceGuard). + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the CUDA device to the given device index. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set upon construction of the guard + Device original_device() const { + return guard_.original_device(); + } + + /// Returns the last device that was set via `set_device`, if any, otherwise + /// the device passed during construction. + Device current_device() const { + return guard_.current_device(); + } + + private: + /// The guard for the current device. + c10::impl::InlineDeviceGuard guard_; +}; + +/// A variant of OptionalDeviceGuard that is specialized for CUDA. See +/// CUDAGuard for when you can use this. +struct OptionalCUDAGuard { + /// Create an uninitialized OptionalCUDAGuard. + explicit OptionalCUDAGuard() = default; + + /// Set the current CUDA device to the passed Device, if it is not nullopt. 
+ explicit OptionalCUDAGuard(std::optional device_opt) + : guard_(device_opt) {} + + /// Set the current CUDA device to the passed device index, if it is not + /// nullopt + explicit OptionalCUDAGuard(std::optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalCUDAGuard(const OptionalCUDAGuard&) = delete; + OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete; + ~OptionalCUDAGuard() = default; + + /// Sets the CUDA device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a CUDA + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the CUDA device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a CUDA device. + /// (This method is provided for uniformity with OptionalDeviceGuard). + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the CUDA device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + std::optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. 
+ std::optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original CUDA device, resetting this guard to uninitialized + /// state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +/// A variant of StreamGuard that is specialized for CUDA. See CUDAGuard +/// for when you can use this. +struct CUDAStreamGuard { + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit CUDAStreamGuard() = delete; + + /// Set the current CUDA device to the device associated with the passed + /// stream, and set the current CUDA stream on that device to the passed + /// stream. Errors if the Stream is not a CUDA stream. + explicit CUDAStreamGuard(Stream stream) : guard_(stream) {} + ~CUDAStreamGuard() = default; + + /// Copy is disallowed + CUDAStreamGuard(const CUDAStreamGuard&) = delete; + CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete; + + /// Move is disallowed, as CUDAStreamGuard does not have an uninitialized + /// state, which is required for moves on types with nontrivial destructors. + CUDAStreamGuard(CUDAStreamGuard&& other) = delete; + CUDAStreamGuard& operator=(CUDAStreamGuard&& other) = delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Errors if the stream passed is not a CUDA stream. + /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// on CUDA, use CUDAMultiStreamGuard instead. 
+ void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the CUDA stream that was set at the time the guard was + /// constructed. + CUDAStream original_stream() const { + return CUDAStream(CUDAStream::UNCHECKED, guard_.original_stream()); + } + + /// Returns the most recent CUDA stream that was set using this device guard, + /// either from construction, or via set_stream. + CUDAStream current_stream() const { + return CUDAStream(CUDAStream::UNCHECKED, guard_.current_stream()); + } + + /// Returns the most recent CUDA device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return guard_.current_device(); + } + + /// Returns the CUDA device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. + Device original_device() const { + return guard_.original_device(); + } + + private: + c10::impl::InlineStreamGuard guard_; +}; + +/// A variant of OptionalStreamGuard that is specialized for CUDA. See +/// CUDAGuard for when you can use this. +struct OptionalCUDAStreamGuard { + /// Create an uninitialized guard. + explicit OptionalCUDAStreamGuard() = default; + + /// Set the current CUDA device to the device associated with the passed + /// stream, and set the current CUDA stream on that device to the passed + /// stream. Errors if the Stream is not a CUDA stream. + explicit OptionalCUDAStreamGuard(Stream stream) : guard_(stream) {} + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. 
+ explicit OptionalCUDAStreamGuard(std::optional stream_opt) + : guard_(stream_opt) {} + + /// Copy is disallowed + OptionalCUDAStreamGuard(const OptionalCUDAStreamGuard&) = delete; + OptionalCUDAStreamGuard& operator=(const OptionalCUDAStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalCUDAStreamGuard(OptionalCUDAStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalCUDAStreamGuard& operator=(OptionalCUDAStreamGuard&& other) = delete; + ~OptionalCUDAStreamGuard() = default; + + /// Resets the currently set CUDA stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Initializes the guard if it was not previously initialized. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the CUDA stream that was set at the time the guard was most + /// recently initialized, or nullopt if the guard is uninitialized. + std::optional original_stream() const { + auto r = guard_.original_stream(); + if (r.has_value()) { + return CUDAStream(CUDAStream::UNCHECKED, r.value()); + } else { + return std::nullopt; + } + } + + /// Returns the most recent CUDA stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + std::optional current_stream() const { + auto r = guard_.current_stream(); + if (r.has_value()) { + return CUDAStream(CUDAStream::UNCHECKED, r.value()); + } else { + return std::nullopt; + } + } + + /// Restore the original CUDA device and stream, resetting this guard to + /// uninitialized state. 
+ void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalStreamGuard guard_; +}; + +/// A variant of MultiStreamGuard that is specialized for CUDA. +struct CUDAMultiStreamGuard { + explicit CUDAMultiStreamGuard(ArrayRef streams) + : guard_(unwrapStreams(streams)) {} + + /// Copy is disallowed + CUDAMultiStreamGuard(const CUDAMultiStreamGuard&) = delete; + CUDAMultiStreamGuard& operator=(const CUDAMultiStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + CUDAMultiStreamGuard(CUDAMultiStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + CUDAMultiStreamGuard& operator=(CUDAMultiStreamGuard&& other) = delete; + ~CUDAMultiStreamGuard() = default; + + private: + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(ArrayRef cudaStreams) { + std::vector streams; + streams.reserve(cudaStreams.size()); + for (const CUDAStream& cudaStream : cudaStreams) { + streams.push_back(cudaStream); + } + return streams; + } +}; + +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMacros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMacros.h new file mode 100644 index 0000000000000000000000000000000000000000..93b371ce6ee854d074f6d47d0481c2a193e07d69 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMacros.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS + +// We have not yet modified the AMD HIP build to generate this file so +// we add an extra option to specifically ignore it. 
+#ifndef C10_CUDA_NO_CMAKE_CONFIGURE_FILE +#include +#endif // C10_CUDA_NO_CMAKE_CONFIGURE_FILE + +#endif + +// See c10/macros/Export.h for a detailed explanation of what the function +// of these macros are. We need one set of macros for every separate library +// we build. + +#ifdef _WIN32 +#if defined(C10_CUDA_BUILD_SHARED_LIBS) +#define C10_CUDA_EXPORT __declspec(dllexport) +#define C10_CUDA_IMPORT __declspec(dllimport) +#else +#define C10_CUDA_EXPORT +#define C10_CUDA_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_CUDA_EXPORT __attribute__((__visibility__("default"))) +#else // defined(__GNUC__) +#define C10_CUDA_EXPORT +#endif // defined(__GNUC__) +#define C10_CUDA_IMPORT C10_CUDA_EXPORT +#endif // _WIN32 + +// This one is being used by libc10_cuda.so +#ifdef C10_CUDA_BUILD_MAIN_LIB +#define C10_CUDA_API C10_CUDA_EXPORT +#else +#define C10_CUDA_API C10_CUDA_IMPORT +#endif + +/** + * The maximum number of GPUs that we recognizes. Increasing this beyond the + * initial limit of 16 broke Caffe2 testing, hence the ifdef guards. + * This value cannot be more than 128 because our DeviceIndex is a uint8_t. +o */ +#ifdef FBCODE_CAFFE2 +// fbcode depends on this value being 16 +#define C10_COMPILE_TIME_MAX_GPUS 16 +#else +#define C10_COMPILE_TIME_MAX_GPUS 120 +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMathCompat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMathCompat.h new file mode 100644 index 0000000000000000000000000000000000000000..ec08cde0c1b71c9a0c8dd586e4fa7f6760e230f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMathCompat.h @@ -0,0 +1,157 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +/* This file defines math functions compatible across different gpu + * platforms (currently CUDA and HIP). + */ +#if defined(__CUDACC__) || defined(__HIPCC__) + +#include +#include + +#ifdef __HIPCC__ +#define __MATH_FUNCTIONS_DECL__ inline C10_DEVICE +#else /* __HIPCC__ */ +#ifdef __CUDACC_RTC__ +#define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE +#else /* __CUDACC_RTC__ */ +#define __MATH_FUNCTIONS_DECL__ inline C10_HOST_DEVICE +#endif /* __CUDACC_RTC__ */ +#endif /* __HIPCC__ */ + +namespace c10::cuda::compat { + +__MATH_FUNCTIONS_DECL__ float abs(float x) { + return ::fabsf(x); +} +__MATH_FUNCTIONS_DECL__ double abs(double x) { + return ::fabs(x); +} + +__MATH_FUNCTIONS_DECL__ float exp(float x) { + return ::expf(x); +} +__MATH_FUNCTIONS_DECL__ double exp(double x) { + return ::exp(x); +} + +__MATH_FUNCTIONS_DECL__ float ceil(float x) { + return ::ceilf(x); +} +__MATH_FUNCTIONS_DECL__ double ceil(double x) { + return ::ceil(x); +} + +__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + return ::copysignf(x, y); +#else + // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64 + // (e.g. 
Jetson), see PyTorch PR #51834 + // This host function needs to be here for the compiler but is never used + TORCH_INTERNAL_ASSERT( + false, "CUDAMathCompat copysign should not run on the CPU"); +#endif +} +__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + return ::copysign(x, y); +#else + // see above + TORCH_INTERNAL_ASSERT( + false, "CUDAMathCompat copysign should not run on the CPU"); +#endif +} + +__MATH_FUNCTIONS_DECL__ float floor(float x) { + return ::floorf(x); +} +__MATH_FUNCTIONS_DECL__ double floor(double x) { + return ::floor(x); +} + +__MATH_FUNCTIONS_DECL__ float log(float x) { + return ::logf(x); +} +__MATH_FUNCTIONS_DECL__ double log(double x) { + return ::log(x); +} + +__MATH_FUNCTIONS_DECL__ float log1p(float x) { + return ::log1pf(x); +} + +__MATH_FUNCTIONS_DECL__ double log1p(double x) { + return ::log1p(x); +} + +__MATH_FUNCTIONS_DECL__ float max(float x, float y) { + return ::fmaxf(x, y); +} +__MATH_FUNCTIONS_DECL__ double max(double x, double y) { + return ::fmax(x, y); +} + +__MATH_FUNCTIONS_DECL__ float min(float x, float y) { + return ::fminf(x, y); +} +__MATH_FUNCTIONS_DECL__ double min(double x, double y) { + return ::fmin(x, y); +} + +__MATH_FUNCTIONS_DECL__ float pow(float x, float y) { + return ::powf(x, y); +} +__MATH_FUNCTIONS_DECL__ double pow(double x, double y) { + return ::pow(x, y); +} + +__MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) { + return ::sincosf(x, sptr, cptr); +} +__MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) { + return ::sincos(x, sptr, cptr); +} + +__MATH_FUNCTIONS_DECL__ float sqrt(float x) { + return ::sqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double sqrt(double x) { + return ::sqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float rsqrt(float x) { + return ::rsqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double rsqrt(double x) { + return ::rsqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float tan(float x) { + return ::tanf(x); 
+} +__MATH_FUNCTIONS_DECL__ double tan(double x) { + return ::tan(x); +} + +__MATH_FUNCTIONS_DECL__ float tanh(float x) { + return ::tanhf(x); +} +__MATH_FUNCTIONS_DECL__ double tanh(double x) { + return ::tanh(x); +} + +__MATH_FUNCTIONS_DECL__ float normcdf(float x) { + return ::normcdff(x); +} +__MATH_FUNCTIONS_DECL__ double normcdf(double x) { + return ::normcdf(x); +} + +} // namespace c10::cuda::compat + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..c44105fa61281b2d06f02524b789d7c7554374f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// this file is to avoid circular dependency between CUDAFunctions.h and +// CUDAExceptions.h + +#include +#include + +#include +#include + +namespace c10::cuda { +C10_CUDA_API std::string get_cuda_error_help(cudaError_t /*error*/) noexcept; +C10_CUDA_API const char* get_cuda_check_suffix() noexcept; +C10_CUDA_API std::mutex* getFreeMutex(); +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAStream.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAStream.h new file mode 100644 index 0000000000000000000000000000000000000000..c0e616f584c5a41e40e75586c4e3d3ae8b381feb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAStream.h @@ -0,0 +1,273 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include + +/* + * Stream pool note. + * + * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams + * are backed by cuStreams, but they use several pools to minimize the costs + * associated with creating, retaining, and destroying cuStreams. + * + * There are three pools per device, and a device's pools are lazily created. + * + * The first pool contains only the default stream. When the default stream + * is requested it's returned. + * + * The second pool is the "low priority" or "default priority" streams. In + * HIP builds there is no distinction between streams in this pool and streams + * in the third pool (below). There are 32 of these streams per device, and + * when a stream is requested one of these streams is returned round-robin. + * That is, the first stream requested is at index 0, the second at index 1... + * to index 31, then index 0 again. + * + * This means that if 33 low priority streams are requested, the first and + * last streams requested are actually the same stream (under the covers) + * and kernels enqueued on them cannot run concurrently. + * + * The third pool is the "high priority" streams. The third pool acts like + * the second pool except the streams are created with a higher priority. 
+ * + * These pools suggest that stream users should prefer many short-lived streams, + * as the cost of acquiring and releasing streams is effectively zero. If + * many longer-lived streams are required in performance critical scenarios + * then the functionality here may need to be extended to allow, for example, + * "reserving" a subset of the pool so that other streams do not accidentally + * overlap the performance critical streams. + * + * Note: although the notion of "current stream for device" is thread local + * (every OS thread has a separate current stream, as one might expect), + * the stream pool is global across all threads; stream 0 is always stream 0 + * no matter which thread you use it on. Multiple threads can synchronize + * on the same stream. Although the CUDA documentation is not very clear + * on the matter, streams are thread safe; e.g., it is safe to enqueue + * a kernel on the same stream from two different threads. + */ + +namespace c10::cuda { + +static constexpr int max_compile_time_stream_priorities = 4; + +// Value object representing a CUDA stream. This is just a wrapper +// around c10::Stream, but it comes with a little extra CUDA-specific +// functionality (conversion to cudaStream_t), and a guarantee that +// the wrapped c10::Stream really is a CUDA stream. +class C10_CUDA_API CUDAStream { + public: + enum Unchecked { UNCHECKED }; + + /// Construct a CUDAStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a CUDA stream. + explicit CUDAStream(Stream stream) : stream_(stream) { + TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); + } + + /// Construct a CUDAStream from a Stream with no error checking. 
+ /// This constructor uses the "named" constructor idiom, and can + /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream) + explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {} + + bool operator==(const CUDAStream& other) const noexcept { + return unwrap() == other.unwrap(); + } + + bool operator!=(const CUDAStream& other) const noexcept { + return unwrap() != other.unwrap(); + } + + /// Implicit conversion to cudaStream_t. + operator cudaStream_t() const { + return stream(); + } + + /// Implicit conversion to Stream (a.k.a., forget that the stream is a + /// CUDA stream). + operator Stream() const { + return unwrap(); + } + + /// Used to avoid baking in device type explicitly to Python-side API. + DeviceType device_type() const { + return DeviceType::CUDA; + } + + /// Get the CUDA device index that this stream is associated with. + DeviceIndex device_index() const { + return stream_.device_index(); + } + + /// Get the full Device that this stream is associated with. The Device + /// is guaranteed to be a CUDA device. + Device device() const { + return Device(DeviceType::CUDA, device_index()); + } + + /// Return the stream ID corresponding to this particular stream. + StreamId id() const { + return stream_.id(); + } + + bool query() const { + DeviceGuard guard{stream_.device()}; + cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream())); + + if (err == cudaSuccess) { + return true; + } else if (err != cudaErrorNotReady) { + C10_CUDA_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)cudaGetLastError(); + } + + return false; + } + + void synchronize() const { + DeviceGuard guard{stream_.device()}; + c10::cuda::stream_synchronize(stream()); + } + + int priority() const { + DeviceGuard guard{stream_.device()}; + int priority = 0; + C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority)); + return priority; + } + + /// Explicit conversion to cudaStream_t. 
+ cudaStream_t stream() const; + + /// Explicit conversion to Stream. + Stream unwrap() const { + return stream_; + } + + /// Reversibly pack a CUDAStream into a struct representation. + /// Previously the stream's data was packed into a single int64_t, + /// as it was assumed the fields would not require more than + /// 64 bits of storage in total. + /// See https://github.com/pytorch/pytorch/issues/75854 + /// for more information regarding newer platforms that may violate + /// this assumption. + /// + /// The CUDAStream can be unpacked using unpack(). + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + // Unpack a CUDAStream from the 3 fields generated by pack(). + static CUDAStream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return CUDAStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { + // Note: this returns the range of priority **supported by PyTorch**, not + // the range of priority **supported by CUDA**. The former is a subset of + // the latter. + int least_priority = 0, greatest_priority = 0; + C10_CUDA_CHECK( + cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); +#ifdef USE_ROCM + // See Note [HIP stream priorities] + TORCH_INTERNAL_ASSERT( + least_priority == 1, "Unexpected HIP stream priority range"); + least_priority = 0; +#else + TORCH_INTERNAL_ASSERT( + least_priority == 0, "Unexpected CUDA stream priority range"); +#endif + TORCH_INTERNAL_ASSERT( + greatest_priority <= -1, "Unexpected CUDA stream priority range"); + greatest_priority = std::max( + -c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority); + return std::make_tuple(least_priority, greatest_priority); + } + + // Deleted for now; use CUDAEvent::block instead + // void synchronize_with(const CUDAEvent& event) const; + + private: + Stream stream_; +}; + +/** + * Get a new stream from the CUDA stream pool. 
You can think of this + * as "creating" a new stream, but no such creation actually happens; + * instead, streams are preallocated from the pool and returned in a + * round-robin fashion. + * + * You can request a stream from the high priority pool by setting + * isHighPriority to true, or a stream for a specific device by setting device + * (defaulting to the current CUDA stream.) + */ +C10_API CUDAStream +getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); +// no default priority to disambiguate overloads +C10_API CUDAStream +getStreamFromPool(const int priority, DeviceIndex device = -1); + +/** + * Get a CUDAStream from a externally allocated one. + * + * This is mainly for interoperability with different libraries where we + * want to operate on a non-torch allocated stream for data exchange or similar + * purposes + */ +C10_API CUDAStream +getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index); + +/** + * Get the default CUDA stream, for the passed CUDA device, or for the + * current device if no device index is passed. The default stream is + * where most computation occurs when you aren't explicitly using + * streams. + */ +C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1); + +/** + * Get the current CUDA stream, for the passed CUDA device, or for the + * current device if no device index is passed. The current CUDA stream + * will usually be the default CUDA stream for the device, but it may + * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard' + * or 'CUDAStreamGuard'. + */ +C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1); + +/** + * Set the current stream on the device of the passed in stream to be + * the passed in stream. Yes, you read that right: this function + * has *nothing* to do with the current device: it toggles the current + * stream of the device of the passed stream. + * + * Confused? 
Avoid using this function; prefer using 'CUDAStreamGuard' instead + * (which will switch both your current device and current stream in the way you + * expect, and reset it back to its original state afterwards). + */ +C10_API void setCurrentCUDAStream(CUDAStream stream); + +C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s); + +} // namespace c10::cuda + +namespace std { +template <> +struct hash { + size_t operator()(c10::cuda::CUDAStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/driver_api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/driver_api.h new file mode 100644 index 0000000000000000000000000000000000000000..49a5a131d4888f5f8f422bc07b74065db9315397 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/driver_api.h @@ -0,0 +1,124 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#define NVML_NO_UNVERSIONED_FUNC_DEFS +#include + +#include + +#define C10_CUDA_DRIVER_CHECK(EXPR) \ + do { \ + CUresult __err = EXPR; \ + if (__err != CUDA_SUCCESS) { \ + const char* err_str; \ + CUresult get_error_str_err [[maybe_unused]] = \ + c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \ + if (get_error_str_err != CUDA_SUCCESS) { \ + TORCH_CHECK(false, "CUDA driver error: unknown error"); \ + } else { \ + TORCH_CHECK(false, "CUDA driver error: ", err_str); \ + } \ + } \ + } while (0) + +#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \ + do { \ + CUresult __err = EXPR; \ + if (__err != CUDA_SUCCESS) { \ + const char* err_str; \ + CUresult get_error_str_err [[maybe_unused]] = \ + 
c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \ + if (get_error_str_err != CUDA_SUCCESS) { \ + TORCH_WARN("CUDA driver error: unknown error"); \ + } else { \ + TORCH_WARN("CUDA driver error: ", err_str); \ + } \ + goto NEXT; \ + } \ + } while (0) + +// The integer in the second column specifies the requested CUDA Driver API +// version. The dynamic loader will accept a driver with a newer version, but it +// ensures that the requested symbol exists in *at least* the specified version +// or earlier. + +// Keep these requested versions as low as possible to maximize compatibility +// across different driver versions. + +// Why do we pin to an older version instead of using the latest? +// If a user installs a newer driver, blindly resolving the symbol may bind to a +// newer version of the function with different behavior, potentially breaking +// PyTorch. + +#define C10_LIBCUDA_DRIVER_API_REQUIRED(_) \ + _(cuDeviceGetAttribute, 12000) \ + _(cuMemAddressReserve, 12000) \ + _(cuMemRelease, 12000) \ + _(cuMemMap, 12000) \ + _(cuMemAddressFree, 12000) \ + _(cuMemSetAccess, 12000) \ + _(cuMemUnmap, 12000) \ + _(cuMemCreate, 12000) \ + _(cuMemGetAllocationGranularity, 12000) \ + _(cuMemExportToShareableHandle, 12000) \ + _(cuMemImportFromShareableHandle, 12000) \ + _(cuMemsetD32Async, 12000) \ + _(cuStreamWriteValue32, 12000) \ + _(cuGetErrorString, 12000) + +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030) +#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \ + _(cuCtxFromGreenCtx, 12080) \ + _(cuCtxGetCurrent, 12080) \ + _(cuCtxPopCurrent, 12080) \ + _(cuCtxPushCurrent, 12080) \ + _(cuCtxSetCurrent, 12080) \ + _(cuGreenCtxCreate, 12080) \ + _(cuGreenCtxDestroy, 12080) \ + _(cuDevSmResourceSplitByCount, 12080) \ + _(cuDeviceGet, 12080) \ + _(cuDeviceGetDevResource, 12080) \ + _(cuDevResourceGenerateDesc, 12080) \ + _(cuMulticastAddDevice, 12030) \ + _(cuMulticastBindMem, 12030) \ + _(cuMulticastCreate, 12030) \ + _(cuMulticastUnbind, 12030) +#else 
+#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) +#endif + +#define C10_NVML_DRIVER_API(_) \ + _(nvmlInit_v2) \ + _(nvmlDeviceGetHandleByPciBusId_v2) \ + _(nvmlDeviceGetNvLinkRemoteDeviceType) \ + _(nvmlDeviceGetNvLinkRemotePciInfo_v2) \ + _(nvmlDeviceGetComputeRunningProcesses) \ + _(nvmlSystemGetCudaDriverVersion_v2) + +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12040) +#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV) +#else +#define C10_NVML_DRIVER_API_OPTIONAL(_) +#endif + +namespace c10::cuda { + +struct DriverAPI { +#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_; +#define CREATE_MEMBER(name) decltype(&name) name##_; + C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED) + C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED) + C10_NVML_DRIVER_API(CREATE_MEMBER) + C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER) +#undef CREATE_MEMBER_VERSIONED +#undef CREATE_MEMBER + + static DriverAPI* get(); + static void* get_nvml_handle(); +}; + +} // namespace c10::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..24cb643a0599072f52eb1188bf53fc236368e957 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h @@ -0,0 +1,270 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace c10::cuda::impl { + +struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::CUDA; + + CUDAGuardImpl() = default; + explicit CUDAGuardImpl(DeviceType t) { + TORCH_CHECK( + t == DeviceType::CUDA, + "CUDAGuardImpl initialized with non-CUDA DeviceType: ", + t); + } + DeviceType type() const override { + return DeviceType::CUDA; + } + Device exchangeDevice(Device d) const override { + TORCH_CHECK(d.is_cuda(), "Expected a CUDA device, but got ", d); + auto old_device_index = c10::cuda::ExchangeDevice(d.index()); + return Device(DeviceType::CUDA, old_device_index); + } + Device getDevice() const override { + DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + return Device(DeviceType::CUDA, device); + } + std::optional uncheckedGetDevice() const noexcept { + DeviceIndex device{-1}; + const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device)); + C10_CUDA_CHECK_WARN(err); + if (err != cudaSuccess) { + return std::nullopt; + } + return Device(DeviceType::CUDA, device); + } + void setDevice(Device d) const override { + TORCH_CHECK(d.is_cuda(), "Expected a CUDA device, but got ", d); + 
C10_CUDA_CHECK(c10::cuda::SetDevice(d.index())); + } + void uncheckedSetDevice(Device d) const noexcept override { + C10_CUDA_CHECK_WARN(c10::cuda::MaybeSetDevice(d.index())); + } + Stream getStream(Device d) const override { + return getCurrentCUDAStream(d.index()).unwrap(); + } + Stream getDefaultStream(Device d) const override { + return getDefaultCUDAStream(d.index()); + } + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPool(priority, d.index()); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) + const override { + return getStreamFromPool(isHighPriority, d.index()); + } + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const override { + CUDAStream cs(s); + auto old_stream = getCurrentCUDAStream(s.device().index()); + setCurrentCUDAStream(cs); + return old_stream.unwrap(); + } + DeviceIndex deviceCount() const noexcept override { + return device_count(); + } + + // Event-related functions + void createEvent(cudaEvent_t* cuda_event, const EventFlag flag) const { + // Maps PyTorch's Event::Flag to CUDA flag + auto cuda_flag = cudaEventDefault; + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: + cuda_flag = cudaEventDisableTiming; + break; + case EventFlag::BACKEND_DEFAULT: + cuda_flag = cudaEventDefault; + break; + default: + TORCH_CHECK(false, "CUDA event received unknown flag"); + } + + C10_CUDA_CHECK(cudaEventCreateWithFlags(cuda_event, cuda_flag)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_creation( + c10::kCUDA, reinterpret_cast(cuda_event)); + } + } + + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override { + if (!event) + return; + auto cuda_event = static_cast(event); + DeviceIndex orig_device{-1}; + C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device)); + C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index)); + const 
c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion( + c10::kCUDA, reinterpret_cast(cuda_event)); + } + C10_CUDA_CHECK_WARN(cudaEventDestroy(cuda_event)); + C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(orig_device)); + } + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + TORCH_CHECK( + device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + cudaEvent_t cuda_event = static_cast(*event); + CUDAStream cuda_stream{stream}; + + // Moves to stream's device to record + const auto orig_device = getDevice(); + setDevice(stream.device()); + + // Creates the event (lazily) + if (!cuda_event) + createEvent(&cuda_event, flag); + C10_CUDA_CHECK(cudaEventRecord(cuda_event, cuda_stream)); + // Makes the void* point to the (possibly just allocated) CUDA event + *event = cuda_event; + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record( + c10::kCUDA, + reinterpret_cast(cuda_event), + reinterpret_cast(cuda_stream.stream())); + } + + // Resets device + setDevice(orig_device); + } + + void block(void* event, const Stream& stream) const override { + if (!event) + return; + cudaEvent_t cuda_event = static_cast(event); + CUDAStream cuda_stream{stream}; + const auto orig_device = getDevice(); + setDevice(stream.device()); + C10_CUDA_CHECK(cudaStreamWaitEvent( + cuda_stream, + cuda_event, + /*flags (must be zero)=*/0)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait( + c10::kCUDA, + reinterpret_cast(cuda_event), + reinterpret_cast(cuda_stream.stream())); + } + setDevice(orig_device); + } + + // May be called from 
any device + bool queryEvent(void* event) const override { + if (!event) + return true; + cudaEvent_t cuda_event = static_cast(event); + // Note: cudaEventQuery can be safely called from any device + const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event)); + if (err != cudaErrorNotReady) { + C10_CUDA_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)cudaGetLastError(); + } + return (err == cudaSuccess); + } + + // Stream-related functions + bool queryStream(const Stream& stream) const override { + CUDAStream cuda_stream{stream}; + return cuda_stream.query(); + } + + void synchronizeStream(const Stream& stream) const override { + CUDAStream cuda_stream{stream}; + cuda_stream.synchronize(); + } + + void synchronizeEvent(void* event) const override { + if (!event) + return; + cudaEvent_t cuda_event = static_cast(event); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization( + c10::kCUDA, reinterpret_cast(cuda_event)); + } + // Note: cudaEventSynchronize can be safely called from any device + C10_CUDA_CHECK(cudaEventSynchronize(cuda_event)); + } + + // Note: synchronizeDevice can be safely called from any device + void synchronizeDevice(const c10::DeviceIndex device_index) const override { + DeviceIndex orig_device{-1}; + C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device)); + C10_CUDA_CHECK(c10::cuda::SetDevice(device_index)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_device_synchronization(c10::kCUDA); + } + C10_CUDA_CHECK(cudaDeviceSynchronize()); + C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device)); + } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) + const override { + CUDAStream cuda_stream{stream}; + CUDACachingAllocator::recordStream(data_ptr, cuda_stream); + } + + double elapsedTime(void* event1, 
void* event2, const DeviceIndex device_index) + const override { + TORCH_CHECK( + event1 && event2, + "Both events must be recorded before calculating elapsed time."); + // Even though cudaEventElapsedTime can be safely called from any device, if + // the current device is not initialized, it will create a new cuda context, + // which will consume a lot of memory. + DeviceIndex orig_device{-1}; + C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device)); + C10_CUDA_CHECK(c10::cuda::SetDevice(device_index)); + cudaEvent_t cuda_event1 = static_cast(event1); + cudaEvent_t cuda_event2 = static_cast(event2); + float time_ms = 0; + // raise cudaErrorNotReady if either event is recorded but not yet completed + C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, cuda_event1, cuda_event2)); + C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device)); + return static_cast(time_ms); + } +}; + +} // namespace c10::cuda::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDATest.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDATest.h new file mode 100644 index 0000000000000000000000000000000000000000..3edcfe6d88a72a94120bf95d82a6bbc0a0798500 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDATest.h @@ -0,0 +1,14 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::cuda::impl { + +C10_CUDA_API int c10_cuda_test(); + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/cuda_cmake_macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/cuda_cmake_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..a2fb43f54676972b1df12b2be146786465a1b403 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/cuda_cmake_macros.h @@ -0,0 +1,11 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// Automatically generated header file for the C10 CUDA library. Do not +// include this file directly. Instead, include c10/cuda/CUDAMacros.h + +#define C10_CUDA_BUILD_SHARED_LIBS + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Export.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Export.h new file mode 100644 index 0000000000000000000000000000000000000000..dfc4378c482c621ce05179900c719510e59ee8d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Export.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Macros.h new file mode 100644 index 0000000000000000000000000000000000000000..02fdbd4df99eaed11dfdc5dc190378156ea30177 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Macros.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/cmake_macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/cmake_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..5d89f61f37a9db44fc7bbe5df20ce372e37dff4c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/cmake_macros.h @@ -0,0 +1,10 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// This file exists for backwards compatibility and has been moved to +// torch/headeronly/macros/cmake_macros.h.in. No end user library should be +// including this file directly anyway (cuz they should be including +// Macros.h instead). +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/atomic.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/atomic.h new file mode 100644 index 0000000000000000000000000000000000000000..4bec87d32d3efa5badc79d2b85d2cb018fe9c9a1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/atomic.h @@ -0,0 +1,182 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +namespace c10 { +namespace metal { + +// Atomic operations helper +template +struct AtomicType {}; +template +using AtomicType_t = typename AtomicType::type; + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, float value) { + ::metal::atomic_fetch_add_explicit( + data + offset, value, ::metal::memory_order_relaxed); + } +}; + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, int value) { + ::metal::atomic_fetch_add_explicit( + data + offset, value, ::metal::memory_order_relaxed); + } +}; + +// As of Metal3.2 atomic operations are not supported on half-precision floats, +// so they must be simulated Using atomic compare and exchange over 32-bit +// atomic type +template +static inline void atomic_add_helper( + device ::metal::atomic* data, + long offset, + T value) { + constexpr auto elem_per_enum = sizeof(uint) / sizeof(T); + auto ptr = data + (offset / elem_per_enum); + auto old = ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed); + union { + uint i; + T t[elem_per_enum]; + } val; + do { + val.i = old; + val.t[offset & (elem_per_enum - 1)] += value; + } while (!::metal::atomic_compare_exchange_weak_explicit( + ptr, + &old, + val.i, + ::metal::memory_order_relaxed, + ::metal::memory_order_relaxed)); 
+} + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, half value) { + atomic_add_helper(data, offset, value); + } +}; + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, short value) { + atomic_add_helper(data, offset, value); + } +}; + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, char value) { + atomic_add_helper(data, offset, value); + } +}; + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, char value) { + atomic_add_helper(data, offset, value); + } +}; + +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, bfloat value) { + atomic_add_helper(data, offset, value); + } +}; + +// Metal supports atomic_store_explicit for bools, but +// sizeof(::metal::atomic_bool) is 4 Therefore it could not be used to +// atomically modify unaligned memory, so fall back to compare and exchange +// trick As accumulation over booleans are just or operation, do nothing if +// value is false +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, bool value) { + if (!value) { + return; + } + auto ptr = data + (offset >> 2); + auto old = + ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed); + union { + uint i; + bool t[4]; + } val; + do { + val.i = old; + val.t[offset & 3] = true; + } while (!::metal::atomic_compare_exchange_weak_explicit( + ptr, + &old, + val.i, + ::metal::memory_order_relaxed, + ::metal::memory_order_relaxed)); + } +}; + +// ComplexHalf atomic op +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, half2 
value) { + auto ptr = data + offset; + auto old = + ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed); + while (!::metal::atomic_compare_exchange_weak_explicit( + ptr, + &old, + as_type(as_type(old) + value), + ::metal::memory_order_relaxed, + ::metal::memory_order_relaxed)) + ; + } +}; + +// There are no atomic 64-bit add in Metal yet, but templates below implements a +// consistent add I.e. if multiple threads are modify the same 64-bit value, +// results stored at the address will eventually be equal to its original value +// plus sum of all operands +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, long value) { + const auto value_bits = as_type(value); + const uint low = static_cast(value_bits); + uint high = static_cast(value_bits >> 32); + auto ptr = data + (offset << 1); + auto old_low = + atomic_fetch_add_explicit(ptr, low, ::metal::memory_order_relaxed); + high += (old_low + low < old_low) ? 1 : 0; + atomic_fetch_add_explicit(ptr + 1, high, ::metal::memory_order_relaxed); + } +}; + +// ComplexFloat atomic op, which again is not really atomic, but eventually +// consistent +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, float2 value) { + auto ptr = data + (offset << 1); + atomic_fetch_add_explicit(ptr + 0, value.x, ::metal::memory_order_relaxed); + atomic_fetch_add_explicit(ptr + 1, value.y, ::metal::memory_order_relaxed); + } +}; + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/common.h new file mode 100644 index 0000000000000000000000000000000000000000..c508bbd55afa7077644bc5ff722ccbc46056e99c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/common.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// Set of global constants that could be shareable between CPU and Metal code + +#ifdef __METAL__ +#include +#define C10_METAL_CONSTEXPR constant constexpr +#else +#include +#define C10_METAL_CONSTEXPR constexpr +#endif + +#define C10_METAL_ALL_TYPES_FUNCTOR(_) \ + _(Byte, 0) \ + _(Char, 1) \ + _(Short, 2) \ + _(Int, 3) \ + _(Long, 4) \ + _(Half, 5) \ + _(Float, 6) \ + _(ComplexHalf, 8) \ + _(ComplexFloat, 9) \ + _(Bool, 11) \ + _(BFloat16, 15) + +namespace c10 { +namespace metal { +C10_METAL_CONSTEXPR unsigned max_ndim = 16; +C10_METAL_CONSTEXPR unsigned simdgroup_size = 32; + +#ifdef __METAL__ +template +using array = ::metal::array; +#else +template +using array = std::array; +#endif + +enum class ScalarType { +#define _DEFINE_ENUM_VAL_(_v, _n) _v = _n, + C10_METAL_ALL_TYPES_FUNCTOR(_DEFINE_ENUM_VAL_) +#undef _DEFINE_ENUM_VAL_ +}; + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/error.h new file mode 100644 index 0000000000000000000000000000000000000000..25786e69bb6d9c37d69ce603aed53c8cb04a4a10 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/error.h @@ -0,0 +1,116 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace c10 { +namespace metal { +C10_METAL_CONSTEXPR unsigned error_message_count = 30; +struct ErrorMessage { + char file[128]; + char func[128]; + char message[250]; + unsigned int line; +}; + +struct ErrorMessages { +#ifdef __METAL__ + ::metal::atomic count; +#else + unsigned int count; +#endif + ErrorMessage msg[error_message_count]; +}; + +#ifdef __METAL__ +namespace detail { +static uint strncpy(device char* dst, constant const char* src, unsigned len) { + uint i = 0; + while (src[i] != 0 && i < len - 1) { + dst[i] = src[i]; + i++; + } + dst[i] = 0; + return i; +} + +inline uint print_arg( + device char* ptr, + unsigned len, + constant const char* arg) { + return strncpy(ptr, arg, len); +} + +// Returns number length as string in base10 +static inline uint base10_length(long num) { + uint rc = 1; + if (num < 0) { + num = -num; + rc += 1; + } + while (num > 9) { + num /= 10; + rc++; + } + return rc; +} + +// Converts signed integer to string +inline uint print_arg(device char* ptr, unsigned len, long arg) { + const auto arg_len = base10_length(arg); + if (arg_len >= len) + return 0; + if (arg < 0) { + ptr[0] = '-'; + arg = -arg; + } + uint idx = 1; + do { + ptr[arg_len - idx] = '0' + (arg % 10); + arg /= 10; + idx++; + } while (arg > 0); + ptr[arg_len] = 0; + return arg_len; +} + +template +inline void print_args(device char* ptr, unsigned len, T arg) { + 
print_arg(ptr, len, arg); +} + +template +inline void print_args(device char* ptr, unsigned len, T arg, Args... args) { + const auto rc = print_arg(ptr, len, arg); + print_args(ptr + rc, len - rc, args...); +} + +} // namespace detail + +template +static void report_error( + device ErrorMessages* msgs, + constant const char* file, + int line, + constant const char* func, + Args... args) { + const auto idx = + atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed); + if (idx >= error_message_count) { + return; + } + device auto* msg = &msgs->msg[idx]; + detail::strncpy(msg->file, file, 128); + detail::strncpy(msg->func, func, 128); + detail::print_args(msg->message, 250, args...); + msg->line = line; +} + +#define TORCH_REPORT_ERROR(buf, ...) \ + ::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__) +#endif +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/expm1f.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/expm1f.h new file mode 100644 index 0000000000000000000000000000000000000000..18061b711232ddc8053f6672b23814fee5023926 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/expm1f.h @@ -0,0 +1,102 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copy-and-pasted from: +// https://github.com/ml-explore/mlx/blob/99c33d011d63174f50cea37c3eede002958be6d3/mlx/backend/metal/kernels/expm1f.h + +#pragma once + +#include + +// Original license copied below: +// Copyright (c) 2015-2023 Norbert Juffa +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace c10 { +namespace metal { + +/* Compute exponential base e minus 1. Maximum ulp error = 0.997458 + + i = rint(a/log(2)), f = a-i*log(2). Then expm1(a) = 2**i * (expm1(f)+1) - 1. + Compute r = expm1(f). Then expm1(a)= 2 * (0.5 * 2**i * r + 0.5 * 2**i - 0.5). + With t = 0.5*2**i, expm1(a) = 2*(r * t + t-0.5). However, for best accuracy, + when i == 1, expm1(a)= 2*(r + 0.5), and when i == 0, expm1(a) = r. 
+ + NOTE: Scale factor b is only applied if i < 0 or i > 1 (should be power of 2) +*/ +inline float expm1f_scaled_unchecked(float a, float b) { + float f, j, r, s, t, u, v, x, y; + int i; + + // exp(a) = 2**i * exp(f); i = rintf (a / log(2)) + j = ::metal::fma(1.442695f, a, 12582912.f); // 0x1.715476p0, 0x1.8p23 + j = j - 12582912.0f; // 0x1.8p23 + i = (int)j; + f = ::metal::fma(j, -6.93145752e-1f, a); + + // approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2] + s = f * f; + if (a == 0.0f) + s = a; // ensure -0 is passed through + // err = 0.997458 ulp1 = 11081805 + r = 1.97350979e-4f; // 0x1.9de000p-13 + r = ::metal::fma(r, f, 1.39309070e-3f); // 0x1.6d30bcp-10 + r = ::metal::fma(r, f, 8.33343994e-3f); // 0x1.1111f6p-7 + r = ::metal::fma(r, f, 4.16668020e-2f); // 0x1.55559ep-5 + r = ::metal::fma(r, f, 1.66666716e-1f); // 0x1.55555cp-3 + r = ::metal::fma(r, f, 4.99999970e-1f); // 0x1.fffffep-2 + u = (j == 1) ? (f + 0.5f) : f; + v = ::metal::fma(r, s, u); + s = 0.5f * b; + t = ::metal::ldexp(s, i); + y = t - s; + x = (t - y) - s; // double-float canonicalization of difference + r = ::metal::fma(v, t, x) + y; + r = r + r; + if (j == 0) + r = v; + if (j == 1) + r = v + v; + return r; +} + +/* Compute exponential base e minus 1. max ulp err = 0.99746 */ +inline float expm1f(float a) { + float r; + + r = expm1f_scaled_unchecked(a, 1.0f); + /* handle severe overflow and underflow */ + if (::metal::abs(a - 1.0f) > 88.0f) { + r = ::metal::pow(2, a); + r = ::metal::fma(r, r, -1.0f); + } + return r; +} + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/igamma.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/igamma.h new file mode 100644 index 0000000000000000000000000000000000000000..4fb235e226ad27e7bb94b76a02172df86ce4c17f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/igamma.h @@ -0,0 +1,749 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +using namespace c10::metal; +using namespace metal; + +namespace c10 { +namespace metal { + +template +inline float log_gamma(const T); + +inline float expm1f(float a); + +template +float erfc(T x); + +} // namespace metal +} // namespace c10 + +namespace { + +template +inline float lgamma(const T a) { + return log_gamma(a); +} + +inline float expm1(float a) { + return expm1f(a); +} + +// NOTE: The following code was ported directly from the CUDA implementation in +// `aten/src/ATen/native/cuda/IGammaKernel.cu` + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +// regularized lower & upper incomplete gamma +template +scalar_t ratevl( + scalar_t x, + const scalar_t num[], + int64_t M, + const scalar_t denom[], + int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + using accscalar_t = opmath_t; + int64_t i, dir; + accscalar_t y, num_ans, denom_ans; + accscalar_t absx = ::fabs(x); + thread const accscalar_t* p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. 
*/ + dir = -1; + p = num + M; + y = 1 / x; + } else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return ::pow(x, static_cast(i)) * num_ans / denom_ans; + } else { + return num_ans / denom_ans; + } +} + +template +scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + using accscalar_t = opmath_t; + + const accscalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859}; + const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0}; + return ratevl( + static_cast(x), + lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / + sizeof(lanczos_sum_expg_scaled_num[0]) - + 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / + sizeof(lanczos_sum_expg_scaled_denom[0]) - + 1); +} + +template +scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // 
exp(a - x). + + using accscalar_t = opmath_t; + accscalar_t ax, fac, res, num, numfac; + const accscalar_t MAXLOG = 88.72283905206835; + const accscalar_t EXP1 = 2.718281828459045; + const accscalar_t lanczos_g = 6.024680040776729583740234375; + + if (::fabs(a - x) > 0.4 * ::fabs(a)) { + ax = a * ::log(x) - x - ::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return ::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= ::exp(a - x) * ::pow(x / fac, a); + } else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + + using accscalar_t = opmath_t; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const int MAXITER = 2000; + + int i; + accscalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. 
+ + using accscalar_t = opmath_t; + int n; + accscalar_t fac = 1; + accscalar_t sum = 0; + accscalar_t term, logx; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (::fabs(term) <= MACHEP * ::fabs(sum)) { + break; + } + } + + logx = ::log(x); + term = -::expm1(a * logx - ::lgamma(1 + a)); + return term - ::exp(a * logx - ::lgamma(a)) * sum; +} + +template +scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + + using accscalar_t = opmath_t; + const accscalar_t d[25][25] = { + {-3.3333333333333333e-1, 8.3333333333333333e-2, + -1.4814814814814815e-2, 1.1574074074074074e-3, + 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, + -1.85406221071516e-6, 8.296711340953086e-7, + -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, + 9.1476995822367902e-10, -2.551419399494625e-11, + -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, + 3.3717632624009854e-13, -1.3923887224181621e-13, + 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 
+ 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, + -2.9907248030319018e-4, -1.4638452578843418e-6, + 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, + -1.6954149536558306e-6, 8.9075075322053097e-7, + -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, + 3.4463580499464897e-9, -2.3024517174528067e-13, + -3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, + 4.6792750266579195e-12, -2.1492464706134829e-12, + 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, 
-2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, + -3.3493161081142236e-4, 2.812695154763237e-4, + -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, + 5.7876949497350524e-6, 4.9387589339362704e-10, + -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, + 2.695423606288966e-8, -1.4578352908731271e-8, + 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, + -6.9957960920705679e-11, 2.5899863874868481e-17, + 8.8566890996696381e-12, -4.403168815871311e-12, + 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 
6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, + 6.7823088376673284e-4, -6.4014752602627585e-4, + 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, + -2.1073920183404862e-5, -8.8585890141255994e-10, + 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, + -1.5344695190702061e-7, 8.862466778790695e-8, + -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, + 5.7370135528051385e-10, -1.887749850169741e-19, + -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, 
-5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, + 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 
2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 
1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, + -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, 
-2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, 
sgn; + int maxpow = 0; + const accscalar_t MACHEP = 5.9604644775390625E-8; + accscalar_t lambda = x / a; + accscalar_t sigma = (x - a) / a; + accscalar_t eta, res, ck, ckterm, term, absterm; + accscalar_t absoldterm = INFINITY; + accscalar_t etapow[25] = {1}; + accscalar_t sum = 0; + accscalar_t afac = 1; + + if (igam) { + sgn = -1; + } else { + sgn = 1; + } + + if (lambda > 1) { + eta = ::sqrt(-2 * (::log1p(sigma) - sigma)); + } else if (lambda < 1) { + eta = -::sqrt(-2 * (::log1p(sigma) - sigma)); + } else { + eta = 0; + } + res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n - 1]; + maxpow += 1; + } + ckterm = d[k][n] * etapow[n]; + ck += ckterm; + if (::fabs(ckterm) < MACHEP * ::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = ::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * ::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a); + + return res; +} + +template +scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. 
[igam1] + + using accscalar_t = opmath_t; + int i; + accscalar_t ans, ax, c, yc, r, t, y, z; + accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const accscalar_t BIG = 16777216.; + const accscalar_t BIGINV = 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = ::fabs((ans - r) / r); + ans = r; + } else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the subtraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 4.5; + + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 0.0; + } else { + return NAN; + } + } else if (x == 0) { + return 1.0; + } else if (isinf(a)) 
{ + if (isinf(x)) { + return NAN; + } + return 1.0; + } else if (isinf(x)) { + return 0.0; + } + + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_continued_fraction(a, x); + } + } else if (x <= 0.5) { + if (-0.4 / ::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); + } + } else { + if (x * 1.1 < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); + } + } +} + +template +scalar_t calc_igamma(scalar_t a, scalar_t x) { + /* the calculation of the regularized lower incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.3 [igam1]) + * - if x > 1 and x > a, using the subtraction from the regularized upper + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (4) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 4.5; + + // boundary values following SciPy + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 1.0; + } else { + return NAN; + } + } else if (x == 0) { + return 0.0; // zero integration limit + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 0.0; + } else if (isinf(x)) { + return 1.0; + } + + /* Asymptotic regime where a ~ x. 
*/ + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 1); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 1); + } + + if ((x > 1.0) && (x > a)) { + return 1.0 - calc_igammac(a, x); + } + + return _igam_helper_series(a, x); +} + +} // namespace + +// end of regularized lower & upper incomplete gamma + +namespace c10 { +namespace metal { + +template +inline T igamma(T a, T b) { + return calc_igamma(a, b); +} + +template +inline T igammac(T a, T b) { + return calc_igammac(a, b); +} + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/indexing.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/indexing.h new file mode 100644 index 0000000000000000000000000000000000000000..3a35aa1b87a2aa9a80cfaafd3d0cf0cf3076a215 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/indexing.h @@ -0,0 +1,1050 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Metal indexing primitives +#pragma once +#include +#include +#include + +namespace c10 { +namespace metal { + +// Given coordinates and strides, calculates offset from the start of the +// tensors +template +inline T offset_from_coord( + thread T idx[max_ndim], + constant long* strides, + uint ndim) { + T rc = 0; + for (uint i = 0; i < ndim; ++i) { + rc += idx[i] * T(strides[i]); + } + return rc; +} + +// Given thread index calculates position in the ndim tensor +template +inline void pos_from_thread_index( + T idx, + thread T pos[max_ndim], + constant long* sizes, + uint ndim) { + for (uint i = 0; i 
< ndim; ++i) { + pos[i] = idx % T(sizes[i]); + idx /= T(sizes[i]); + } +} + +inline long offset_from_thread_index( + long idx, + constant long* sizes, + constant long* strides, + uint ndim) { + long pos[max_ndim]; + pos_from_thread_index(idx, pos, sizes, ndim); + return offset_from_coord(pos, strides, ndim); +} + +template +kernel void unary_dense( + device result_of* output [[buffer(0)]], + constant T* input [[buffer(1)]], + uint index [[thread_position_in_grid]]) { + F f; + output[index] = f(input[index]); +} + +template +kernel void unary_strided( + device result_of* output [[buffer(0)]], + constant T* input [[buffer(1)]], + constant long* sizes [[buffer(2)]], + constant long* input_strides [[buffer(3)]], + constant long* output_strides [[buffer(4)]], + constant uint& ndim [[buffer(5)]], + uint index [[thread_position_in_grid]]) { + F f; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim); + const auto input_offs = offset_from_coord(pos, input_strides, ndim); + const auto output_offs = offset_from_coord(pos, output_strides, ndim); + output[output_offs] = f(input[input_offs]); +} + +#define REGISTER_UNARY_OP(NAME, DTYPE0, DTYPE1) \ + static_assert( \ + ::metal:: \ + is_same_v>, \ + "Output dtype mismatch for unary op " #NAME " and input " #DTYPE0); \ + template [[host_name(#NAME "_dense_" #DTYPE1 "_" #DTYPE0)]] kernel void :: \ + c10::metal::unary_dense( \ + device ::c10::metal::result_of * output, \ + constant DTYPE0 * input, \ + uint index); \ + template [[host_name(#NAME "_strided_" #DTYPE1 "_" #DTYPE0)]] kernel void :: \ + c10::metal::unary_strided( \ + device ::c10::metal::result_of * output, \ + constant DTYPE0 * input, \ + constant long* sizes, \ + constant long* input_strides, \ + constant long* output_strides, \ + constant uint& ndim, \ + uint index) + +#define DEFINE_UNARY_FLOATING_FUNCTOR(NAME) \ + struct NAME##_functor { \ + template \ + inline ::metal::enable_if_t<::metal::is_floating_point_v, T> operator()( \ + const T x) { \ 
+ return T(NAME(x)); \ + } \ + template \ + inline ::metal::enable_if_t<::metal::is_integral_v, float> operator()( \ + const T x) { \ + return NAME(static_cast(x)); \ + } \ + } + +template +kernel void unary_alpha_dense( + device result_of* output [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T2& alpha [[buffer(2)]], + uint index [[thread_position_in_grid]]) { + F f; + output[index] = f(input[index], alpha); +} + +template +kernel void unary_alpha_strided( + device result_of* output [[buffer(0)]], + constant T* input [[buffer(1)]], + constant long* sizes [[buffer(2)]], + constant long* input_strides [[buffer(3)]], + constant long* output_strides [[buffer(4)]], + constant uint& ndim [[buffer(5)]], + constant T2& alpha [[buffer(6)]], + uint index [[thread_position_in_grid]]) { + F f; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim); + const auto input_offs = offset_from_coord(pos, input_strides, ndim); + const auto output_offs = offset_from_coord(pos, output_strides, ndim); + output[output_offs] = f(input[input_offs], alpha); +} + +#define REGISTER_UNARY_ALPHA_OP(NAME, DTYPEI, DTYPEA, DTYPEO) \ + static_assert( \ + ::metal::is_same_v< \ + DTYPEO, \ + ::c10::metal::result_of>, \ + "Output dtype mismatch for unary op " #NAME " and input " #DTYPEI); \ + template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + unary_alpha_dense( \ + device ::c10::metal::result_of * \ + output, \ + constant DTYPEI * input, \ + constant DTYPEA & alpha, \ + uint index); \ + template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + unary_alpha_strided( \ + device ::c10::metal::result_of * \ + output, \ + constant DTYPEI * input, \ + constant long* sizes, \ + constant long* input_strides, \ + constant long* output_strides, \ + constant uint& ndim, \ + constant DTYPEA& alpha, \ + uint index) + +template +inline T val_at_offs(constant void* ptr, long 
offs) { + return *reinterpret_cast( + static_cast(ptr) + offs); +} + +// Value at offset with dynamic cast from provided type +template +inline T val_at_offs(device void* ptr, long offs) { + return *reinterpret_cast(static_cast(ptr) + offs); +} + +template +inline T val_at_offs(P ptr, long offs, ScalarType type) { + switch (type) { + case ScalarType::Bool: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::Byte: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::Char: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::Short: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::Int: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::Long: + return cast_to(val_at_offs(ptr, offs)); + // Floats + case ScalarType::Float: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::Half: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::BFloat16: + return cast_to(val_at_offs(ptr, offs)); + // Complex + case ScalarType::ComplexHalf: + return cast_to(val_at_offs(ptr, offs)); + case ScalarType::ComplexFloat: + return cast_to(val_at_offs(ptr, offs)); + } +} + +template +inline device T& ref_at_offs(device void* ptr, long offs) { + return *reinterpret_cast(static_cast(ptr) + offs); +} + +// Binary elementwise ops kernels +// Right now there are 4 flavors available: +// - binary_dense where both input, other and output are dense and share the +// same type +// - binary_strided when all inputs are of the same types, but some elements are +// strided +// - binary_dense_cast - inputs are dense, but of different dtypes +// - binary_strided_cast - inputs or output are strided and of different dtypes +// - binary_dense_broadcast - one input is dense, another one is broadcastable +// Note about accuracy (for more info see +// https://github.com/pytorch/pytorch/issues/152736) Sometimes when kernel is +// invoked to produce `half` output, but one of the arguments is float arguments +// should be upcast to float, rather than 
downcast to half At the moment this is +// expressed with `om_t` optional argument (which stands for opmath_type) which +// is identical to output type but could be something else + +template +kernel void binary_strided( + device void* output [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other [[buffer(2)]], + constant long* sizes [[buffer(3)]], + constant long* output_strides [[buffer(4)]], + constant long* input_strides [[buffer(5)]], + constant long* other_strides [[buffer(6)]], + constant uint3& ndim [[buffer(7)]], + uint index [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim.x); + const auto input_offs = offset_from_coord(pos, input_strides, ndim.x); + const auto other_offs = offset_from_coord(pos, other_strides, ndim.x); + const auto output_offs = offset_from_coord(pos, output_strides, ndim.x); + const auto a = val_at_offs(input, input_offs); + const auto b = val_at_offs(other, other_offs); + ref_at_offs(output, output_offs) = + static_cast(f(om_t(a), om_t(b))); +} + +template +kernel void binary_alpha_strided( + device void* output [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + constant long* sizes [[buffer(4)]], + constant long* output_strides [[buffer(5)]], + constant long* input_strides [[buffer(6)]], + constant long* other_strides [[buffer(7)]], + constant uint3& ndim [[buffer(8)]], + uint index [[thread_position_in_grid]]) { + F f; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim.x); + const auto input_offs = offset_from_coord(pos, input_strides, ndim.x); + const auto other_offs = offset_from_coord(pos, other_strides, ndim.x); + const auto output_offs = offset_from_coord(pos, output_strides, ndim.x); + const auto a = val_at_offs(input, input_offs); + const auto b = val_at_offs(other, other_offs); + ref_at_offs>(output, output_offs) = 
f(a, b, alpha); +} + +template > +kernel void binary_strided_cast( + device void* output [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other [[buffer(2)]], + constant long* sizes [[buffer(3)]], + constant long* output_strides [[buffer(4)]], + constant long* input_strides [[buffer(5)]], + constant long* other_strides [[buffer(6)]], + constant uint4& ndim_types [[buffer(7)]], + uint index [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim_types.x); + const auto input_offs = offset_from_coord(pos, input_strides, ndim_types.x); + const auto other_offs = offset_from_coord(pos, other_strides, ndim_types.x); + const auto output_offs = offset_from_coord(pos, output_strides, ndim_types.x); + const auto a = val_at_offs( + input, input_offs, static_cast(ndim_types.y)); + const auto b = val_at_offs( + other, other_offs, static_cast(ndim_types.z)); + ref_at_offs(output, output_offs) = static_cast(f(a, b)); +} + +template +kernel void binary_alpha_strided_cast( + device void* output [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + constant long* sizes [[buffer(4)]], + constant long* output_strides [[buffer(5)]], + constant long* input_strides [[buffer(6)]], + constant long* other_strides [[buffer(7)]], + constant uint4& ndim_types [[buffer(8)]], + uint index [[thread_position_in_grid]]) { + F f; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim_types.x); + const auto input_offs = offset_from_coord(pos, input_strides, ndim_types.x); + const auto other_offs = offset_from_coord(pos, other_strides, ndim_types.x); + const auto output_offs = offset_from_coord(pos, output_strides, ndim_types.x); + const auto a = + val_at_offs(input, input_offs, static_cast(ndim_types.y)); + const auto b = + val_at_offs(other, other_offs, static_cast(ndim_types.z)); + ref_at_offs>(output, 
output_offs) = f(a, b, alpha); +} + +template > +kernel void binary_dense( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* other [[buffer(2)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + out[tid] = static_cast(f(om_t(input[tid]), om_t(other[tid]))); +} + +template +kernel void binary_alpha_dense( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* other [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + out[tid] = f(input[tid], other[tid], alpha); +} + +template +kernel void binary_dense_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other [[buffer(2)]], + constant uint4& sizes_types [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + const auto a = val_at_offs( + input, tid * sizes_types.x, static_cast(sizes_types.z)); + const auto b = val_at_offs( + other, tid * sizes_types.y, static_cast(sizes_types.w)); + out[tid] = static_cast(f(a, b)); +} + +template +kernel void binary_alpha_dense_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + constant uint4& sizes_types [[buffer(4)]], + uint tid [[thread_position_in_grid]]) { + F f; + const auto a = val_at_offs( + input, tid * sizes_types.x, static_cast(sizes_types.z)); + const auto b = val_at_offs( + other, tid * sizes_types.y, static_cast(sizes_types.w)); + out[tid] = f(a, b, alpha); +} + +template > +kernel void binary_dense_broadcast( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* broadcast [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + out[tid] = static_cast( + f(om_t(input[tid]), om_t(broadcast[tid % broadcast_numel]))); +} 
+ +template > +kernel void binary_dense_broadcast_rhs( + device result_of* out [[buffer(0)]], + constant T* broadcast [[buffer(1)]], + constant T* input [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + out[tid] = static_cast( + f(om_t(broadcast[tid % broadcast_numel]), om_t(input[tid]))); +} + +template +kernel void binary_alpha_dense_broadcast( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* broadcast [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + constant T2& alpha [[buffer(4)]], + uint tid [[thread_position_in_grid]]) { + F f; + out[tid] = f(input[tid], broadcast[tid % broadcast_numel], alpha); +} + +template +kernel void binary_alpha_dense_broadcast_rhs( + device result_of* out [[buffer(0)]], + constant T* broadcast [[buffer(1)]], + constant T* input [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + constant T2& alpha [[buffer(4)]], + uint tid [[thread_position_in_grid]]) { + F f; + out[tid] = f(broadcast[tid % broadcast_numel], input[tid], alpha); +} + +template +kernel void binary_dense_broadcast_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* broadcast [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + constant uint4& sizes_types [[buffer(4)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + const auto a = val_at_offs( + input, tid * sizes_types.x, static_cast(sizes_types.z)); + const auto b = val_at_offs( + broadcast, + (tid % broadcast_numel) * sizes_types.y, + static_cast(sizes_types.w)); + out[tid] = static_cast(f(a, b)); +} + +template +kernel void binary_dense_broadcast_rhs_cast( + device result_of* out [[buffer(0)]], + constant void* broadcast [[buffer(1)]], + constant void* input [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + constant uint4& sizes_types [[buffer(4)]], + uint 
tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + const auto a = val_at_offs( + broadcast, + (tid % broadcast_numel) * sizes_types.x, + static_cast(sizes_types.z)); + const auto b = val_at_offs( + input, tid * sizes_types.y, static_cast(sizes_types.w)); + out[tid] = static_cast(f(a, b)); +} + +template +kernel void binary_alpha_dense_broadcast_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* broadcast [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + constant T2& alpha [[buffer(4)]], + constant uint4& sizes_types [[buffer(5)]], + uint tid [[thread_position_in_grid]]) { + F f; + const auto a = val_at_offs( + input, tid * sizes_types.x, static_cast(sizes_types.z)); + const auto b = val_at_offs( + broadcast, + (tid % broadcast_numel) * sizes_types.y, + static_cast(sizes_types.w)); + out[tid] = f(a, b, alpha); +} + +template +kernel void binary_alpha_dense_broadcast_rhs_cast( + device result_of* out [[buffer(0)]], + constant void* broadcast [[buffer(1)]], + constant void* input [[buffer(2)]], + constant long& broadcast_numel [[buffer(3)]], + constant T2& alpha [[buffer(4)]], + constant uint4& sizes_types [[buffer(5)]], + uint tid [[thread_position_in_grid]]) { + F f; + const auto a = val_at_offs( + broadcast, + (tid % broadcast_numel) * sizes_types.x, + static_cast(sizes_types.z)); + const auto b = val_at_offs( + input, tid * sizes_types.y, static_cast(sizes_types.w)); + out[tid] = f(a, b, alpha); +} + +template > +kernel void binary_dense_scalar( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + device T* scalar [[buffer(2)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + out[tid] = static_cast(f(om_t(input[tid]), om_t(scalar[0]))); +} + +template > +kernel void binary_dense_scalar_lhs( + device result_of* out [[buffer(0)]], + device T* scalar [[buffer(1)]], + constant T* input [[buffer(2)]], + uint tid 
[[thread_position_in_grid]]) { + F f; + using res_t = result_of; + out[tid] = static_cast(f(om_t(scalar[0]), om_t(input[tid]))); +} + +template +kernel void binary_dense_scalar_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + device void* scalar [[buffer(2)]], + constant uint4& sizes_types [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + const auto a = val_at_offs( + input, tid * sizes_types.x, static_cast(sizes_types.z)); + const auto b = + val_at_offs(scalar, 0, static_cast(sizes_types.w)); + out[tid] = static_cast(f(a, b)); +} + +template +kernel void binary_dense_scalar_lhs_cast( + device result_of* out [[buffer(0)]], + device void* scalar [[buffer(1)]], + constant void* input [[buffer(2)]], + constant uint4& sizes_types [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + const auto a = + val_at_offs(scalar, 0, static_cast(sizes_types.z)); + const auto b = val_at_offs( + input, tid * sizes_types.y, static_cast(sizes_types.w)); + out[tid] = static_cast(f(a, b)); +} + +template +kernel void binary_alpha_dense_scalar( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + device T* scalar [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + out[tid] = f(input[tid], scalar[0], alpha); +} + +template +kernel void binary_alpha_dense_scalar_lhs( + device result_of* out [[buffer(0)]], + device T* scalar [[buffer(1)]], + constant T* input [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + out[tid] = f(scalar[0], input[tid], alpha); +} + +template +kernel void binary_alpha_dense_scalar_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + device void* scalar [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + constant uint4& sizes_types [[buffer(4)]], + uint tid [[thread_position_in_grid]]) { + F f; + 
const auto a = val_at_offs( + input, tid * sizes_types.x, static_cast(sizes_types.z)); + const auto b = + val_at_offs(scalar, 0, static_cast(sizes_types.w)); + out[tid] = f(a, b, alpha); +} + +template +kernel void binary_alpha_dense_scalar_lhs_cast( + device result_of* out [[buffer(0)]], + device void* scalar [[buffer(1)]], + constant void* input [[buffer(2)]], + constant T2& alpha [[buffer(3)]], + constant uint4& sizes_types [[buffer(4)]], + uint tid [[thread_position_in_grid]]) { + F f; + const auto a = + val_at_offs(scalar, 0, static_cast(sizes_types.z)); + const auto b = val_at_offs( + input, tid * sizes_types.y, static_cast(sizes_types.w)); + out[tid] = f(a, b, alpha); +} + +#define REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, OMT) \ + static_assert( \ + ::metal::is_same_v< \ + DTYPEO, \ + ::c10::metal::result_of>, \ + "Output dtype mismatch for binary op " #NAME " and input " #DTYPEI); \ + template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI)]] kernel void :: \ + c10::metal::binary_strided( \ + device void* out, \ + constant void* input, \ + constant void* other, \ + constant long* sizes, \ + constant long* output_strides, \ + constant long* input_strides, \ + constant long* other_strides, \ + constant uint3& ndim, \ + uint tid); \ + template [[host_name(#NAME "_strided_cast_" #DTYPEI)]] kernel void ::c10:: \ + metal::binary_strided_cast( \ + device void* out, \ + constant void* input, \ + constant void* other, \ + constant long* sizes, \ + constant long* output_strides, \ + constant long* input_strides, \ + constant long* other_strides, \ + constant uint4& ndim_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI)]] kernel void :: \ + c10::metal::binary_dense( \ + device ::c10::metal::result_of * \ + out_, \ + constant DTYPEI * input_, \ + constant DTYPEI * other_, \ + uint tid); \ + template [[host_name(#NAME "_dense_cast_" #DTYPEI)]] kernel void ::c10:: \ + metal::binary_dense_cast( \ + device ::c10::metal::result_of * \ + 
out_, \ + constant void* input, \ + constant void* other, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_" #DTYPEO "_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_broadcast( \ + device ::c10::metal::result_of * \ + out_, \ + constant DTYPEI * input_, \ + constant DTYPEI * broadcast_, \ + constant long& broadcast_numel, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_rhs_" #DTYPEO "_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_broadcast_rhs( \ + device ::c10::metal::result_of * \ + out_, \ + constant DTYPEI * broadcast_, \ + constant DTYPEI * input_, \ + constant long& broadcast_numel, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_cast_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_broadcast_cast( \ + device ::c10::metal::result_of * \ + out_, \ + constant void* input_, \ + constant void* broadcast_, \ + constant long& broadcast_numel, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_rhs_cast_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_broadcast_rhs_cast( \ + device ::c10::metal::result_of * \ + out_, \ + constant void* broadcast_, \ + constant void* input_, \ + constant long& broadcast_numel, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_" #DTYPEO "_" #DTYPEI)]] \ + kernel void ::c10::metal::binary_dense_scalar( \ + device ::c10::metal::result_of * out_, \ + constant DTYPEI * input_, \ + device DTYPEI * scalar_, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_lhs_" #DTYPEO "_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_scalar_lhs( \ + device ::c10::metal::result_of * \ + out_, \ + device DTYPEI * scalar_, \ + constant DTYPEI * input_, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_cast_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_scalar_cast( \ + device 
::c10::metal::result_of * \ + out_, \ + constant void* input_, \ + device void* scalar_, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_lhs_cast_" #DTYPEI)]] \ + kernel void ::c10::metal:: \ + binary_dense_scalar_lhs_cast( \ + device ::c10::metal::result_of * \ + out_, \ + device void* scalar_, \ + constant void* input_, \ + constant uint4& sizes_types, \ + uint tid) + +// OpMath Binary Op promotes inputs to higher precision type before Functor call +#define REGISTER_OPMATH_BINARY_OP(NAME, DTYPEI, DTYPEO) \ + REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, ::c10::metal::opmath_t) + +#define REGISTER_BINARY_OP(NAME, DTYPEI, DTYPEO) \ + REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, DTYPEI) + +#define REGISTER_BINARY_ALPHA_OP(NAME, DTYPEI, DTYPEA, DTYPEO) \ + static_assert( \ + ::metal::is_same_v< \ + DTYPEO, \ + ::c10::metal::result_of>, \ + "Output dtype mismatch for binary op " #NAME " and input " #DTYPEI); \ + template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_strided( \ + device void* out, \ + constant void* input, \ + constant void* other, \ + constant DTYPEA& alpha, \ + constant long* sizes, \ + constant long* output_strides, \ + constant long* input_strides, \ + constant long* other_strides, \ + constant uint3& ndim, \ + uint tid); \ + template [[host_name(#NAME "_strided_cast_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_strided_cast( \ + device void* out, \ + constant void* input, \ + constant void* other, \ + constant DTYPEA& alpha, \ + constant long* sizes, \ + constant long* output_strides, \ + constant long* input_strides, \ + constant long* other_strides, \ + constant uint4& ndim_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant DTYPEI * input_, \ + 
constant DTYPEI * other_, \ + constant DTYPEA & alpha, \ + uint tid); \ + template \ + [[host_name(#NAME "_dense_cast_" #DTYPEI "_" #DTYPEA)]] kernel void :: \ + c10::metal::binary_alpha_dense_cast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant void* input, \ + constant void* other, \ + constant DTYPEA& alpha, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_broadcast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant DTYPEI * input_, \ + constant DTYPEI * broadcast_, \ + constant long& broadcast_numel, \ + constant DTYPEA& alpha, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_rhs_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_broadcast_rhs( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant DTYPEI * broadcast_, \ + constant DTYPEI * input_, \ + constant long& broadcast_numel, \ + constant DTYPEA& alpha, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_cast_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_broadcast_cast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant void* input_, \ + constant void* broadcast_, \ + constant long& broadcast_numel, \ + constant DTYPEA& alpha, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_broadcast_rhs_cast_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_broadcast_rhs_cast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant void* broadcast_, \ + constant void* input_, \ + constant long& broadcast_numel, \ + constant DTYPEA& alpha, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_scalar( \ + device ::c10::metal:: 
\ + result_of * \ + out_, \ + constant DTYPEI * input_, \ + device DTYPEI * scalar_, \ + constant DTYPEA & alpha, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_lhs_" #DTYPEO "_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_scalar_lhs( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + device DTYPEI * scalar_, \ + constant DTYPEI * input_, \ + constant DTYPEA & alpha, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_cast_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_scalar_cast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant void* input_, \ + device void* scalar_, \ + constant DTYPEA& alpha, \ + constant uint4& sizes_types, \ + uint tid); \ + template [[host_name(#NAME "_dense_scalar_lhs_cast_" #DTYPEI \ + "_" #DTYPEA)]] kernel void ::c10::metal:: \ + binary_alpha_dense_scalar_lhs_cast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + device void* scalar_, \ + constant void* input_, \ + constant DTYPEA& alpha, \ + constant uint4& sizes_types, \ + uint tid) + +// Ternary elementwise ops kernels +// Right now there are 4 flavors available: +// - ternary_dense where both input, other1, other2, and output are dense and +// share the same type +// - ternary_strided when all inputs are of the same types, but some elements +// are strided +// - ternary_dense_cast - inputs are dense, but of different dtypes +// - ternary_strided_cast - inputs or output are strided and of different dtypes +// Note about accuracy (for more info see +// https://github.com/pytorch/pytorch/issues/152736) Sometimes when kernel is +// invoked to produce `half` output, but one of the arguments is float arguments +// should be upcast to float, rather than downcast to half At the moment this is +// expressed with `om_t` optional argument (which stands for opmath_type) which +// is identical to output type but could be something else + +template +kernel void ternary_strided( + 
device void* output [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other1 [[buffer(2)]], + constant void* other2 [[buffer(3)]], + constant long* sizes [[buffer(4)]], + constant long* output_strides [[buffer(5)]], + constant long* input_strides [[buffer(6)]], + constant long* other1_strides [[buffer(7)]], + constant long* other2_strides [[buffer(8)]], + constant uint& ndim [[buffer(9)]], + constant uint4& types [[buffer(10)]], + uint index [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim); + const auto input_offs = offset_from_coord(pos, input_strides, ndim); + const auto other1_offs = offset_from_coord(pos, other1_strides, ndim); + const auto other2_offs = offset_from_coord(pos, other2_strides, ndim); + const auto output_offs = offset_from_coord(pos, output_strides, ndim); + const auto a = val_at_offs(input, input_offs); + const auto b = val_at_offs(other1, other1_offs); + const auto c = val_at_offs(other2, other2_offs); + ref_at_offs(output, output_offs) = + static_cast(f(om_t(a), om_t(b), om_t(c))); +} + +template > +kernel void ternary_strided_cast( + device void* output [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other1 [[buffer(2)]], + constant void* other2 [[buffer(3)]], + constant long* sizes [[buffer(4)]], + constant long* output_strides [[buffer(5)]], + constant long* input_strides [[buffer(6)]], + constant long* other1_strides [[buffer(7)]], + constant long* other2_strides [[buffer(8)]], + constant uint& ndim [[buffer(9)]], + constant uint4& types [[buffer(10)]], + uint index [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + int pos[max_ndim]; + pos_from_thread_index(int(index), pos, sizes, ndim); + const auto input_offs = offset_from_coord(pos, input_strides, ndim); + const auto other1_offs = offset_from_coord(pos, other1_strides, ndim); + const auto other2_offs = offset_from_coord(pos, 
other2_strides, ndim); + const auto output_offs = offset_from_coord(pos, output_strides, ndim); + const auto a = + val_at_offs(input, input_offs, static_cast(types.x)); + const auto b = + val_at_offs(other1, other1_offs, static_cast(types.y)); + const auto c = + val_at_offs(other2, other2_offs, static_cast(types.z)); + ref_at_offs(output, output_offs) = static_cast(f(a, b, c)); +} + +template > +kernel void ternary_dense( + device result_of* out [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* other1 [[buffer(2)]], + constant T* other2 [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + out[tid] = static_cast( + f(om_t(input[tid]), om_t(other1[tid]), om_t(other2[tid]))); +} + +template +kernel void ternary_dense_cast( + device result_of* out [[buffer(0)]], + constant void* input [[buffer(1)]], + constant void* other1 [[buffer(2)]], + constant void* other2 [[buffer(3)]], + constant uint3& sizes [[buffer(4)]], + constant uint3& types [[buffer(5)]], + uint tid [[thread_position_in_grid]]) { + F f; + using res_t = result_of; + const auto a = + val_at_offs(input, tid * sizes.x, static_cast(types.x)); + const auto b = val_at_offs( + other1, tid * sizes.y, static_cast(types.y)); + const auto c = val_at_offs( + other2, tid * sizes.z, static_cast(types.z)); + out[tid] = static_cast(f(a, b, c)); +} + +#define REGISTER_TERNARY_OP_(NAME, DTYPEI, DTYPEO, OMT) \ + static_assert( \ + ::metal::is_same_v< \ + DTYPEO, \ + ::c10::metal::result_of>, \ + "Output dtype mismatch for ternary op " #NAME " and input " #DTYPEI); \ + template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI)]] kernel void :: \ + c10::metal::ternary_strided( \ + device void* out, \ + constant void* input, \ + constant void* other1, \ + constant void* other2, \ + constant long* sizes, \ + constant long* output_strides, \ + constant long* input_strides, \ + constant long* other1_strides, \ + constant long* other2_strides, \ + constant uint& ndim, \ + 
constant uint4& types, \ + uint tid); \ + template [[host_name(#NAME "_strided_cast_" #DTYPEI)]] kernel void ::c10:: \ + metal::ternary_strided_cast( \ + device void* out, \ + constant void* input, \ + constant void* other1, \ + constant void* other2, \ + constant long* sizes, \ + constant long* output_strides, \ + constant long* input_strides, \ + constant long* other1_strides, \ + constant long* other2_strides, \ + constant uint& ndim, \ + constant uint4& types, \ + uint tid); \ + template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI)]] kernel void :: \ + c10::metal::ternary_dense( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant DTYPEI * input_, \ + constant DTYPEI * other1_, \ + constant DTYPEI * other2_, \ + uint tid); \ + template [[host_name(#NAME "_dense_cast_" #DTYPEI)]] kernel void ::c10:: \ + metal::ternary_dense_cast( \ + device ::c10::metal:: \ + result_of * \ + out_, \ + constant void* input, \ + constant void* other1, \ + constant void* other2, \ + constant uint3& sizes, \ + constant uint3& types, \ + uint tid) + +// OpMath ternary Op promotes inputs to higher precision type before Functor +// call +#define REGISTER_OPMATH_TERNARY_OP(NAME, DTYPEI, DTYPEO) \ + REGISTER_TERNARY_OP_(NAME, DTYPEI, DTYPEO, ::c10::metal::opmath_t) + +#define REGISTER_TERNARY_OP(NAME, DTYPEI, DTYPEO) \ + REGISTER_TERNARY_OP_(NAME, DTYPEI, DTYPEO, DTYPEI) + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/random.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/random.h new file mode 100644 index 0000000000000000000000000000000000000000..711e446d667decbbf3e2cfc7fc5a0da5d81d3123 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/random.h @@ -0,0 +1,83 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Philox Counter based RNG implementation for Metal +// Borrowed from aten/src/ATen/core/PhiloxRNGEngine.h +// Which in turn borrowed from +// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf +#pragma once +#include + +namespace c10 { +namespace metal { + +namespace detail { + +constexpr float uint32_to_uniform_float(uint32_t value) { + // maximum value such that `MAX_INT * scale < 1.0` (with float rounding) + constexpr float scale = 4.6566127342e-10; + return static_cast(value & 0x7FFFFFFF) * scale; +} + +inline uint2 splitlong(ulong v) { + return uint2(v >> 32, v & 0xffffffff); +} + +} // namespace detail + +namespace philox4 { + +uint2 mulhilo(uint a, uint b) { + auto rc = static_cast(a) * b; + return detail::splitlong(rc); +} +uint4 single_round(uint4 ctr, uint2 key) { + constexpr uint kPhiloxSA = 0xD2511F53; + constexpr uint kPhiloxSB = 0xCD9E8D57; + auto rc0 = mulhilo(kPhiloxSA, ctr.x); + auto rc1 = mulhilo(kPhiloxSB, ctr.z); + return uint4(rc1.y ^ ctr.y ^ key.x, rc1.x, rc0.y ^ ctr.w ^ key.y, rc0.x); +} + +uint4 multiple_rounds(uint4 ctr, uint2 key, uint rounds) { + constexpr uint2 kPhilox10 = {0x9E3779B9, 0xBB67AE85}; + for (uint round = 0; round < rounds - 1; ++round) { + ctr = single_round(ctr, key); + key += kPhilox10; + } + return ctr; +} + +uint4 rand(long seed, long index) { + uint4 ctr = 0; + ctr.zw = detail::splitlong(index); + return multiple_rounds(ctr, 
detail::splitlong(seed), 10); +} + +} // namespace philox4 + +float randn(long seed, long index) { + auto value = philox4::rand(seed, index); + float u1 = 1.0 - detail::uint32_to_uniform_float(value.x); + float u2 = 1.0 - detail::uint32_to_uniform_float(value.y); + return ::metal::sqrt(-2.0 * ::metal::log(u1)) * + ::metal::cos(2.0 * M_PI_F * u2); +} + +float rand(long seed, long index) { + auto value = philox4::rand(seed, index); + return detail::uint32_to_uniform_float(value.x); +} + +long randint64(long seed, long index, long low, long high) { + auto range = high - low; + auto value = philox4::rand(seed, index); + // TODO: Implement better algorithm for large ranges + return low + + static_cast(detail::uint32_to_uniform_float(value.x) * range); +} + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/reduction_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/reduction_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f23c1af774ed88568bc1abacc668e98760bb6f98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/reduction_utils.h @@ -0,0 +1,364 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { +namespace metal { +namespace detail { +template +struct simd_type { + using t = T; +}; + +// Helper that allows one to run simd ops over bfl16 by upcasting them to fp32 +template +using simd_type_t = typename simd_type::t; + +template <> +struct simd_type { + using t = float; +}; +} // namespace detail + +template +inline ::metal::enable_if_t, T> simd_sum(T val) { + return 
T(::metal::simd_sum(detail::simd_type_t(val))); +} + +template +inline ::metal::enable_if_t, T> simd_prod(T val) { + return T(::metal::simd_product(detail::simd_type_t(val))); +} + +// Extend simd_broadcast to 64-bit integral types using int2 trick +template < + typename T, + ::metal::enable_if_t<::metal::is_integral_v && sizeof(T) == 8, bool> = + true> +inline T simd_broadcast(T val, ushort lane_id) { + return as_type(::metal::simd_broadcast(as_type(val), lane_id)); +} + +template < + typename T, + ::metal::enable_if_t || sizeof(T) != 8, bool> = + true> +inline T simd_broadcast(T val, ushort lane_id) { + return ::metal::simd_broadcast(val, lane_id); +} + +// Floating simd_min/max with nan propagation +template < + typename T, + ::metal::enable_if_t<::metal::is_floating_point_v, bool> = true> +inline T simd_max(T val) { + if (::metal::simd_any(::metal::isnan(val))) { + return ::metal::numeric_limits::quiet_NaN(); + } + return T(::metal::simd_max(detail::simd_type_t(val))); +} + +template < + typename T, + ::metal::enable_if_t<::metal::is_floating_point_v, bool> = true> +inline T simd_min(T val) { + if (::metal::simd_any(::metal::isnan(val))) { + return ::metal::numeric_limits::quiet_NaN(); + } + return T(::metal::simd_min(detail::simd_type_t(val))); +} + +template < + typename T, + ::metal::enable_if_t<::metal::is_integral_v && sizeof(T) != 8, bool> = + true> +inline T simd_max(T val) { + return ::metal::simd_max(val); +} + +template < + typename T, + ::metal::enable_if_t<::metal::is_integral_v && sizeof(T) != 8, bool> = + true> +inline T simd_min(T val) { + return ::metal::simd_min(val); +} + +// Metal does not support SIMD reductions over 64-bit types, but it could be +// implement using simd_shuffle_down, that yields result in log2(simdgroup_size) +// iterations Use fill variant, as shuffle down returns garbage if inactive +// thread is referenced (on M1/M2, works fine on M4) and broadcast result to all +// threads in the end. 
Implementation heavily borrows from +// https://github.com/ml-explore/mlx/blob/86389bf9707f46101af45d90510e8e97c8a90b93/mlx/backend/metal/kernels/reduction/ops.h#L16 +template +inline ::metal::enable_if_t<::metal::is_same_v, T> simd_sum(T val) { + for (ushort i = simdgroup_size / 2; i > 0; i /= 2) { + val += as_type( + ::metal::simd_shuffle_and_fill_down(as_type(val), int2(0), i)); + } + return simd_broadcast(val, 0); +} + +template +inline ::metal::enable_if_t<::metal::is_same_v, T> simd_prod(T val) { + for (ushort i = simdgroup_size / 2; i > 0; i /= 2) { + val *= as_type( + ::metal::simd_shuffle_and_fill_down(as_type(val), int2(0), i)); + } + return simd_broadcast(val, 0); +} + +template +inline ::metal::enable_if_t<::metal::is_same_v, T> simd_max(T val) { + for (ushort i = simdgroup_size / 2; i > 0; i /= 2) { + val = ::metal::max( + val, + as_type(::metal::simd_shuffle_and_fill_down( + as_type(val), int2(0), i))); + } + return simd_broadcast(val, 0); +} + +template +inline ::metal::enable_if_t<::metal::is_same_v, T> simd_min(T val) { + for (ushort i = simdgroup_size / 2; i > 0; i /= 2) { + val = ::metal::min( + val, + as_type(::metal::simd_shuffle_and_fill_down( + as_type(val), int2(0), i))); + } + return simd_broadcast(val, 0); +} + +// argmin/argmax helpers using simd_ballot +template < + typename T, + ::metal::enable_if_t<::metal::is_integral_v, bool> = true> +inline ::c10::metal::pair simd_argmin(T val) { + const auto rc = simd_min(val); + const auto vote = ::metal::simd_ballot(val == rc); + return {rc, static_cast(::metal::ctz(static_cast(vote)))}; +} + +template < + typename T, + ::metal::enable_if_t<::metal::is_floating_point_v, bool> = true> +inline ::c10::metal::pair simd_argmin(T val) { + const auto rc = simd_min(val); + const auto vote = ::metal::simd_ballot(val == rc || ::metal::isnan(val)); + return {rc, static_cast(::metal::ctz(static_cast(vote)))}; +} + +template < + typename T, + ::metal::enable_if_t<::metal::is_integral_v, bool> = true> +inline 
::c10::metal::pair simd_argmax(T val) { + const auto rc = simd_max(val); + const auto vote = ::metal::simd_ballot(val == rc); + return {rc, static_cast(::metal::ctz(static_cast(vote)))}; +} + +template < + typename T, + ::metal::enable_if_t<::metal::is_floating_point_v, bool> = true> +inline ::c10::metal::pair simd_argmax(T val) { + const auto rc = simd_max(val); + const auto vote = ::metal::simd_ballot(val == rc || ::metal::isnan(val)); + return {rc, static_cast(::metal::ctz(static_cast(vote)))}; +} + +template +inline c10::metal::pair simd_argmin(ARG_T val, IDX_T idx_val) { + auto rc = simd_argmin(val); + return {rc.first, simd_broadcast(idx_val, rc.second)}; +} + +template +inline c10::metal::pair simd_argmax(ARG_T val, IDX_T idx_val) { + auto rc = simd_argmax(val); + return {rc.first, simd_broadcast(idx_val, rc.second)}; +} + +// Below algorithms are written with hardcoded assumption that simdgroup is 32 +// and threadgroup_max is 1024, i.e. reduction can be done in two stages max +template +opmath_t threadgroup_sum( + threadgroup opmath_t* data, + T val, + unsigned idx, + unsigned size) { + auto rc = simd_sum(static_cast>(val)); + if (idx % simdgroup_size == 0) { + data[idx / simdgroup_size] = rc; + } + if (size > simdgroup_size) { + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + if (idx < ((size + simdgroup_size - 1) / simdgroup_size)) { + auto rc1 = simd_sum(data[idx]); + if (idx == 0) { + data[0] = rc1; + } + } + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + return data[0]; +} + +template +opmath_t threadgroup_prod( + threadgroup opmath_t* data, + T val, + unsigned idx, + unsigned size) { + auto rc = simd_prod(static_cast>(val)); + if (idx % simdgroup_size == 0) { + data[idx / simdgroup_size] = rc; + } + if (size > simdgroup_size) { + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + if (idx < ((size + simdgroup_size - 1) / simdgroup_size)) { + auto rc1 = simd_prod(data[idx]); + if (idx == 
0) { + data[0] = rc1; + } + } + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + return data[0]; +} + +template +T threadgroup_max(threadgroup T* data, T val, unsigned idx, unsigned size) { + auto rc = simd_max(val); + if (idx % simdgroup_size == 0) { + data[idx / simdgroup_size] = rc; + } + if (size > simdgroup_size) { + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + if (idx < ((size + simdgroup_size - 1) / simdgroup_size)) { + auto rc1 = simd_max(data[idx]); + if (idx == 0) { + data[0] = rc1; + } + } + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + return data[0]; +} + +template +T threadgroup_min(threadgroup T* data, T val, unsigned idx, unsigned size) { + auto rc = simd_min(val); + if (idx % simdgroup_size == 0) { + data[idx / simdgroup_size] = rc; + } + if (size > simdgroup_size) { + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + if (idx < ((size + simdgroup_size - 1) / simdgroup_size)) { + auto rc1 = simd_min(data[idx]); + if (idx == 0) { + data[0] = rc1; + } + } + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + return data[0]; +} + +template +float3 threadgroup_welford_reduce(threadgroup T* data, unsigned size) { + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + float m = data[0]; + float m2 = 0; + for (unsigned idx = 1; idx < size; ++idx) { + float delta = data[idx] - m; + m += delta / (idx + 1); + m2 += delta * (data[idx] - m); + } + return float3(m, m2, size); +} + +// Each vec3type is tuple of mean, m2 and weight +template +float3 welford_combine(T a, T b) { + float delta = b.x - a.x; + float new_weight = a.z + b.z; + auto w2_over_w = new_weight != 0 ? 
b.z / new_weight : 0.0; + return float3( + a.x + delta * w2_over_w, + a.y + b.y + delta * delta * a.z * w2_over_w, + new_weight); +} + +template +float3 threadgroup_welford_combine(threadgroup T* data, unsigned size) { + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + float3 rc = data[0]; + for (unsigned idx = 1; idx < size; ++idx) { + rc = welford_combine(rc, data[idx]); + } + return rc; +} + +template +IDX_T threadgroup_argmax( + threadgroup ARG_T* arg_data, + threadgroup IDX_T* idx_data, + ARG_T val, + IDX_T idx_val, + unsigned idx, + unsigned size) { + auto rc = simd_argmax(val, idx_val); + if (size <= simdgroup_size) { + return rc.second; + } + if (idx % simdgroup_size == 0) { + arg_data[idx / simdgroup_size] = rc.first; + idx_data[idx / simdgroup_size] = rc.second; + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + if (idx < ((size + simdgroup_size - 1) / simdgroup_size)) { + auto rc1 = simd_argmax(arg_data[idx], idx_data[idx]); + if (idx == 0) { + idx_data[0] = rc1.second; + } + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + return idx_data[0]; +} + +template +IDX_T threadgroup_argmin( + threadgroup ARG_T* arg_data, + threadgroup IDX_T* idx_data, + ARG_T val, + IDX_T idx_val, + unsigned idx, + unsigned size) { + auto rc = simd_argmin(val, idx_val); + if (size <= simdgroup_size) { + return rc.second; + } + if (idx % simdgroup_size == 0) { + arg_data[idx / simdgroup_size] = rc.first; + idx_data[idx / simdgroup_size] = rc.second; + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + if (idx < ((size + simdgroup_size - 1) / simdgroup_size)) { + auto rc1 = simd_argmin(arg_data[idx], idx_data[idx]); + if (idx == 0) { + idx_data[0] = rc1.second; + } + } + ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup); + return idx_data[0]; +} + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/special_math.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/special_math.h new file mode 100644 index 0000000000000000000000000000000000000000..d0fb82cc0ad813b59aeb2e62a0d93ca37ac0d54b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/special_math.h @@ -0,0 +1,2064 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Implementation of special math functions for Metal +#pragma once +#include +#include +#include +#include + +namespace c10 { +namespace metal { + +/* + * Approximation to the error function. + * Based on code from: + * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199 + * Copy-n-pasted from + * https://github.com/ml-explore/mlx/blob/2e8cf0b4506c200a5c2d199ecbbf655fdf4c2ce2/mlx/backend/metal/kernels/erf.h#L11 + */ +template +inline float erf(T x) { + const auto a = static_cast(x); + const auto t = ::metal::abs(a); + const auto s = a * a; + if (t > 0.927734375f) { + // maximum error 0.99527 ulp + auto r = ::metal::fma( + -1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12 + const auto u = ::metal::fma( + -3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6 + r = ::metal::fma(r, s, u); + r = ::metal::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4 + r = ::metal::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1 + r = ::metal::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3 + r = ::metal::fma(r, t, -t); + // TODO, replace with expm1 when implemented + r = 1.0f - ::metal::exp(r); + r = ::metal::copysign(r, a); + return r; + } + + // maximum error 0.98929 ulp + auto r = -5.96761703e-4f; // -0x1.38e000p-11 + r = ::metal::fma(r, s, 
4.99119423e-3f); // 0x1.471a58p-8 + r = ::metal::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6 + r = ::metal::fma(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4 + r = ::metal::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2 + r = ::metal::fma(r, s, 1.28379166e-1f); // 0x1.06eba8p-3 + r = ::metal::fma(r, a, a); + return r; +} + +template +float erfc(T x) { + return 1.0 - erf(x); +} + +template +inline float erfinv(T y) { + /* coefficients in rational expansion */ + constexpr float a[4] = {0.886226899, -1.645349621, 0.914624893, -0.140543331}; + constexpr float b[4] = {-2.118377725, 1.442710462, -0.329097515, 0.012229801}; + constexpr float c[4] = {-1.970840454, -1.624906493, 3.429567803, 1.641345311}; + constexpr float d[2] = {3.543889200, 1.637067800}; + + float x, z, num, dem; /*working variables */ + + float y_abs = ::metal::abs(static_cast(y)); + if (y_abs >= 1.0f) { + return y_abs > 1.0f ? NAN + : ::metal::copysign(INFINITY, static_cast(y)); + } + if (y_abs <= 0.7f) { + z = y * y; + num = ((a[3] * z + a[2]) * z + a[1]) * z + a[0]; + dem = (((b[3] * z + b[2]) * z + b[1]) * z + b[0]) * z + 1.0f; + x = y * num / dem; + } else { + z = ::metal::sqrt(-1.0f * ::metal::log((1.0 - y_abs) / 2.0)); + num = ((c[3] * z + c[2]) * z + c[1]) * z + c[0]; + dem = (d[1] * z + d[0]) * z + 1.0f; + x = ::metal::copysign(num, static_cast(y)) / dem; + } + + return x; +} + +/* + * For licensing information and documentation, please refer to the cpu + * implementation located in "ATen/native/Math.h". 
+ */ + +template +inline T chbevl(T x, const float array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); +} + +// Copied from +// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L502 + +template +inline T i0(T _x) { + auto x = ::metal::fabs(_x); + + if (x <= 8.0) { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + constexpr float A[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + auto y = (x / 2.0) - 2.0; + return static_cast(::metal::exp(x) * chbevl(y, A, 30)); + } + + // Handles x > 8 case + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
+ */ + constexpr float B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return static_cast( + (::metal::exp(x) * chbevl(32.0 / x - 2.0, B, 25)) / ::metal::sqrt(x)); +} + +template +inline T i0e(T _x) { + auto x = ::metal::fabs(_x); + + if (x <= 8.0) { + constexpr float coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + auto y = (x / 2.0) - 2.0; + return static_cast(chbevl(y, coefficients, int{30})); + } + + // x > 8 + constexpr float coefficients[] = { + 
-7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return static_cast( + chbevl(32.0 / x - 2.0, coefficients, 25) / ::metal::sqrt(x)); +} + +// Copied from +// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L576 + +template +inline T i1(T _x) { + const auto x = ::metal::fabs(_x); + + if (x <= 8.0) { + // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8] + // lim(x->0){ exp(-x) i1(x) / x } = 1/2 + constexpr float coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, 
-1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const auto y = x / 2.0 - 2.0; + const auto out = ::metal::exp(x) * x * chbevl(y, coefficients, 29); + return static_cast(_x < T(0.) ? -out : out); + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval [8, infinity] + // lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi) + constexpr float coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + const auto out = (::metal::exp(x) * chbevl(32. / x - 2., coefficients, 25)) / + ::metal::sqrt(x); + return static_cast(_x < T(0.) ? -out : out); +} + +template +inline T i1e(T _x) { + const auto x = ::metal::fabs(_x); + if (x <= 8.0) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. 
+ constexpr float coefficients[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + const auto y = x / 2.0 - 2.0; + const auto out = chbevl(y, coefficients, 17) * x; + return static_cast(_x < 0. ? -out : out); + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + constexpr float coefficients[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + const auto out = + chbevl(32. / x - 2., coefficients, 7) / ::metal::precise::sqrt(x); + return static_cast(_x < 0. ? -out : out); +} + +// gamma, lgamma +template +inline float log_gamma(const T); + +template +inline float gamma(const T x) { + if (x < 0.001) { + constexpr float EULER_MASCHERONI = 0.577215664901532860606512090; + // For small x, 1/gamma(x) has power series x + gamma x^2 - ... + // So in this range, 1/gamma(x) = x + gamma x^2 with error on the order of + // x^3. The relative error over this interval is less than 6e-7. + + return 1.0 / (x * (1.0 + EULER_MASCHERONI * x)); + } + if (x >= 12.0) { + return ::metal::exp(log_gamma(x)); + } + // The algorithm directly approximates gamma over (1,2) and uses + // reduction identities to reduce other arguments to this interval. 
+ // numerator coefficients for gamma approximation over the interval (1,2) + constexpr float GAMMA_NUMERATOR_COEF[8] = { + -1.71618513886549492533811E+0, + 2.47656508055759199108314E+1, + -3.79804256470945635097577E+2, + 6.29331155312818442661052E+2, + 8.66966202790413211295064E+2, + -3.14512729688483675254357E+4, + -3.61444134186911729807069E+4, + 6.64561438202405440627855E+4}; + + // denominator coefficients for gamma approximation over the interval (1,2) + constexpr float GAMMA_DENOMINATOR_COEF[8] = { + -3.08402300119738975254353E+1, + 3.15350626979604161529144E+2, + -1.01515636749021914166146E+3, + -3.10777167157231109440444E+3, + 2.25381184209801510330112E+4, + 4.75584627752788110767815E+3, + -1.34659959864969306392456E+5, + -1.15132259675553483497211E+5}; + + // Add or subtract integers as necessary to bring y into (1,2) + float y = 1.0 + ::metal::fract(x); + + float num = 0.0; + float den = 1.0; + + float z = y - 1; + for (int i = 0; i < 8; i++) { + num = (num + GAMMA_NUMERATOR_COEF[i]) * z; + den = den * z + GAMMA_DENOMINATOR_COEF[i]; + } + float result = num / den + 1.0; + + // Apply correction if argument was not initially in (1,2) + if (x < 1.0) { + // identity gamma(z) = gamma(z+1)/z + result /= (y - 1.0); + } else { + // identity gamma(z+n) = z*(z+1)* ... 
*(z+n-1)*gamma(z) + auto n = static_cast(::metal::floor(x)); + for (int i = 1; i < n; i++) { + result *= y++; + } + } + + return result; +} + +template +inline float log_gamma(const T x) { + constexpr float LOG_PI = 1.14472988584940017414342735135305; + constexpr float HALF_LOG_TWO_PI = 0.91893853320467274178032973640562; + constexpr float LGAMMA_EXPANSION_COEF[8] = { + 1.0 / 12.0, + -1.0 / 360.0, + 1.0 / 1260.0, + -1.0 / 1680.0, + 1.0 / 1188.0, + -691.0 / 360360.0, + 1.0 / 156.0, + -3617.0 / 122400.0}; + + float rc; + + const auto abs_x = ::metal::abs(static_cast(x)); + if (abs_x == 0) { + return INFINITY; + } + if (abs_x < 12.0) { + rc = ::metal::log(::metal::abs(gamma(abs_x))); + } else { + // Abramowitz and Stegun 6.1.41 + // Asymptotic series should be good to at least 11 or 12 figures + // For error analysis, see Whittiker and Watson + // A Course in Modern Analysis (1927), page 252 + + float z = 1.0 / (abs_x * abs_x); + float sum = LGAMMA_EXPANSION_COEF[7]; + + for (int i = 6; i >= 0; i--) { + sum *= z; + sum += LGAMMA_EXPANSION_COEF[i]; + } + float series = sum / abs_x; + + rc = (abs_x - 0.5) * ::metal::log(abs_x) - abs_x + HALF_LOG_TWO_PI + series; + } + + if (x >= 0) { + return rc; + } + + // Reflection formula + // Compute arg first to workaround Metal compiler bgg of sorts on M4 + // See https://github.com/pytorch/pytorch/pull/145740 for more details + auto log_arg = abs_x * ::metal::abs(::metal::sinpi(abs_x)); + return LOG_PI - rc - ::metal::log(log_arg); +} + +inline float zeta(float x, float q) { + constexpr float MACHEP = 1.11022302462515654042E-16; + constexpr float ZETA_EXPANSION[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, + 7.47242496e10, + -2.950130727918164224e12, + 1.1646782814350067249e14, + -4.5979787224074726105e15, + 1.8152105401943546773e17, + -7.1661652561756670113e18}; + if (x == 1.0f) { + return INFINITY; + } + + if (x < 1.0f) { + return NAN; + } + + if (q <= 0.0f) { + if (q == 
::metal::trunc(q)) { + return INFINITY; + } + if (x != ::metal::trunc(x)) { + return NAN; + } + } + + float s = ::metal::pow(q, -x); + float a = q; + int i = 0; + float b = 0.0f; + while ((i < 9) || (a <= 9.0f)) { + i += 1; + a += 1.0f; + b = ::metal::pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return s; + } + } + + float w = a; + s += b * w / (x - 1.0f); + s -= 0.5f * b; + a = 1.0f; + float t; + float k = 0.0f; + for (int i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / ZETA_EXPANSION[i]; + s += t; + t = ::metal::fabs(t / s); + if (t < MACHEP) { + return s; + } + k += 1.0f; + a *= x + k; + b /= w; + k += 1.0f; + } + return s; +} + +inline float calc_digamma_positive_domain(float x) { + constexpr float DIGAMMA_COEF[7] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + // Push x to be >= 10 + float result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + constexpr float PSI_10 = 2.25175258906672110764; + return result + PSI_10; + } + + // Compute asymptotic digamma + float y = 0; + if (x < 1.0E+17) { + float z = 1.0 / (x * x); + for (int i = 0; i <= 6; i++) { + y += ::metal::pow(z, i) * DIGAMMA_COEF[i]; + } + y *= z; + } + return result + ::metal::log(x) - (0.5 / x) - y; +} + +template +inline float digamma(T0 x) { + if (x < 0.0f) { + if (x == ::metal::trunc(x)) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return NAN; + } else { + // Extracts the fractional part of x as r, since tan(pi * r) is more + // numerically accurate than tan(pi * x). While these operations are + // mathematically equivalent since both x and r are in radians and tan() + // has a periodicity of pi, in practice the computation of pi * x is a + // source of error (when |x| > 1). 
+ float r = ::metal::fract(x); + return calc_digamma_positive_domain(1.0f - x) - + M_PI_F / ::metal::tan(M_PI_F * r); + } + } else if (x == 0.0f) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return ::metal::copysign(INFINITY, static_cast(-x)); + } else { + return calc_digamma_positive_domain(x); + } +} + +template +inline float polygamma(const int64_t order, const T0 input) { + // Filter out n == 0. + if (order == 0) { + return digamma(input); + } + + float x = input; + float n = order; + float sgn = ((order % 2) ? 1 : -1); + return sgn * gamma(n + 1) * zeta(n + 1, x); +} + +template +inline ::metal::enable_if_t, T> sinc(T a) { + if (a == static_cast(0)) { + return static_cast(1); + } + auto product = M_PI_F * static_cast(a); + return static_cast(::metal::precise::sin(product) / product); +} + +// Complex sinc2 implementation +template +inline ::metal::enable_if_t, T> sinc(T inp) { + auto a = static_cast(inp) * M_PI_F; + const float a2 = a.x * a.x + a.y * a.y; + if (a2 == 0) { + return 0; + } + float cosx; + float sinx = ::metal::sincos(a.x, cosx); + float sinhy = ::metal::sinh(a.y); + float coshy = ::metal::cosh(a.y); + auto re = sinx * coshy * a.x + cosx * sinhy * a.y; + auto im = cosx * sinhy * a.x - sinx * coshy * a.y; + return T(re, im) / a2; +} + +template +inline T spherical_bessel_j0(T x) { + if (::metal::isinf(x)) + return T(0.0); + T x2 = x * x; + T k1 = static_cast(-1.0); + T k2 = static_cast(1.0); + + if (::metal::fabs(static_cast(x)) < T(0.5)) { + return T(1.0) + + x2 * + (k1 / T(6.0) + + x2 * + (k2 / T(120.0) + + x2 * + (k1 / T(5040.0) + + x2 * + (k2 / T(362880.0) + + x2 * + (k1 / T(39916800.0) + + x2 * (k2 / T(6227020800.0))))))); + } + + return static_cast(::metal::sin(x) / x); +} + +template +inline ::metal::enable_if_t, T> logaddexp( + T a, + T b) { + float a0 = static_cast(a); + float b0 = static_cast(b); + if (::metal::isinf(a0) && a0 == b0) { + return static_cast(a0); + } else { 
+ float m0 = ::metal::max(a0, b0); + return static_cast( + m0 + ::c10::metal::log1p(::metal::exp(-::metal::abs(a0 - b0)))); + } +} + +// The function is ported from mlx +template +inline ::metal::enable_if_t, T> logaddexp(T a, T b) { + if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) || + ::metal::isnan(b.y)) { + return T(NAN, NAN); + } + + T maxval = a.x > b.x ? a : b; + T minval = a.x < b.x ? a : b; + constexpr auto inf = ::metal::numeric_limits::infinity().x; + + if (minval.x == -inf || maxval.x == inf) { + return maxval; + } + + float2 maxval_ = static_cast(maxval); + float2 minval_ = static_cast(minval); + float m = ::metal::exp(minval_.x - maxval_.x); + float2 dexp{ + m * ::metal::cos(minval_.y - maxval_.y), + m * ::metal::sin(minval_.y - maxval_.y), + }; + return static_cast(maxval_ + ::c10::metal::log1p(dexp)); +} + +template +inline T logaddexp2(T a, T b) { + constexpr auto log_2 = float(0.693147180559945309417232121458176); + constexpr auto inv_log_2 = float(1) / log_2; + float a0 = static_cast(a); + float b0 = static_cast(b); + if (::metal::isinf(a0) && a0 == b0) { + return static_cast(a0); + } else { + float m0 = ::metal::max(a0, b0); + return static_cast( + m0 + + ::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) * + inv_log_2); + } +} + +template +inline float xlog1py(T x, T y) { + if (::metal::isnan(y)) { + return NAN; + } + + if (x == 0) { + return x; + } + + return x * ::c10::metal::log1p(y); +} + +template +inline T entr(T a) { + if (a != a) { + return a; + } + + if (a > 0) { + return static_cast(-a * ::metal::log(a)); + } + + if (a == 0) { + return 0; + } + + return static_cast(-INFINITY); +} + +// Copy-n-paste from aten/src/ATen/native/cuda/Math.cuh lines 1463-1915 +template +inline float bessel_j0_forward(T x) { + constexpr float PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + 
+5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + constexpr float PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + constexpr float QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + constexpr float QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + constexpr float RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + constexpr float RQ[] = { + +4.99563147152651017219e+02, + +1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return 1.0 - x * x / 4.0; + } + + float rp = 0.0; + + for (auto index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + float rq = 0.0; + + for (auto index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - 5.78318596294678452118e+00) * + (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + float pp = 0.0; + + for (auto index = 0; index <= 6; index++) { + pp = pp * (25.0 / (x * x)) + PP[index]; + } + + float pq = 0.0; + + for (auto index = 0; index <= 6; index++) { + pq = pq * (25.0 / (x * x)) + PQ[index]; + } + + float qp = 0.0; + + for (auto index = 0; index 
<= 7; index++) { + qp = qp * (25.0 / (x * x)) + QP[index]; + } + + float qq = 0.0; + + for (auto index = 0; index <= 6; index++) { + qq = qq * (25.0 / (x * x)) + QQ[index]; + } + + return (pp / pq * + ::metal::precise::cos( + x - T(0.785398163397448309615660845819875721)) - + 5.0 / x * (qp / qq) * + ::metal::precise::sin( + x - 0.785398163397448309615660845819875721)) * + 0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x); +} // bessel_j0_forward(T x) + +template +inline float bessel_y0_forward(T x) { + constexpr float PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + constexpr float PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + constexpr float QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + constexpr float QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + constexpr float YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + constexpr float YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + 
+2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return -INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + float yp = 0.0; + + for (auto index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + float yq = 0.0; + + for (auto index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + + (0.636619772367581343075535053490057448 * ::metal::precise::log(x) * + bessel_j0_forward(x)); + } + + float pp = 0.0; + + for (auto index = 0; index <= 6; index++) { + pp = pp * (25.0 / (x * x)) + PP[index]; + } + + float pq = 0.0; + + for (auto index = 0; index <= 6; index++) { + pq = pq * (25.0 / (x * x)) + PQ[index]; + } + + float qp = 0.0; + + for (auto index = 0; index <= 7; index++) { + qp = qp * (25.0 / (x * x)) + QP[index]; + } + + float qq = 0.0; + + for (auto index = 0; index <= 6; index++) { + qq = qq * (25.0 / (x * x)) + QQ[index]; + } + + return (pp / pq * + ::metal::precise::sin( + x - 0.785398163397448309615660845819875721) + + 5.0 / x * (qp / qq) * + ::metal::precise::cos( + x - 0.785398163397448309615660845819875721)) * + 0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x); +} // bessel_y0_forward(T x) + +template +inline float bessel_j1_forward(T x) { + constexpr float PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + constexpr float PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + constexpr float QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + 
+7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + constexpr float QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + constexpr float RP[] = { + -8.99971225705559398224e+08, + +4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + constexpr float RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + float rp = 0.0; + + for (auto index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + float rq = 0.0; + + for (auto index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - 1.46819706421238932572e+01) * + (x * x - 4.92184563216946036703e+01); + } + + float pp = 0.0; + + for (auto index = 0; index <= 6; index++) { + pp = pp * (5.0 / x * (5.0 / x)) + PP[index]; + } + + float pq = 0.0; + + for (auto index = 0; index <= 6; index++) { + pq = pq * (5.0 / x * (5.0 / x)) + PQ[index]; + } + + float qp = 0.0; + + for (auto index = 0; index <= 7; index++) { + qp = qp * (5.0 / x * (5.0 / x)) + QP[index]; + } + + float qq = 0.0; + + for (auto index = 0; index <= 6; index++) { + qq = qq * (5.0 / x * (5.0 / x)) + QQ[index]; + } + + return (pp / pq * + ::metal::precise::cos( + x - 2.356194490192344928846982537459627163) - + 5.0 / x * (qp / qq) * + ::metal::precise::sin( + x - 2.356194490192344928846982537459627163)) * + 0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x); +} // bessel_j1_forward(T x) + +template 
+inline float bessel_y1_forward(T x) { + constexpr float PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + constexpr float PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + constexpr float QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + constexpr float QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + constexpr float YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + constexpr float YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return -INFINITY; + } + + if (x <= T(0.0)) { + return NAN; + } + + float yp = 0.0; + + for (auto index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + float yq = 0.0; + + for (auto index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + + (0.636619772367581343075535053490057448 * + (bessel_j1_forward(x) * 
::metal::precise::log(x) - 1.0 / x)); + } + + float pp = 0.0; + + for (auto index = 0; index <= 6; index++) { + pp = pp * (5.0 / x * (5.0 / x)) + PP[index]; + } + + float pq = 0.0; + + for (auto index = 0; index <= 6; index++) { + pq = pq * (5.0 / x * (5.0 / x)) + PQ[index]; + } + + float qp = 0.0; + + for (auto index = 0; index <= 7; index++) { + qp = qp * (5.0 / x * (5.0 / x)) + QP[index]; + } + + float qq = 0.0; + + for (auto index = 0; index <= 6; index++) { + qq = qq * (5.0 / x * (5.0 / x)) + QQ[index]; + } + + return (pp / pq * + ::metal::precise::sin( + x - 2.356194490192344928846982537459627163) + + 5.0 / x * (qp / qq) * + ::metal::precise::cos( + x - 2.356194490192344928846982537459627163)) * + 0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x); +} // bessel_y1_forward(T x) + +template +inline float modified_bessel_i0_forward(T x) { + constexpr float A[] = { + -4.41534164647933937950e-18, +3.33079451882223809783e-17, + -2.43127984654795469359e-16, +1.71539128555513303061e-15, + -1.16853328779934516808e-14, +7.67618549860493561688e-14, + -4.85644678311192946090e-13, +2.95505266312963983461e-12, + -1.72682629144155570723e-11, +9.67580903537323691224e-11, + -5.18979560163526290666e-10, +2.65982372468238665035e-09, + -1.30002500998624804212e-08, +6.04699502254191894932e-08, + -2.67079385394061173391e-07, +1.11738753912010371815e-06, + -4.41673835845875056359e-06, +1.64484480707288970893e-05, + -5.75419501008210370398e-05, +1.88502885095841655729e-04, + -5.76375574538582365885e-04, +1.63947561694133579842e-03, + -4.32430999505057594430e-03, +1.05464603945949983183e-02, + -2.37374148058994688156e-02, +4.93052842396707084878e-02, + -9.49010970480476444210e-02, +1.71620901522208775349e-01, + -3.04682672343198398683e-01, +6.76795274409476084995e-01, + }; + + constexpr float B[] = { + -7.23318048787475395456e-18, -4.83050448594418207126e-18, + +4.46562142029675999901e-17, +3.46122286769746109310e-17, + -2.82762398051658348494e-16, 
-3.42548561967721913462e-16, + +1.77256013305652638360e-15, +3.81168066935262242075e-15, + -9.55484669882830764870e-15, -4.15056934728722208663e-14, + +1.54008621752140982691e-14, +3.85277838274214270114e-13, + +7.18012445138366623367e-13, -1.79417853150680611778e-12, + -1.32158118404477131188e-11, -3.14991652796324136454e-11, + +1.18891471078464383424e-11, +4.94060238822496958910e-10, + +3.39623202570838634515e-09, +2.26666899049817806459e-08, + +2.04891858946906374183e-07, +2.89137052083475648297e-06, + +6.88975834691682398426e-05, +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + float p; + float q = 0.0; + + if (::metal::fabs(x) <= 8.0) { + float a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = (.5 * ::metal::fabs(x) - 2.0) * q - p + A[index]; + } + + return ::metal::exp(::metal::fabs(x)) * (T(0.5) * (a - p)); + } + + float b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (32.0 / ::metal::fabs(x) - 2.0) * q - p + B[index]; + } + + return ::metal::exp(::metal::fabs(x)) * (.5 * (b - p)) / + ::metal::precise::sqrt(::metal::fabs(x)); +} // modified_bessel_i0_forward(T x) + +template +inline float modified_bessel_i1_forward(T x) { + constexpr float A[] = { + +2.77791411276104639959e-18, -2.11142121435816608115e-17, + +1.55363195773620046921e-16, -1.10559694773538630805e-15, + +7.60068429473540693410e-15, -5.04218550472791168711e-14, + +3.22379336594557470981e-13, -1.98397439776494371520e-12, + +1.17361862988909016308e-11, -6.66348972350202774223e-11, + +3.62559028155211703701e-10, -1.88724975172282928790e-09, + +9.38153738649577178388e-09, -4.44505912879632808065e-08, + +2.00329475355213526229e-07, -8.56872026469545474066e-07, + +3.47025130813767847674e-06, -1.32731636560394358279e-05, + +4.78156510755005422638e-05, -1.61760815825896745588e-04, + +5.12285956168575772895e-04, -1.51357245063125314899e-03, + +4.15642294431288815669e-03, -1.05640848946261981558e-02, + 
+2.47264490306265168283e-02, -5.29459812080949914269e-02, + +1.02643658689847095384e-01, -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + constexpr float B[] = { + +7.51729631084210481353e-18, +4.41434832307170791151e-18, + -4.65030536848935832153e-17, -3.20952592199342395980e-17, + +2.96262899764595013876e-16, +3.30820231092092828324e-16, + -1.88035477551078244854e-15, -3.81440307243700780478e-15, + +1.04202769841288027642e-14, +4.27244001671195135429e-14, + -2.10154184277266431302e-14, -4.08355111109219731823e-13, + -7.19855177624590851209e-13, +2.03562854414708950722e-12, + +1.41258074366137813316e-11, +3.25260358301548823856e-11, + -1.89749581235054123450e-11, -5.58974346219658380687e-10, + -3.83538038596423702205e-09, -2.63146884688951950684e-08, + -2.51223623787020892529e-07, -3.88256480887769039346e-06, + -1.10588938762623716291e-04, -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + float p; + float q = 0.0; + + if (::metal::fabs(x) <= T(8.0)) { + float a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = (.5 * ::metal::fabs(x) - 2.0) * q - p + A[index]; + } + + return .5 * (a - p) * x * ::metal::precise::exp(::metal::fabs(x)); + } + + float b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (32.0 / ::metal::fabs(x) - 2.0) * q - p + B[index]; + } + + if (x < 0.0) { + return -( + ::metal::precise::exp(::metal::fabs(x)) * (0.5 * (b - p)) / + ::metal::precise::sqrt(::metal::fabs(x))); + } + + return ::metal::precise::exp(::metal::fabs(x)) * (0.5 * (b - p)) / + ::metal::precise::sqrt(::metal::fabs(x)); +} // modified_bessel_i1_forward(T x) + +template +inline float modified_bessel_k0_forward(T x) { + constexpr float A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + 
+3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + constexpr float B[] = { + +5.30043377268626276149e-18, -1.64758043015242134646e-17, + +5.21039150503902756861e-17, -1.67823109680541210385e-16, + +5.51205597852431940784e-16, -1.84859337734377901440e-15, + +6.34007647740507060557e-15, -2.22751332699166985548e-14, + +8.03289077536357521100e-14, -2.98009692317273043925e-13, + +1.14034058820847496303e-12, -4.51459788337394416547e-12, + +1.85594911495471785253e-11, -7.95748924447710747776e-11, + +3.57739728140030116597e-10, -1.69753450938905987466e-09, + +8.57403401741422608519e-09, -4.66048989768794782956e-08, + +2.76681363944501510342e-07, -1.83175552271911948767e-06, + +1.39498137188764993662e-05, -1.28495495816278026384e-04, + +1.56988388573005337491e-03, -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == 0.0) { + return INFINITY; + } + + if (x < 0.0) { + return NAN; + } + + float p; + float q = 0.0; + + if (x <= 2.0) { + float a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - 2.0) * q - p + A[index]; + } + + return 0.5 * (a - p) - + ::metal::log(0.5 * x) * modified_bessel_i0_forward(x); + } + + float b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (8.0 / x - 2.0) * q - p + B[index]; + } + + return ::metal::exp(-x) * (0.5 * (b - p)) / ::metal::sqrt(x); +} // modified_bessel_k0_forward(T x) + +template +inline float modified_bessel_k1_forward(T x) { + constexpr float A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + constexpr float B[] = { + -5.75674448366501715755e-18, +1.79405087314755922667e-17, + 
-5.68946255844285935196e-17, +1.83809354436663880070e-16, + -6.05704724837331885336e-16, +2.03870316562433424052e-15, + -7.01983709041831346144e-15, +2.47715442448130437068e-14, + -8.97670518232499435011e-14, +3.34841966607842919884e-13, + -1.28917396095102890680e-12, +5.13963967348173025100e-12, + -2.12996783842756842877e-11, +9.21831518760500529508e-11, + -4.19035475934189648750e-10, +2.01504975519703286596e-09, + -1.03457624656780970260e-08, +5.74108412545004946722e-08, + -3.50196060308781257119e-07, +2.40648494783721712015e-06, + -1.93619797416608296024e-05, +1.95215518471351631108e-04, + -2.85781685962277938680e-03, +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == 0.0) { + return INFINITY; + } + + if (x < 0.0) { + return NAN; + } + + float p; + float q = 0.0; + + if (x <= 2.0) { + float a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return ::metal::precise::log(T(0.5) * x) * modified_bessel_i1_forward(x) + + 0.5 * (a - p) / x; + } + + float b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (8.0 / x - 2.0) * q - p + B[index]; + } + + return ::metal::precise::exp(-x) * (0.5 * (b - p)) / + ::metal::precise::sqrt(x); +} + +template +inline float scaled_modified_bessel_k0_forward(T x) { + constexpr float A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + constexpr float B[] = { + +5.30043377268626276149e-18, -1.64758043015242134646e-17, + +5.21039150503902756861e-17, -1.67823109680541210385e-16, + +5.51205597852431940784e-16, -1.84859337734377901440e-15, + +6.34007647740507060557e-15, -2.22751332699166985548e-14, + +8.03289077536357521100e-14, 
-2.98009692317273043925e-13, + +1.14034058820847496303e-12, -4.51459788337394416547e-12, + +1.85594911495471785253e-11, -7.95748924447710747776e-11, + +3.57739728140030116597e-10, -1.69753450938905987466e-09, + +8.57403401741422608519e-09, -4.66048989768794782956e-08, + +2.76681363944501510342e-07, -1.83175552271911948767e-06, + +1.39498137188764993662e-05, -1.28495495816278026384e-04, + +1.56988388573005337491e-03, -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == 0.0) { + return INFINITY; + } + + if (x < 0.0) { + return NAN; + } + + float p; + float q = 0.0; + + if (x <= 2.0) { + float a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (0.5 * (a - p) - + ::metal::precise::log(0.5 * x) * modified_bessel_i0_forward(x)) * + ::metal::precise::exp(x); + } + + float b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (8.0 / x - 2.0) * q - p + B[index]; + } + + return 0.5 * (b - p) / ::metal::precise::sqrt(x); +} + +template +inline float scaled_modified_bessel_k1_forward(T x) { + constexpr float A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + constexpr float B[] = { + -5.75674448366501715755e-18, +1.79405087314755922667e-17, + -5.68946255844285935196e-17, +1.83809354436663880070e-16, + -6.05704724837331885336e-16, +2.03870316562433424052e-15, + -7.01983709041831346144e-15, +2.47715442448130437068e-14, + -8.97670518232499435011e-14, +3.34841966607842919884e-13, + -1.28917396095102890680e-12, +5.13963967348173025100e-12, + -2.12996783842756842877e-11, +9.21831518760500529508e-11, + -4.19035475934189648750e-10, 
+2.01504975519703286596e-09, + -1.03457624656780970260e-08, +5.74108412545004946722e-08, + -3.50196060308781257119e-07, +2.40648494783721712015e-06, + -1.93619797416608296024e-05, +1.95215518471351631108e-04, + -2.85781685962277938680e-03, +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == 0.0) { + return INFINITY; + } + + if (x < 0.0) { + return NAN; + } + + float p; + float q = 0.0; + + if (x <= 2.0) { + float a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - 2.0) * q - p + A[index]; + } + + return (::metal::precise::log(0.5 * x) * modified_bessel_i1_forward(x) + + 0.5 * (a - p) / x) * + ::metal::precise::exp(x); + } + + float b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (8.0 / x - 2.0) * q - p + B[index]; + } + + return (0.5 * (b - p) / ::metal::precise::sqrt(x)); +} + +template +float chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (::metal::fabs(x) == 1.0) { + if (x > 0.0 || n % 2 == 0) { + return 1.0; + } + + return -1.0; + } + + if ((n > 6) && (::metal::precise::fabs(x) < 1.0)) { + return ::metal::precise::cos(n * ::metal::precise::acos(x)); + } + + if (n == 0) { + return 1.0; + } + + if (n == 1) { + return x; + } + + float p = 1.0; + float q = x; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + return r; +} + +template +float chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (::metal::fabs(x) == 1.0) { + if (x > 0.0 || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (::metal::fabs(x) < 1.0)) { + const auto acos_x = ::metal::precise::acos(x); + if (::metal::precise::sin(acos_x) != 0.0) { + return ::metal::precise::sin((n + 1) * acos_x) / + ::metal::precise::sin(acos_x); + } + + return (n + 1) * ::metal::precise::cos((n + 1) * acos_x) / x; + } + + if (n == 0) { + return 
1.0; + } + + auto q = 2.0 * x; + if (n == 1) { + return q; + } + + auto p = 1.0; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = 2 * x * q - p; + p = q; + q = r; + } + + return r; +} + +template +float chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (::metal::fabs(x) == 1.0) { + if (x > 0.0) { + return 1.0; + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (::metal::fabs(x) < 1.0)) { + const auto acos_x = ::metal::precise::acos(x); + if (::metal::precise::sin(.5 * acos_x) != 1.0) { + return ::metal::precise::cos((n + 0.5) * acos_x) / + ::metal::precise::cos(.5 * acos_x); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return 1.0; + } + + auto q = 2.0 * x - 1.0; + if (n == 1) { + return q; + } + + auto p = 1.0; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = 2 * x * q - p; + p = q; + q = r; + } + + return r; +} // chebyshev_polynomial_v_forward(T x, int64_t n) + +template +float chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (::metal::fabs(x) == 1.0) { + if (x > 0.0) { + return n + n + 1; + } + + if (n % 2 == 0) { + return 1.0; + } + + return -1.0; + } + + if ((n > 8) && (::metal::fabs(x) < 1.0)) { + const auto acos_x = ::metal::precise::acos(x); + if (::metal::precise::cos(.5 * acos_x) != 1.0) { + return ::metal::precise::sin((n + 0.5) * acos_x) / + ::metal::precise::sin(.5 * acos_x); + } + + if (x > 0.0) { + return n + n + 1; + } + + if (n % 2 == 0) { + return 1.0; + } + + return -1.0; + } + + if (n == 0) { + return 1.0; + } + + auto q = 2.0 * x + 1.0; + if (n == 1) { + return q; + } + + auto p = 1.0; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = 2.0 * x * q - p; + p = q; + q = r; + } + + return r; +} // chebyshev_polynomial_w_forward(T x, int64_t n) + +template +float 
shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (x == T(1.0)) { + return 1.0; + } + + if (x == 0.0) { + if (n % 2 == 0) { + return 1.0; + } + + return -1.0; + } + + const float xpxm1 = x + x - 1.0; + if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) { + return ::metal::precise::cos(n * ::metal::precise::acos(xpxm1)); + } + + if (n == 0) { + return 1.0; + } + + if (n == 1) { + return xpxm1; + } + + float p = 1.0; + float q = xpxm1; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = (xpxm1 + xpxm1) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + +template +float shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (x == 1.0) { + return n + 1; + } + + if (x == 0.0) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + const float xpxm1 = x + x - 1.0; + if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) { + const float acos_2xm1 = ::metal::precise::acos(xpxm1); + const float divisor = ::metal::precise::sin(acos_2xm1); + if (divisor != 0.0) { + return ::metal::precise::sin((n + 1) * acos_2xm1) / divisor; + } + + return (n + 1) * ::metal::precise::cos((n + 1) * acos_2xm1) / xpxm1; + } + + if (n == 0) { + return 1.0; + } + + if (n == 1) { + return xpxm1 + xpxm1; + } + + float p = 1.0; + float q = xpxm1 + xpxm1; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = (xpxm1 + xpxm1) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + +template +float shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (x == 1.0) { + return 1.0; + } + + if (x == 0.0) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + const float xpxm1 = x + x - 1.0; + if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) { + const float acos_2xm1 = ::metal::precise::acos(xpxm1); + if 
(::metal::precise::sin(acos_2xm1 / 2.0) != 1.0) { + return ::metal::precise::cos((n + 0.5) * acos_2xm1) / + ::metal::precise::cos(acos_2xm1 / 2.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return xpxm1 + xpxm1 - 1.0; + } + + float p = 1.0; + float q = xpxm1 + xpxm1 - 1.0; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = (xpxm1 + xpxm1) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + +template +float shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (x == 1.0) { + return n + n + 1; + } + + if (x == 0.0) { + if (n % 2 == 0) { + return 1.0; + } + + return -1.0; + } + + const float xpxm1 = x + x - 1.0; + if ((n > 4) && (::metal::abs(xpxm1) < 1.0)) { + const float acos_2xm1 = ::metal::precise::acos(xpxm1); + if (::metal::precise::cos(acos_2xm1 / 2.0) != 1.0) { + return ::metal::precise::sin((n + 0.5) * acos_2xm1) / + ::metal::precise::sin(acos_2xm1 / 2.0); + } + + if (n % 2 == 0) { + return 1.0; + } + + return -1.0; + } + + if (n == 0) { + return 1.0; + } + + if (n == 1) { + return xpxm1 + xpxm1 + 1.0; + } + + float p = 1.0; + float q = xpxm1 + xpxm1 + 1.0; + float r; + + for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) { + r = (xpxm1 + xpxm1) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + +template +// TODO: Add 512 if/when double will be supported in Metal +inline constexpr int getHermitianLimit() { + return 128; +} + +template +inline float hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (n == 0) { + return 1.0; + } + + if (n == 1) { + return x + x; + } + + if (n > getHermitianLimit()) { + return NAN; + } + + float p = 1.0; + float q = x + x; + float r = 0.0; + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; 
+ p = q; + q = r; + } + + return r; +} // hermite_polynomial_h_forward(T x, int64_t n) + +template +inline float hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return 0.0; + } + + if (n == 0) { + return 1.0; + } + + if (n == 1) { + return x; + } + + if (n > getHermitianLimit()) { + return NAN; + } + + float p = 1.0; + float q = x; + float r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; +} // hermite_polynomial_he_forward(T x, int64_t n) + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..13c23ac7ed705a4e8fc76ba144f603be82a9c503 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/utils.h @@ -0,0 +1,386 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Metal helper functions +#pragma once +#include +#include + +namespace c10 { +namespace metal { + +namespace detail { +template +struct vectypes {}; + +template <> +struct vectypes { + using type4 = float4; + using type3 = float3; + using type2 = float2; +}; + +template <> +struct vectypes { + using type4 = half4; + using type3 = half3; + using type2 = half2; +}; + +template <> +struct vectypes { + using type4 = bfloat4; + using type3 = bfloat3; + using type2 = bfloat2; +}; + +template <> +struct vectypes { + using type4 = short4; + using type3 = short3; + using type2 = short2; +}; + +template <> +struct vectypes { + using type4 = int4; + using type3 = int3; + using type2 = int2; +}; + +template <> +struct vectypes { + using type4 = 
short4; + using type3 = short3; + using type2 = short2; +}; + +template +struct OpMathType { + using type = T; +}; + +template <> +struct OpMathType { + using type = float; +}; + +template <> +struct OpMathType { + using type = int; +}; + +template <> +struct OpMathType { + using type = int; +}; + +template <> +struct OpMathType { + using type = int; +}; + +template <> +struct OpMathType { + using type = float; +}; + +// Type promotion structure for higher precision accumulation +template +struct AccumulationType { + using type = T; +}; + +// Specialization for half - promote to float for accumulation +template <> +struct AccumulationType { + using type = float; +}; + +// Specialization for bfloat - promote to float for accumulation +template <> +struct AccumulationType { + using type = float; +}; + +} // namespace detail + +template +::metal::enable_if_t<::metal::is_floating_point_v, T> max(T a, T b) { + return ::metal::isunordered(a, b) ? NAN : ::metal::max(a, b); +} + +template +::metal::enable_if_t<::metal::is_integral_v&& ::metal::is_integral_v, T> +max(T a, U b) { + return ::metal::max(a, static_cast(b)); +} + +template +::metal::enable_if_t<::metal::is_floating_point_v, T> min(T a, T b) { + return ::metal::isunordered(a, b) ? NAN : ::metal::min(a, b); +} + +template +::metal::enable_if_t<::metal::is_integral_v&& ::metal::is_integral_v, T> +min(T a, U b) { + return ::metal::min(a, static_cast(b)); +} + +template <> +inline bfloat min(bfloat a, bfloat b) { + return bfloat( + ::metal::isunordered(a, b) ? NAN : ::metal::min(float(a), float(b))); +} + +template <> +inline bfloat max(bfloat a, bfloat b) { + return bfloat( + ::metal::isunordered(a, b) ? 
NAN : ::metal::max(float(a), float(b))); +} + +template +using vec2type_t = typename detail::vectypes::type2; + +template +using vec4type_t = typename detail::vectypes::type4; + +template +using opmath_t = typename detail::OpMathType::type; + +template +using accum_t = typename detail::AccumulationType::type; + +// TODO: Move it to type_traits header may be +template +using result_of = decltype(::metal::declval()(::metal::declval()...)); + +template +constexpr constant bool is_complex_v = + ::metal::is_same_v || ::metal::is_same_v; + +template +constexpr constant bool is_scalar_floating_point_v = + ::metal::is_floating_point_v && ::metal::is_scalar_v; + +template +constexpr constant bool is_scalar_integral_v = + ::metal::is_integral_v && ::metal::is_scalar_v; + +template +using common_dtype = decltype(U(0) + V(0)); + +// floor_divide +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_integral_v && is_scalar_integral_v, + bool> = true> +inline common_dtype floor_divide(T x, U y) { + const auto quot = x / y; + return (x < 0) == (y < 0) ? quot : (x % y != 0) ? 
quot - 1 : quot; +} + +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_floating_point_v && is_scalar_floating_point_v, + bool> = true> +inline common_dtype floor_divide(T x, U y) { + return ::metal::floor(x / y); +} + +// fmod +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_integral_v && is_scalar_integral_v, + bool> = true> +inline common_dtype fmod(T x, U y) { + return x % y; +} + +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_floating_point_v && is_scalar_floating_point_v, + bool> = true> +inline common_dtype fmod(T x, U y) { + return ::metal::fmod(x, y); +} + +// cast_to primitives +// - No-op if types as the same +template < + typename T, + typename U, + ::metal::enable_if_t<::metal::is_same_v, bool> = true> +inline T cast_to(const U from) { + return from; +} +// - Simple cast between scalar and complex dtypes +template < + typename T, + typename U, + ::metal::enable_if_t< + !::metal::is_same_v && (is_complex_v == is_complex_v), + bool> = true> +inline T cast_to(const U from) { + return static_cast(from); +} + +// - Scalar to complex +template < + typename T, + typename U, + ::metal::enable_if_t && !is_complex_v, bool> = true> +inline T cast_to(const U from) { + return T(float(from), 0.0); +} +// - Complex to scalar (should not really be used, but exists for compliteness) +template < + typename T, + typename U, + ::metal::enable_if_t && is_complex_v, bool> = true> +inline T cast_to(const U from) { + return static_cast(from.x); +} + +// Generalizable math operators (used for both scalar and complex) + +template < + typename T, + typename U, + ::metal::enable_if_t, bool> = true> +inline common_dtype mul(const T x, const U y) { + return x * y; +} + +template < + typename T, + typename U, + ::metal::enable_if_t && is_complex_v, bool> = true> +inline common_dtype mul(const T x, const U y) { + return T(x.x * y.x - x.y * y.y, x.x * y.y + x.y * y.x); +} + +template < + typename T, + 
typename U, + ::metal::enable_if_t, bool> = true> +inline common_dtype div(const T x, const U y) { + return x / y; +} + +template < + typename T, + typename U, + ::metal::enable_if_t && is_complex_v, bool> = true> +inline common_dtype div(const T x, const U y) { + return T(::metal::dot(x, y), x.y * y.x - x.x * y.y) / ::metal::dot(y, y); +} + +// Remainder operator +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_floating_point_v || is_scalar_floating_point_v, + bool> = true> +inline float remainder(const T x, const U y) { + const auto x_f = static_cast(x); + const auto y_f = static_cast(y); + return x_f - y_f * floor_divide(x_f, y_f); +} + +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_integral_v && is_scalar_integral_v, + bool> = true> +inline common_dtype remainder(const T x, const U y) { + auto rc = x % y; + return rc == 0 || (x ^ y) > 0 ? rc : rc + y; +} + +// Based on algorithm described in +// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202 +inline float log1p(float x) { + const auto xp1 = 1.0f + x; + // First two elements of Taylor series for log(1+x) in Horner's form are: + // log(1+x) = x * (1 - x * (.5 ...)), but if 1 + x == x, then it's just x + if (xp1 == 1.0f) { + return x; + } + auto rc = ::metal::precise::log(xp1); + if (x > -.5 && x < .5) { + // Order of operations is important here for higher precision + rc *= x / (xp1 - 1.0f); + } + return rc; +} + +// The function is ported from mlx +inline float2 log1p(float2 in) { + float x = in.x; + float y = in.y; + float zabs = ::metal::precise::sqrt(x * x + y * y); + float theta = ::metal::atan2(y, x + 1); + if (zabs < 0.5f) { + float r = x * (2 + x) + y * y; + if (r == 0) { // handle underflow + return {x, theta}; + } + return {0.5f * log1p(r), theta}; + } else { + auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y); + return {::metal::log(z0), theta}; + } +} + +template +struct pair { + T1 first; + T2 second; +}; + +template 
+inline T conj(T a) { + return a; +} + +template <> +inline half2 conj(half2 a) { + return half2(a.x, -a.y); +} + +template <> +inline float2 conj(float2 a) { + return float2(a.x, -a.y); +} + +#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \ + MACRO(float); \ + MACRO(half); \ + MACRO(bfloat); \ + MACRO(float2); \ + MACRO(long); \ + MACRO(char); \ + MACRO(uchar); \ + MACRO(short); \ + MACRO(int); + +#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \ + MACRO(float); \ + MACRO(half); \ + MACRO(bfloat); + +} // namespace metal +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUCachingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUCachingAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..ad6854b8871d9e55324bea686b1313f64c1f5883 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUCachingAllocator.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include + +/* + * CPUCachingAllocator: + * DISCLAIMER: + * This is subject to change (beta) and only supported on mobile builds. + * If code snippet such as in 'Usage pattern' is used outside of mobile + * build you will not observe the intended behavior. + * See below for more information. + * Why? + * It has been observed that some mobile platforms, such as pixel 3, return + * memory aggressively to the system. This results in page faults in some + * cases and ends up hurting performance. This caching allocator aims to address + * that. Furthermore it also allows users to specify their own allocator by + * implementing allocate/free virtual interfaces. 
What are the cons? There are + * some cons that were observed where use of caching allocator led to worse + * performance on some platforms. Reason being that the caching mechanism used + * by this allocator left us worse off compared to the corresponding platform's + * tuned memory allocator. In that case it seemed better to not use this + * allocator. Note there are some ideas to fix this in the works. + * + * Usage: + * Usage pattern: + * Instantiate and own the caching allocator. + * std::unique_ptr caching_allocator = + * std::make_unique(); + * Use caching allocator with a scoped guard at inference time. + * { + * WithCPUCachingAllocatorGuard(caching_allocator.get()); + * ... model.forward(...); + * } + */ + +namespace c10 { + +class C10_API CPUCachingAllocator { + /* + * What it does: + * Caches all the allocations carried out by this allocator. + * Cache key is the size of the allocation. + * If requested size is found in the cache returns the cached pointer. + * What it does not do: + * No speculative allocation for any future allocations. + */ + private: + inline void* allocate_and_cache(const size_t bytes); + void free_cached(); + + protected: + // Invariants. + // 1. If memory is ever allocated via this allocator then + // the pointer will exist in allocation_map_, unless the allocator + // returned the memory to OS via free_cached. + // 1.1. Therefore even when the said memory is "freed" via this + // allocator (and thus cached), it will continue to stay + // in allocation_map_. Furthermore it will also exist in + // available_map_. Thus an allocated memory pointer can be in both + // allocation_map_ and available_map_ simultaneously. + // 2. Memory pointer maybe removed from allocation_map_, when it + // is freed outside of the scope of this allocator, but was allocated + // by this allocator. + // 3. Available map only contains that memory which was allocated + // by this allocator and subsequently freed by this allocator. 
+ // As a result of above invariants, allocated memory ptr cannot be in + // available_map_ unless it is in allocation_map_ as well. + ska::flat_hash_map> available_map_; + static ska::flat_hash_map allocation_map_; + // Since allocation_map, which is a global instance, is mutated/read via + // all public APIs we need a global mutex. + static std::mutex mutex_; + + public: + static void record_free(void* ptr); + virtual ~CPUCachingAllocator(); + // Checks the cache to see if allocation of size bytes can be found. + // If so return cached memory, else + // allocates memory, records it for caching and returns. + virtual void* allocate(const size_t bytes); + // Checks if the memory being freed is was marked for allocation by + // an earlier call to allocate. If so cache the allocation. + // Otherwise free. + virtual void free(void* ptr); +}; + +CPUCachingAllocator* GetDefaultCPUCachingAllocator(); + +bool ThreadLocalCachingAllocatorEnabled(); +CPUCachingAllocator* GetThreadLocalCachingAllocator(); + +class C10_API WithCPUCachingAllocatorGuard { + public: + WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator); + ~WithCPUCachingAllocatorGuard(); + + private: + CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr}; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUProfilingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUProfilingAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..07064210e115bb5799906828fac135ccb63a3146 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUProfilingAllocator.h @@ -0,0 +1,157 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/* + * Given a sequence of allocations in a thread, AllocationPlan records + * 1. size of each allocation + * 2. Lifetime of each allocation. + * 3. allocation offsets: Memory offset for each allocation in a single blob of + * memory + * 4. Total size of a blob of memory required to satisfy all the allocations. + */ +class C10_API AllocationPlan { + private: + // Records size of each allocation by their sequential allocation ids. + std::vector allocation_sizes; + // This maps one allocation id (X) to another allocation id (Y). + // Allocation X is alive until allocation Y. From allocation Y onwards + // allocation X is not referenced. + // Thus Y is the id of the first allocation after X is freed. + // NB: When an allocation is recorded, along with recording its size, + // we also set the lifetime to be numeric_limits::max() + // This is to track allocations that are made during the scope of + // profiling but were not freed until after the scope ended. + // Such allocations are not managed by profiling allocator. + std::vector allocation_lifetimes; + // Maps an allocation to some offset in a blob of memory. 
+ std::vector allocation_offsets; + uint64_t total_size{0}; + void clear(); + friend class AllocationPlanner; + friend class CPUProfilingAllocator; +}; + +/* + * Map of memory ptr to allocation id. This is auxiliary information only + * used to establish lifetime of allocations. + */ +class C10_API AllocationPlanner { + private: + AllocationPlan* allocation_plan_{nullptr}; + // Maps allocated ptr to its allocation id. + // This is used when freeing the memory to look up the allocation id + // in order to establish the lifetime of a particular allocation. + ska::flat_hash_map allocation_ptr_to_id_; + uint64_t allocation_id_{0}; + bool validation_mode_{false}; + + bool validate_allocation(const uint64_t size, const void* ptr); + bool validate_free(const void* ptr); + + public: + bool validation_success{true}; + + AllocationPlanner() = delete; + AllocationPlanner(AllocationPlan* plan, bool validate = false) + : allocation_plan_(plan), validation_mode_(validate) {} + void record_allocation(const uint64_t size, const void* ptr); + void record_free(const void* ptr); + void formulate_plan(); + void clear(); +}; + +// NOT THREAD SAFE profiling allocator. +class C10_API CPUProfilingAllocator { + private: + const AllocationPlan* plan_{nullptr}; + uint64_t allocation_id_{0}; + uint64_t current_size_{0}; + void* blob_{nullptr}; + ska::flat_hash_map allocation_ptr_to_id_; + + public: + ~CPUProfilingAllocator(); + void set_plan(const AllocationPlan* plan); + void unset_plan(); + void* allocate(const size_t bytes); + void free(void* const ptr); +}; + +/* + * Usage: Profile allocations made by one run of the model. + * AllocationPlan plan; + * { + * WithProfileAllocationGuard profile_guard(&plan); + * module.forward(...); + * } + * plan now contains allocation plan. 
+ */ +class C10_API WithProfileAllocationsGuard { + public: + WithProfileAllocationsGuard(AllocationPlan* plan); + ~WithProfileAllocationsGuard(); + + private: + std::unique_ptr planner_; +}; + +/* + * Usage: Validate allocation plan made with WithProfileAllocationGuard + * bool plan_validation_success, success = true; + * for (some number of representative inputs) + * { + * WithValidateAllocationPlanGuard(&plan, &plan_validation_success); + * module.forward(...); + * success = success && plan_validation_success; + * } + * success == true means allocations are according to plan + * else for some inputs allocation pattern changed. + */ +class C10_API WithValidateAllocationPlanGuard { + public: + WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success); + ~WithValidateAllocationPlanGuard(); + + private: + std::unique_ptr planner_; + bool* success_; +}; + +AllocationPlanner* GetThreadLocalAllocationPlanner(); + +/* + * Usage: Allocate tensors accordingly to allocation plan + * First make allocation plan. + * See WithProfileAllocationsGuard usage. + * Second validate allocation plan. + * See WithValidateAllocationPlanGuard usage. + * CPUProfilingAllocator profiling_allocator; + * { + * WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan); + * module.forward(...); + * } + */ +class C10_API WithProfilingAllocatorGuard { + public: + WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, + const AllocationPlan* plan); + ~WithProfilingAllocatorGuard(); +}; + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator(); + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/Macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/Macros.h new file mode 100644 index 0000000000000000000000000000000000000000..026570edcd7f2be024266f65b5745a65036bbeed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/Macros.h @@ -0,0 +1,14 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef C10_TEST_CORE_MACROS_MACROS_H_ + +#ifdef _WIN32 +#define DISABLED_ON_WINDOWS(x) DISABLED_##x +#else +#define DISABLED_ON_WINDOWS(x) x +#endif + +#endif // C10_MACROS_MACROS_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_math_test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_math_test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..a68a35cd968a95ef35b61b92594837fcbdbf79a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_math_test_common.h @@ -0,0 +1,672 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Warning: this file is included twice in +// aten/src/ATen/test/cuda_complex_math_test.cu + +#include +#include + +#ifndef PI +#define PI 3.141592653589793238463 +#endif + +#ifndef tol +#define tol 1e-6 +#endif + +// Exponential functions + +C10_DEFINE_TEST(TestExponential, IPi) { + // exp(i*pi) = -1 + { + c10::complex e_i_pi = std::exp(c10::complex(0, float(PI))); + C10_ASSERT_NEAR(e_i_pi.real(), -1, tol); + C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol); + } + 
{ + c10::complex e_i_pi = ::exp(c10::complex(0, float(PI))); + C10_ASSERT_NEAR(e_i_pi.real(), -1, tol); + C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol); + } + { + c10::complex e_i_pi = std::exp(c10::complex(0, PI)); + C10_ASSERT_NEAR(e_i_pi.real(), -1, tol); + C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol); + } + { + c10::complex e_i_pi = ::exp(c10::complex(0, PI)); + C10_ASSERT_NEAR(e_i_pi.real(), -1, tol); + C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol); + } +} + +C10_DEFINE_TEST(TestExponential, EulerFormula) { + // exp(ix) = cos(x) + i * sin(x) + { + c10::complex x(0.1, 1.2); + c10::complex e = std::exp(x); + float expected_real = std::exp(x.real()) * std::cos(x.imag()); + float expected_imag = std::exp(x.real()) * std::sin(x.imag()); + C10_ASSERT_NEAR(e.real(), expected_real, tol); + C10_ASSERT_NEAR(e.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex e = ::exp(x); + float expected_real = ::exp(x.real()) * ::cos(x.imag()); + float expected_imag = ::exp(x.real()) * ::sin(x.imag()); + C10_ASSERT_NEAR(e.real(), expected_real, tol); + C10_ASSERT_NEAR(e.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex e = std::exp(x); + float expected_real = std::exp(x.real()) * std::cos(x.imag()); + float expected_imag = std::exp(x.real()) * std::sin(x.imag()); + C10_ASSERT_NEAR(e.real(), expected_real, tol); + C10_ASSERT_NEAR(e.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex e = ::exp(x); + float expected_real = ::exp(x.real()) * ::cos(x.imag()); + float expected_imag = ::exp(x.real()) * ::sin(x.imag()); + C10_ASSERT_NEAR(e.real(), expected_real, tol); + C10_ASSERT_NEAR(e.imag(), expected_imag, tol); + } +} + +C10_DEFINE_TEST(TestExpm1, Normal) { + // expm1(x) = exp(x) - 1 + { + c10::complex x(0.1, 1.2); + c10::complex l1 = std::expm1(x); + c10::complex l2 = std::exp(x) - 1.0f; + C10_ASSERT_NEAR(l1.real(), l2.real(), tol); + C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + 
c10::complex l1 = std::expm1(x); + c10::complex l2 = std::exp(x) - 1.0; + C10_ASSERT_NEAR(l1.real(), l2.real(), tol); + C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol); + } +} + +C10_DEFINE_TEST(TestExpm1, Small) { + // expm1(x) = exp(x) - 1 + // expm1(x) provides greater precision than exp(x) - 1 for small values of x + { + c10::complex x(1e-30, 1e-30); + c10::complex l1 = std::expm1(x); + C10_ASSERT_NEAR(l1.real(), 1e-30, tol); + C10_ASSERT_NEAR(l1.imag(), 1e-30, tol); + } + { + c10::complex x(1e-100, 1e-100); + c10::complex l1 = std::expm1(x); + C10_ASSERT_NEAR(l1.real(), 1e-30, tol); + C10_ASSERT_NEAR(l1.imag(), 1e-30, tol); + } +} + +C10_DEFINE_TEST(TestLog, Definition) { + // log(x) = log(r) + i*theta + { + c10::complex x(1.2, 3.4); + c10::complex l = std::log(x); + float expected_real = std::log(std::abs(x)); + float expected_imag = std::arg(x); + C10_ASSERT_NEAR(l.real(), expected_real, tol); + C10_ASSERT_NEAR(l.imag(), expected_imag, tol); + } + { + c10::complex x(1.2, 3.4); + c10::complex l = ::log(x); + float expected_real = ::log(std::abs(x)); + float expected_imag = std::arg(x); + C10_ASSERT_NEAR(l.real(), expected_real, tol); + C10_ASSERT_NEAR(l.imag(), expected_imag, tol); + } + { + c10::complex x(1.2, 3.4); + c10::complex l = std::log(x); + float expected_real = std::log(std::abs(x)); + float expected_imag = std::arg(x); + C10_ASSERT_NEAR(l.real(), expected_real, tol); + C10_ASSERT_NEAR(l.imag(), expected_imag, tol); + } + { + c10::complex x(1.2, 3.4); + c10::complex l = ::log(x); + float expected_real = ::log(std::abs(x)); + float expected_imag = std::arg(x); + C10_ASSERT_NEAR(l.real(), expected_real, tol); + C10_ASSERT_NEAR(l.imag(), expected_imag, tol); + } +} + +C10_DEFINE_TEST(TestLog10, Rev) { + // log10(10^x) = x + { + c10::complex x(0.1, 1.2); + c10::complex l = std::log10(std::pow(float(10), x)); + C10_ASSERT_NEAR(l.real(), float(0.1), tol); + C10_ASSERT_NEAR(l.imag(), float(1.2), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l = 
::log10(::pow(float(10), x)); + C10_ASSERT_NEAR(l.real(), float(0.1), tol); + C10_ASSERT_NEAR(l.imag(), float(1.2), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l = std::log10(std::pow(double(10), x)); + C10_ASSERT_NEAR(l.real(), double(0.1), tol); + C10_ASSERT_NEAR(l.imag(), double(1.2), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l = ::log10(::pow(double(10), x)); + C10_ASSERT_NEAR(l.real(), double(0.1), tol); + C10_ASSERT_NEAR(l.imag(), double(1.2), tol); + } +} + +C10_DEFINE_TEST(TestLog2, Rev) { + // log2(2^x) = x + { + c10::complex x(0.1, 1.2); + c10::complex l = std::log2(std::pow(float(2), x)); + C10_ASSERT_NEAR(l.real(), float(0.1), tol); + C10_ASSERT_NEAR(l.imag(), float(1.2), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l = ::log2(std::pow(float(2), x)); + C10_ASSERT_NEAR(l.real(), float(0.1), tol); + C10_ASSERT_NEAR(l.imag(), float(1.2), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l = std::log2(std::pow(double(2), x)); + C10_ASSERT_NEAR(l.real(), double(0.1), tol); + C10_ASSERT_NEAR(l.imag(), double(1.2), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l = ::log2(std::pow(double(2), x)); + C10_ASSERT_NEAR(l.real(), double(0.1), tol); + C10_ASSERT_NEAR(l.imag(), double(1.2), tol); + } +} + +C10_DEFINE_TEST(TestLog1p, Normal) { + // log1p(x) = log(1 + x) + { + c10::complex x(0.1, 1.2); + c10::complex l1 = std::log1p(x); + c10::complex l2 = std::log(1.0f + x); + C10_ASSERT_NEAR(l1.real(), l2.real(), tol); + C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex l1 = std::log1p(x); + c10::complex l2 = std::log(1.0 + x); + C10_ASSERT_NEAR(l1.real(), l2.real(), tol); + C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol); + } +} + +C10_DEFINE_TEST(TestLog1p, Small) { + // log(1 + x) ~ x for |x| << 1 + { + c10::complex x(1e-9, 2e-9); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real() / x.real(), 1, tol); + C10_ASSERT_NEAR(l.imag() / x.imag(), 1, tol); + } + { + 
c10::complex x(1e-100, 2e-100); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real() / x.real(), 1, tol); + C10_ASSERT_NEAR(l.imag() / x.imag(), 1, tol); + } +} + +C10_DEFINE_TEST(TestLog1p, Extreme) { + // log(1 + x) ~ x for |x| << 1 and in the brink of overflow / underflow + { + c10::complex x(-1, 1e-30); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), -69.07755278982137, tol); + C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol); + } + { + c10::complex x(-1, 1e30); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 69.07755278982137, tol); + C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol); + } + { + c10::complex x(1e30, 1); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 69.07755278982137, tol); + C10_ASSERT_NEAR(l.imag(), 1e-30, tol); + } + { + c10::complex x(1e-30, 1); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 0.34657359027997264, tol); + C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol); + } + { + c10::complex x(1e30, 1e30); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 69.42412638010134, tol); + C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol); + } + { + c10::complex x(1e-38, 1e-38); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 1e-38, tol); + C10_ASSERT_NEAR(l.imag(), 1e-38, tol); + } + { + c10::complex x(1e-38, 2e-30); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 1e-30, tol); + C10_ASSERT_NEAR(l.imag(), 2e-30, tol); + } + { + c10::complex x(-1, 1e-250); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), -575.6462732485114, tol); + C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol); + } + { + c10::complex x(-1, 1e250); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 575.6462732485114, tol); + C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol); + } + { + c10::complex x(1e250, 1); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 575.6462732485114, tol); + C10_ASSERT_NEAR(l.imag(), 1e-250, tol); + } + { 
+ c10::complex x(1e-250, 1); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 0.34657359027997264, tol); + C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol); + } + { + c10::complex x(1e250, 1e250); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 575.9928468387914, tol); + C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol); + } + { + c10::complex x(1e-250, 1e-250); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 1e-250, tol); + C10_ASSERT_NEAR(l.imag(), 1e-250, tol); + } + { + c10::complex x(1e-250, 2e-250); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 1e-250, tol); + C10_ASSERT_NEAR(l.imag(), 2e-250, tol); + } + { + c10::complex x(2e-308, 1.5e-250); + c10::complex l = std::log1p(x); + C10_ASSERT_NEAR(l.real(), 2e-308, tol); + C10_ASSERT_NEAR(l.imag(), 1.5e-308, tol); + } +} + +// Power functions + +C10_DEFINE_TEST(TestPowSqrt, Equal) { + // x^0.5 = sqrt(x) + { + c10::complex x(0.1, 1.2); + c10::complex y = std::pow(x, float(0.5)); + c10::complex z = std::sqrt(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::pow(x, float(0.5)); + c10::complex z = ::sqrt(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = std::pow(x, double(0.5)); + c10::complex z = std::sqrt(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::pow(x, double(0.5)); + c10::complex z = ::sqrt(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } +} + +C10_DEFINE_TEST(TestPow, Square) { + // x^2 = x * x + { + c10::complex x(0.1, 1.2); + c10::complex y = std::pow(x, float(2)); + c10::complex z = x * x; + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 
1.2); + c10::complex y = ::pow(x, float(2)); + c10::complex z = x * x; + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = std::pow(x, double(2)); + c10::complex z = x * x; + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::pow(x, double(2)); + c10::complex z = x * x; + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } +} + +// Trigonometric functions and hyperbolic functions + +C10_DEFINE_TEST(TestSinCosSinhCosh, Identity) { + // sin(x + i * y) = sin(x) * cosh(y) + i * cos(x) * sinh(y) + // cos(x + i * y) = cos(x) * cosh(y) - i * sin(x) * sinh(y) + { + c10::complex x(0.1, 1.2); + c10::complex y = std::sin(x); + float expected_real = std::sin(x.real()) * std::cosh(x.imag()); + float expected_imag = std::cos(x.real()) * std::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::sin(x); + float expected_real = ::sin(x.real()) * ::cosh(x.imag()); + float expected_imag = ::cos(x.real()) * ::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = std::cos(x); + float expected_real = std::cos(x.real()) * std::cosh(x.imag()); + float expected_imag = -std::sin(x.real()) * std::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::cos(x); + float expected_real = ::cos(x.real()) * ::cosh(x.imag()); + float expected_imag = -::sin(x.real()) * ::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = 
std::sin(x); + float expected_real = std::sin(x.real()) * std::cosh(x.imag()); + float expected_imag = std::cos(x.real()) * std::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::sin(x); + float expected_real = ::sin(x.real()) * ::cosh(x.imag()); + float expected_imag = ::cos(x.real()) * ::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = std::cos(x); + float expected_real = std::cos(x.real()) * std::cosh(x.imag()); + float expected_imag = -std::sin(x.real()) * std::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::cos(x); + float expected_real = ::cos(x.real()) * ::cosh(x.imag()); + float expected_imag = -::sin(x.real()) * ::sinh(x.imag()); + C10_ASSERT_NEAR(y.real(), expected_real, tol); + C10_ASSERT_NEAR(y.imag(), expected_imag, tol); + } +} + +C10_DEFINE_TEST(TestTan, Identity) { + // tan(x) = sin(x) / cos(x) + { + c10::complex x(0.1, 1.2); + c10::complex y = std::tan(x); + c10::complex z = std::sin(x) / std::cos(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::tan(x); + c10::complex z = ::sin(x) / ::cos(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = std::tan(x); + c10::complex z = std::sin(x) / std::cos(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::tan(x); + c10::complex z = ::sin(x) / ::cos(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } +} + 
+C10_DEFINE_TEST(TestTanh, Identity) { + // tanh(x) = sinh(x) / cosh(x) + { + c10::complex x(0.1, 1.2); + c10::complex y = std::tanh(x); + c10::complex z = std::sinh(x) / std::cosh(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::tanh(x); + c10::complex z = ::sinh(x) / ::cosh(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = std::tanh(x); + c10::complex z = std::sinh(x) / std::cosh(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } + { + c10::complex x(0.1, 1.2); + c10::complex y = ::tanh(x); + c10::complex z = ::sinh(x) / ::cosh(x); + C10_ASSERT_NEAR(y.real(), z.real(), tol); + C10_ASSERT_NEAR(y.imag(), z.imag(), tol); + } +} + +// Rev trigonometric functions + +C10_DEFINE_TEST(TestRevTrigonometric, Rev) { + // asin(sin(x)) = x + // acos(cos(x)) = x + // atan(tan(x)) = x + { + c10::complex x(0.5, 0.6); + c10::complex s = std::sin(x); + c10::complex ss = std::asin(s); + c10::complex c = std::cos(x); + c10::complex cc = std::acos(c); + c10::complex t = std::tan(x); + c10::complex tt = std::atan(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } + { + c10::complex x(0.5, 0.6); + c10::complex s = ::sin(x); + c10::complex ss = ::asin(s); + c10::complex c = ::cos(x); + c10::complex cc = ::acos(c); + c10::complex t = ::tan(x); + c10::complex tt = ::atan(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), 
tt.imag(), tol); + } + { + c10::complex x(0.5, 0.6); + c10::complex s = std::sin(x); + c10::complex ss = std::asin(s); + c10::complex c = std::cos(x); + c10::complex cc = std::acos(c); + c10::complex t = std::tan(x); + c10::complex tt = std::atan(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } + { + c10::complex x(0.5, 0.6); + c10::complex s = ::sin(x); + c10::complex ss = ::asin(s); + c10::complex c = ::cos(x); + c10::complex cc = ::acos(c); + c10::complex t = ::tan(x); + c10::complex tt = ::atan(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } +} + +// Rev hyperbolic functions + +C10_DEFINE_TEST(TestRevHyperbolic, Rev) { + // asinh(sinh(x)) = x + // acosh(cosh(x)) = x + // atanh(tanh(x)) = x + { + c10::complex x(0.5, 0.6); + c10::complex s = std::sinh(x); + c10::complex ss = std::asinh(s); + c10::complex c = std::cosh(x); + c10::complex cc = std::acosh(c); + c10::complex t = std::tanh(x); + c10::complex tt = std::atanh(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } + { + c10::complex x(0.5, 0.6); + c10::complex s = ::sinh(x); + c10::complex ss = ::asinh(s); + c10::complex c = ::cosh(x); + c10::complex cc = ::acosh(c); + c10::complex t = ::tanh(x); + c10::complex tt = ::atanh(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + 
C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } + { + c10::complex x(0.5, 0.6); + c10::complex s = std::sinh(x); + c10::complex ss = std::asinh(s); + c10::complex c = std::cosh(x); + c10::complex cc = std::acosh(c); + c10::complex t = std::tanh(x); + c10::complex tt = std::atanh(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } + { + c10::complex x(0.5, 0.6); + c10::complex s = ::sinh(x); + c10::complex ss = ::asinh(s); + c10::complex c = ::cosh(x); + c10::complex cc = ::acosh(c); + c10::complex t = ::tanh(x); + c10::complex tt = ::atanh(t); + C10_ASSERT_NEAR(x.real(), ss.real(), tol); + C10_ASSERT_NEAR(x.imag(), ss.imag(), tol); + C10_ASSERT_NEAR(x.real(), cc.real(), tol); + C10_ASSERT_NEAR(x.imag(), cc.imag(), tol); + C10_ASSERT_NEAR(x.real(), tt.real(), tol); + C10_ASSERT_NEAR(x.imag(), tt.imag(), tol); + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..94586ba1293ac4c922d6638817ce7a92b14d83b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_test_common.h @@ -0,0 +1,663 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define MAYBE_GLOBAL __global__ +#else +#define MAYBE_GLOBAL +#endif + +#define PI 3.141592653589793238463 + +namespace memory { + +MAYBE_GLOBAL void test_size() { + static_assert(sizeof(c10::complex) == 2 * sizeof(float), ""); + static_assert(sizeof(c10::complex) == 2 * sizeof(double), ""); +} + +MAYBE_GLOBAL void test_align() { + static_assert(alignof(c10::complex) == 2 * sizeof(float), ""); + static_assert(alignof(c10::complex) == 2 * sizeof(double), ""); +} + +MAYBE_GLOBAL void test_pod() { + static_assert(std::is_standard_layout>::value, ""); + static_assert(std::is_standard_layout>::value, ""); +} + +TEST(TestMemory, ReinterpretCast) { + { + std::complex z(1, 2); + c10::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), float(1)); + ASSERT_EQ(zz.imag(), float(2)); + } + + { + c10::complex z(3, 4); + std::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), float(3)); + ASSERT_EQ(zz.imag(), float(4)); + } + + { + std::complex z(1, 2); + c10::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), double(1)); + ASSERT_EQ(zz.imag(), double(2)); + } + + { + c10::complex z(3, 4); + std::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), double(3)); + ASSERT_EQ(zz.imag(), double(4)); + 
} +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +TEST(TestMemory, ThrustReinterpretCast) { + { + thrust::complex z(1, 2); + c10::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), float(1)); + ASSERT_EQ(zz.imag(), float(2)); + } + + { + c10::complex z(3, 4); + thrust::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), float(3)); + ASSERT_EQ(zz.imag(), float(4)); + } + + { + thrust::complex z(1, 2); + c10::complex zz = *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), double(1)); + ASSERT_EQ(zz.imag(), double(2)); + } + + { + c10::complex z(3, 4); + thrust::complex zz = + *reinterpret_cast*>(&z); + ASSERT_EQ(zz.real(), double(3)); + ASSERT_EQ(zz.imag(), double(4)); + } +} +#endif + +} // namespace memory + +namespace constructors { + +template +C10_HOST_DEVICE void test_construct_from_scalar() { + constexpr scalar_t num1 = scalar_t(1.23); + constexpr scalar_t num2 = scalar_t(4.56); + constexpr scalar_t zero = scalar_t(); + static_assert(c10::complex(num1, num2).real() == num1, ""); + static_assert(c10::complex(num1, num2).imag() == num2, ""); + static_assert(c10::complex(num1).real() == num1, ""); + static_assert(c10::complex(num1).imag() == zero, ""); + static_assert(c10::complex().real() == zero, ""); + static_assert(c10::complex().imag() == zero, ""); +} + +template +C10_HOST_DEVICE void test_construct_from_other() { + constexpr other_t num1 = other_t(1.23); + constexpr other_t num2 = other_t(4.56); + constexpr scalar_t num3 = scalar_t(num1); + constexpr scalar_t num4 = scalar_t(num2); + static_assert( + c10::complex(c10::complex(num1, num2)).real() == num3, + ""); + static_assert( + c10::complex(c10::complex(num1, num2)).imag() == num4, + ""); +} + +MAYBE_GLOBAL void test_convert_constructors() { + test_construct_from_scalar(); + test_construct_from_scalar(); + + static_assert( + std::is_convertible, c10::complex>::value, ""); + static_assert( + !std::is_convertible, c10::complex>::value, + ""); + static_assert( + std::is_convertible, 
c10::complex>::value, + ""); + static_assert( + std::is_convertible, c10::complex>::value, + ""); + + static_assert( + std::is_constructible, c10::complex>::value, + ""); + static_assert( + std::is_constructible, c10::complex>::value, + ""); + static_assert( + std::is_constructible, c10::complex>::value, + ""); + static_assert( + std::is_constructible, c10::complex>::value, + ""); + + test_construct_from_other(); + test_construct_from_other(); + test_construct_from_other(); + test_construct_from_other(); +} + +template +C10_HOST_DEVICE void test_construct_from_std() { + constexpr scalar_t num1 = scalar_t(1.23); + constexpr scalar_t num2 = scalar_t(4.56); + static_assert( + c10::complex(std::complex(num1, num2)).real() == num1, + ""); + static_assert( + c10::complex(std::complex(num1, num2)).imag() == num2, + ""); +} + +MAYBE_GLOBAL void test_std_conversion() { + test_construct_from_std(); + test_construct_from_std(); +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +template +void test_construct_from_thrust() { + constexpr scalar_t num1 = scalar_t(1.23); + constexpr scalar_t num2 = scalar_t(4.56); + ASSERT_EQ( + c10::complex(thrust::complex(num1, num2)).real(), + num1); + ASSERT_EQ( + c10::complex(thrust::complex(num1, num2)).imag(), + num2); +} + +TEST(TestConstructors, FromThrust) { + test_construct_from_thrust(); + test_construct_from_thrust(); +} +#endif + +TEST(TestConstructors, UnorderedMap) { + std::unordered_map< + c10::complex, + c10::complex, + c10::hash>> + m; + auto key1 = c10::complex(2.5, 3); + auto key2 = c10::complex(2, 0); + auto val1 = c10::complex(2, -3.2); + auto val2 = c10::complex(0, -3); + m[key1] = val1; + m[key2] = val2; + ASSERT_EQ(m[key1], val1); + ASSERT_EQ(m[key2], val2); +} + +} // namespace constructors + +namespace assignment { + +template +constexpr c10::complex one() { + c10::complex result(3, 4); + result = scalar_t(1); + return result; +} + +MAYBE_GLOBAL void test_assign_real() { + static_assert(one().real() == float(1), ""); 
+ static_assert(one().imag() == float(), ""); + static_assert(one().real() == double(1), ""); + static_assert(one().imag() == double(), ""); +} + +constexpr std::tuple, c10::complex> one_two() { + constexpr c10::complex src(1, 2); + c10::complex ret0; + c10::complex ret1; + ret0 = ret1 = src; + return std::make_tuple(ret0, ret1); +} + +MAYBE_GLOBAL void test_assign_other() { + constexpr auto tup = one_two(); + static_assert(std::get>(tup).real() == double(1), ""); + static_assert(std::get>(tup).imag() == double(2), ""); + static_assert(std::get>(tup).real() == float(1), ""); + static_assert(std::get>(tup).imag() == float(2), ""); +} + +constexpr std::tuple, c10::complex> one_two_std() { + constexpr std::complex src(1, 1); + c10::complex ret0; + c10::complex ret1; + ret0 = ret1 = src; + return std::make_tuple(ret0, ret1); +} + +MAYBE_GLOBAL void test_assign_std() { + constexpr auto tup = one_two(); + static_assert(std::get>(tup).real() == double(1), ""); + static_assert(std::get>(tup).imag() == double(2), ""); + static_assert(std::get>(tup).real() == float(1), ""); + static_assert(std::get>(tup).imag() == float(2), ""); +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +C10_HOST_DEVICE std::tuple, c10::complex> +one_two_thrust() { + thrust::complex src(1, 2); + c10::complex ret0; + c10::complex ret1; + ret0 = ret1 = src; + return std::make_tuple(ret0, ret1); +} + +TEST(TestAssignment, FromThrust) { + auto tup = one_two_thrust(); + ASSERT_EQ(std::get>(tup).real(), double(1)); + ASSERT_EQ(std::get>(tup).imag(), double(2)); + ASSERT_EQ(std::get>(tup).real(), float(1)); + ASSERT_EQ(std::get>(tup).imag(), float(2)); +} +#endif + +} // namespace assignment + +namespace literals { + +MAYBE_GLOBAL void test_complex_literals() { + using namespace c10::complex_literals; + static_assert(std::is_same>::value, ""); + static_assert((0.5_if).real() == float(), ""); + static_assert((0.5_if).imag() == float(0.5), ""); + static_assert( + std::is_same>::value, ""); + 
static_assert((0.5_id).real() == float(), ""); + static_assert((0.5_id).imag() == float(0.5), ""); + + static_assert(std::is_same>::value, ""); + static_assert((1_if).real() == float(), ""); + static_assert((1_if).imag() == float(1), ""); + static_assert(std::is_same>::value, ""); + static_assert((1_id).real() == double(), ""); + static_assert((1_id).imag() == double(1), ""); +} + +} // namespace literals + +namespace real_imag { + +template +constexpr c10::complex zero_one() { + c10::complex result; + result.imag(scalar_t(1)); + return result; +} + +template +constexpr c10::complex one_zero() { + c10::complex result; + result.real(scalar_t(1)); + return result; +} + +MAYBE_GLOBAL void test_real_imag_modify() { + static_assert(zero_one().real() == float(0), ""); + static_assert(zero_one().imag() == float(1), ""); + static_assert(zero_one().real() == double(0), ""); + static_assert(zero_one().imag() == double(1), ""); + + static_assert(one_zero().real() == float(1), ""); + static_assert(one_zero().imag() == float(0), ""); + static_assert(one_zero().real() == double(1), ""); + static_assert(one_zero().imag() == double(0), ""); +} + +} // namespace real_imag + +namespace arithmetic_assign { + +template +constexpr c10::complex p(scalar_t value) { + c10::complex result(scalar_t(2), scalar_t(2)); + result += value; + return result; +} + +template +constexpr c10::complex m(scalar_t value) { + c10::complex result(scalar_t(2), scalar_t(2)); + result -= value; + return result; +} + +template +constexpr c10::complex t(scalar_t value) { + c10::complex result(scalar_t(2), scalar_t(2)); + result *= value; + return result; +} + +template +constexpr c10::complex d(scalar_t value) { + c10::complex result(scalar_t(2), scalar_t(2)); + result /= value; + return result; +} + +template +C10_HOST_DEVICE void test_arithmetic_assign_scalar() { + constexpr c10::complex x = p(scalar_t(1)); + static_assert(x.real() == scalar_t(3), ""); + static_assert(x.imag() == scalar_t(2), ""); + constexpr 
c10::complex y = m(scalar_t(1)); + static_assert(y.real() == scalar_t(1), ""); + static_assert(y.imag() == scalar_t(2), ""); + constexpr c10::complex z = t(scalar_t(2)); + static_assert(z.real() == scalar_t(4), ""); + static_assert(z.imag() == scalar_t(4), ""); + constexpr c10::complex t = d(scalar_t(2)); + static_assert(t.real() == scalar_t(1), ""); + static_assert(t.imag() == scalar_t(1), ""); +} + +template +constexpr c10::complex p( + scalar_t real, + scalar_t imag, + c10::complex rhs) { + c10::complex result(real, imag); + result += rhs; + return result; +} + +template +constexpr c10::complex m( + scalar_t real, + scalar_t imag, + c10::complex rhs) { + c10::complex result(real, imag); + result -= rhs; + return result; +} + +template +constexpr c10::complex t( + scalar_t real, + scalar_t imag, + c10::complex rhs) { + c10::complex result(real, imag); + result *= rhs; + return result; +} + +template +constexpr c10::complex d( + scalar_t real, + scalar_t imag, + c10::complex rhs) { + c10::complex result(real, imag); + result /= rhs; + return result; +} + +template +C10_HOST_DEVICE void test_arithmetic_assign_complex() { + using namespace c10::complex_literals; + constexpr c10::complex x2 = p(scalar_t(2), scalar_t(2), 1.0_if); + static_assert(x2.real() == scalar_t(2), ""); + static_assert(x2.imag() == scalar_t(3), ""); + constexpr c10::complex x3 = p(scalar_t(2), scalar_t(2), 1.0_id); + static_assert(x3.real() == scalar_t(2), ""); + + // this test is skipped due to a bug in constexpr evaluation + // in nvcc. 
This bug has already been fixed since CUDA 11.2 +#if !defined(__CUDACC__) || (defined(CUDA_VERSION) && CUDA_VERSION >= 11020) + static_assert(x3.imag() == scalar_t(3), ""); +#endif + + constexpr c10::complex y2 = m(scalar_t(2), scalar_t(2), 1.0_if); + static_assert(y2.real() == scalar_t(2), ""); + static_assert(y2.imag() == scalar_t(1), ""); + constexpr c10::complex y3 = m(scalar_t(2), scalar_t(2), 1.0_id); + static_assert(y3.real() == scalar_t(2), ""); + + // this test is skipped due to a bug in constexpr evaluation + // in nvcc. This bug has already been fixed since CUDA 11.2 +#if !defined(__CUDACC__) || (defined(CUDA_VERSION) && CUDA_VERSION >= 11020) + static_assert(y3.imag() == scalar_t(1), ""); +#endif + + constexpr c10::complex z2 = t(scalar_t(1), scalar_t(-2), 1.0_if); + static_assert(z2.real() == scalar_t(2), ""); + static_assert(z2.imag() == scalar_t(1), ""); + constexpr c10::complex z3 = t(scalar_t(1), scalar_t(-2), 1.0_id); + static_assert(z3.real() == scalar_t(2), ""); + static_assert(z3.imag() == scalar_t(1), ""); + + constexpr c10::complex t2 = d(scalar_t(-1), scalar_t(2), 1.0_if); + static_assert(t2.real() == scalar_t(2), ""); + static_assert(t2.imag() == scalar_t(1), ""); + constexpr c10::complex t3 = d(scalar_t(-1), scalar_t(2), 1.0_id); + static_assert(t3.real() == scalar_t(2), ""); + static_assert(t3.imag() == scalar_t(1), ""); +} + +MAYBE_GLOBAL void test_arithmetic_assign() { + test_arithmetic_assign_scalar(); + test_arithmetic_assign_scalar(); + test_arithmetic_assign_complex(); + test_arithmetic_assign_complex(); +} + +} // namespace arithmetic_assign + +namespace arithmetic { + +template +C10_HOST_DEVICE void test_arithmetic_() { + static_assert( + c10::complex(1, 2) == +c10::complex(1, 2), ""); + static_assert( + c10::complex(-1, -2) == -c10::complex(1, 2), ""); + + static_assert( + c10::complex(1, 2) + c10::complex(3, 4) == + c10::complex(4, 6), + ""); + static_assert( + c10::complex(1, 2) + scalar_t(3) == + c10::complex(4, 2), + ""); + 
static_assert( + scalar_t(3) + c10::complex(1, 2) == + c10::complex(4, 2), + ""); + + static_assert( + c10::complex(1, 2) - c10::complex(3, 4) == + c10::complex(-2, -2), + ""); + static_assert( + c10::complex(1, 2) - scalar_t(3) == + c10::complex(-2, 2), + ""); + static_assert( + scalar_t(3) - c10::complex(1, 2) == + c10::complex(2, -2), + ""); + + static_assert( + c10::complex(1, 2) * c10::complex(3, 4) == + c10::complex(-5, 10), + ""); + static_assert( + c10::complex(1, 2) * scalar_t(3) == + c10::complex(3, 6), + ""); + static_assert( + scalar_t(3) * c10::complex(1, 2) == + c10::complex(3, 6), + ""); + + static_assert( + c10::complex(-5, 10) / c10::complex(3, 4) == + c10::complex(1, 2), + ""); + static_assert( + c10::complex(5, 10) / scalar_t(5) == + c10::complex(1, 2), + ""); + static_assert( + scalar_t(25) / c10::complex(3, 4) == + c10::complex(3, -4), + ""); +} + +MAYBE_GLOBAL void test_arithmetic() { + test_arithmetic_(); + test_arithmetic_(); +} + +template +void test_binary_ops_for_int_type_(T real, T img, int_t num) { + c10::complex c(real, img); + ASSERT_EQ(c + num, c10::complex(real + num, img)); + ASSERT_EQ(num + c, c10::complex(num + real, img)); + ASSERT_EQ(c - num, c10::complex(real - num, img)); + ASSERT_EQ(num - c, c10::complex(num - real, -img)); + ASSERT_EQ(c * num, c10::complex(real * num, img * num)); + ASSERT_EQ(num * c, c10::complex(num * real, num * img)); + ASSERT_EQ(c / num, c10::complex(real / num, img / num)); + ASSERT_EQ( + num / c, + c10::complex(num * real / std::norm(c), -num * img / std::norm(c))); +} + +template +void test_binary_ops_for_all_int_types_(T real, T img, int8_t i) { + test_binary_ops_for_int_type_(real, img, i); + test_binary_ops_for_int_type_(real, img, i); + test_binary_ops_for_int_type_(real, img, i); + test_binary_ops_for_int_type_(real, img, i); +} + +TEST(TestArithmeticIntScalar, All) { + test_binary_ops_for_all_int_types_(1.0, 0.1, 1); + test_binary_ops_for_all_int_types_(-1.3, -0.2, -2); +} + +} // namespace 
arithmetic + +namespace equality { + +template +C10_HOST_DEVICE void test_equality_() { + static_assert( + c10::complex(1, 2) == c10::complex(1, 2), ""); + static_assert(c10::complex(1, 0) == scalar_t(1), ""); + static_assert(scalar_t(1) == c10::complex(1, 0), ""); + static_assert( + c10::complex(1, 2) != c10::complex(3, 4), ""); + static_assert(c10::complex(1, 2) != scalar_t(1), ""); + static_assert(scalar_t(1) != c10::complex(1, 2), ""); +} + +MAYBE_GLOBAL void test_equality() { + test_equality_(); + test_equality_(); +} + +} // namespace equality + +namespace io { + +template +void test_io_() { + std::stringstream ss; + c10::complex a(1, 2); + ss << a; + ASSERT_EQ(ss.str(), "(1,2)"); + ss.str("(3,4)"); + ss >> a; + ASSERT_TRUE(a == c10::complex(3, 4)); +} + +TEST(TestIO, All) { + test_io_(); + test_io_(); +} + +} // namespace io + +namespace test_std { + +template +C10_HOST_DEVICE void test_callable_() { + static_assert(std::real(c10::complex(1, 2)) == scalar_t(1), ""); + static_assert(std::imag(c10::complex(1, 2)) == scalar_t(2), ""); + std::abs(c10::complex(1, 2)); + std::arg(c10::complex(1, 2)); + static_assert(std::norm(c10::complex(3, 4)) == scalar_t(25), ""); + static_assert( + std::conj(c10::complex(3, 4)) == c10::complex(3, -4), + ""); + c10::polar(float(1), float(PI / 2)); + c10::polar(double(1), double(PI / 2)); +} + +MAYBE_GLOBAL void test_callable() { + test_callable_(); + test_callable_(); +} + +template +void test_values_() { + ASSERT_EQ(std::abs(c10::complex(3, 4)), scalar_t(5)); + ASSERT_LT(std::abs(std::arg(c10::complex(0, 1)) - PI / 2), 1e-6); + ASSERT_LT( + std::abs( + c10::polar(scalar_t(1), scalar_t(PI / 2)) - + c10::complex(0, 1)), + 1e-6); +} + +TEST(TestStd, BasicFunctions) { + test_values_(); + test_values_(); + // CSQRT edge cases: checks for overflows which are likely to occur + // if square root is computed using polar form + ASSERT_LT( + std::abs(std::sqrt(c10::complex(-1e20, -4988429.2)).real()), 3e-4); + ASSERT_LT( + 
std::abs(std::sqrt(c10::complex(-1e60, -4988429.2)).real()), + 3e-4); +} + +} // namespace test_std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AbortHandler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AbortHandler.h new file mode 100644 index 0000000000000000000000000000000000000000..f7bcaaa28af3871f95280a9bd764aea260405ca1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AbortHandler.h @@ -0,0 +1,88 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { +class AbortHandlerHelper { + public: + static AbortHandlerHelper& getInstance() { +#ifdef _WIN32 + thread_local +#endif // _WIN32 + static AbortHandlerHelper instance; + return instance; + } + + void set(std::terminate_handler handler) { + std::lock_guard lk(mutex); + if (!inited) { + prev = std::set_terminate(handler); + curr = std::get_terminate(); + inited = true; + } + } + + std::terminate_handler getPrev() const { + return prev; + } + + private: + std::terminate_handler prev = nullptr; + std::terminate_handler curr = nullptr; + bool inited = false; + std::mutex mutex; + AbortHandlerHelper() = default; + ~AbortHandlerHelper() { + // Only restore the handler if we are the current one + if (inited && curr == std::get_terminate()) { + std::set_terminate(prev); + } + } + + public: + AbortHandlerHelper(AbortHandlerHelper const&) = delete; + void operator=(AbortHandlerHelper const&) = delete; + AbortHandlerHelper(AbortHandlerHelper&&) = delete; + void operator=(AbortHandlerHelper&&) = delete; +}; + +namespace detail { +C10_ALWAYS_INLINE void terminate_handler() { + 
std::cout << "Unhandled exception caught in c10/util/AbortHandler.h" << '\n'; + auto backtrace = get_backtrace(); + std::cout << backtrace << '\n' << std::flush; + auto prev_handler = AbortHandlerHelper::getInstance().getPrev(); + if (prev_handler) { + prev_handler(); + } else { + std::abort(); + } +} +} // namespace detail + +C10_ALWAYS_INLINE void set_terminate_handler() { + bool use_custom_terminate = false; + // On Windows it is enabled by default based on + // https://github.com/pytorch/pytorch/pull/50320#issuecomment-763147062 +#ifdef _WIN32 + use_custom_terminate = true; +#endif // _WIN32 + auto result = c10::utils::check_env("TORCH_CUSTOM_TERMINATE"); + if (result != std::nullopt) { + use_custom_terminate = result.value(); + } + if (use_custom_terminate) { + AbortHandlerHelper::getInstance().set(detail::terminate_handler); + } +} +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AlignOf.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AlignOf.h new file mode 100644 index 0000000000000000000000000000000000000000..ce9fe90961700f2a1dd3f9c25e120eaa9609fc03 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AlignOf.h @@ -0,0 +1,181 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +//===--- AlignOf.h - Portable calculation of type alignment -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AlignedCharArray and AlignedCharArrayUnion classes. 
+// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::AlignOf +// replaced LLVM_ALIGNAS with alignas + +#pragma once + +#include + +namespace c10 { + +/// \struct AlignedCharArray +/// \brief Helper for building an aligned character array type. +/// +/// This template is used to explicitly build up a collection of aligned +/// character array types. We have to build these up using a macro and explicit +/// specialization to cope with MSVC (at least till 2015) where only an +/// integer literal can be used to specify an alignment constraint. Once built +/// up here, we can then begin to indirect between these using normal C++ +/// template parameters. + +// MSVC requires special handling here. +#ifndef _MSC_VER + +template +struct AlignedCharArray { + // NOLINTNEXTLINE(*c-arrays) + alignas(Alignment) char buffer[Size]; +}; + +#else // _MSC_VER + +/// \brief Create a type with an aligned char buffer. +template +struct AlignedCharArray; + +// We provide special variations of this template for the most common +// alignments because __declspec(align(...)) doesn't actually work when it is +// a member of a by-value function argument in MSVC, even if the alignment +// request is something reasonably like 8-byte or 16-byte. Note that we can't +// even include the declspec with the union that forces the alignment because +// MSVC warns on the existence of the declspec despite the union member forcing +// proper alignment. 
+ +template +struct AlignedCharArray<1, Size> { + union { + char aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<2, Size> { + union { + short aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<4, Size> { + union { + int aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<8, Size> { + union { + double aligned; + char buffer[Size]; + }; +}; + +// The rest of these are provided with a __declspec(align(...)) and we simply +// can't pass them by-value as function arguments on MSVC. + +#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ + }; + +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) + +#undef AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT + +#endif // _MSC_VER + +namespace detail { +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +class AlignerImpl { + T1 t1; + T2 t2; + T3 t3; + T4 t4; + T5 t5; + T6 t6; + T7 t7; + T8 t8; + T9 t9; + T10 t10; + + public: + AlignerImpl() = delete; +}; + +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +union SizerImpl { + // NOLINTNEXTLINE(*c-arrays) + char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; +}; +} // end namespace detail + +/// \brief This union template exposes a suitably aligned and sized character +/// array member which can hold elements of any of up to 
ten types. +/// +/// These types may be arrays, structs, or any other types. The goal is to +/// expose a char array buffer member which can be used as suitable storage for +/// a placement new of any of these types. Support for more than ten types can +/// be added at the cost of more boilerplate. +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +struct AlignedCharArrayUnion + : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::c10::detail:: + SizerImpl)> {}; +} // end namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ApproximateClock.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ApproximateClock.h new file mode 100644 index 0000000000000000000000000000000000000000..7410fc4e829fa44aadb22f61e85e1f05f9a81134 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ApproximateClock.h @@ -0,0 +1,120 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright 2023-present Facebook. All Rights Reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(C10_IOS) && defined(C10_MOBILE) +#include // for gettimeofday() +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) +#define C10_RDTSC +#if defined(_MSC_VER) +#include +#elif defined(__CUDACC__) || defined(__HIPCC__) +#undef C10_RDTSC +#elif defined(__clang__) +// `__rdtsc` is available by default. 
+// NB: This has to be first, because Clang will also define `__GNUC__` +#elif defined(__GNUC__) +#include +#else +#undef C10_RDTSC +#endif +#endif + +namespace c10 { + +using time_t = int64_t; +using steady_clock_t = std::conditional_t< + std::chrono::high_resolution_clock::is_steady, + std::chrono::high_resolution_clock, + std::chrono::steady_clock>; + +inline time_t getTimeSinceEpoch() { + auto now = std::chrono::system_clock::now().time_since_epoch(); + return std::chrono::duration_cast(now).count(); +} + +inline time_t getTime(bool allow_monotonic = false) { +#if defined(C10_IOS) && defined(C10_MOBILE) + // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS + // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime + // is implemented or not + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000000000 + + static_cast(now.tv_usec) * 1000; +#elif defined(_WIN32) || defined(__MACH__) + return std::chrono::duration_cast( + steady_clock_t::now().time_since_epoch()) + .count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t{}; + auto mode = CLOCK_REALTIME; + if (allow_monotonic) { + mode = CLOCK_MONOTONIC; + } + clock_gettime(mode, &t); + return static_cast(t.tv_sec) * 1000000000 + + static_cast(t.tv_nsec); +#endif +} + +// We often do not need to capture true wall times. If a fast mechanism such +// as TSC is available we can use that instead and convert back to epoch time +// during post processing. This greatly reduce the clock's contribution to +// profiling. 
+// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ +// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io +// TODO: We should use +// `https://github.com/google/benchmark/blob/main/src/cycleclock.h` +inline auto getApproximateTime() { +#if defined(C10_RDTSC) + return static_cast(__rdtsc()); +#else + return getTime(); +#endif +} + +using approx_time_t = decltype(getApproximateTime()); +static_assert( + std::is_same_v || + std::is_same_v, + "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)."); + +// Convert `getCount` results to Nanoseconds since unix epoch. +class C10_API ApproximateClockToUnixTimeConverter final { + public: + ApproximateClockToUnixTimeConverter(); + std::function makeConverter(); + + struct UnixAndApproximateTimePair { + time_t t_; + approx_time_t approx_t_; + }; + static UnixAndApproximateTimePair measurePair(); + + private: + static constexpr size_t replicates = 1001; + using time_pairs = std::array; + time_pairs measurePairs(); + + time_pairs start_times_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ArrayRef.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ArrayRef.h new file mode 100644 index 0000000000000000000000000000000000000000..9da524e96ce718b7782e1584a795c919af0ecd78 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ArrayRef.h @@ -0,0 +1,326 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. +// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { +/// ArrayRef - Represent a constant reference to an array (0 or more elements +/// consecutively in memory), i.e. a start pointer and a length. It allows +/// various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the ArrayRef. For this reason, it is not in general +/// safe to store an ArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. +/// +/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct +/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of +/// the underlying constexpr calls, we rely on apparent-type dispatch for +/// inheritance. This should be fine because their memory format is the same, +/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods. +/// However, you should prefer to use ArrayRef when possible, because its use +/// of TORCH_CHECK will lead to better user-facing error messages. +template +// ArrayRef cannot be derived from. Normally, we would use `final` +// specifier to force this constraint at compile time. However, Intel +// compiler does not recognize ArrayRef as a class template (which is +// required in the definition of at::TensorAccessor, for instance) +// when `final` specifier is used. 
So, we cannot define ArrayRef as +// final because of the Intel compiler issue. +class ArrayRef : public HeaderOnlyArrayRef { + public: + /// @name Constructors, all inherited from HeaderOnlyArrayRef except for + /// SmallVector. As inherited constructors won't work with class template + /// argument deduction (CTAD) until C++23, we add deduction guides after + /// the class definition to enable CTAD. + /// @{ + + using HeaderOnlyArrayRef::HeaderOnlyArrayRef; + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + /// NOTE: this is the only constructor that is not inherited from + /// HeaderOnlyArrayRef. + template + /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) + : HeaderOnlyArrayRef(Vec.data(), Vec.size()) {} + + /// @} + /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef + /// @{ + + /// front - Get the first element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK + constexpr const T& front() const { + TORCH_CHECK( + !this->empty(), "ArrayRef: attempted to access front() of empty list"); + return this->Data[0]; + } + + /// back - Get the last element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK + constexpr const T& back() const { + TORCH_CHECK( + !this->empty(), "ArrayRef: attempted to access back() of empty list"); + return this->Data[this->Length - 1]; + } + + /// slice(n, m) - Take M elements of the array starting at element N + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK + constexpr ArrayRef slice(size_t N, size_t M) const { + TORCH_CHECK( + N + M <= this->size(), + "ArrayRef: invalid slice, N = ", + N, + "; M = ", + M, + "; size = ", + this->size()); + return ArrayRef(this->data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. 
+ /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK + constexpr ArrayRef slice(size_t N) const { + TORCH_CHECK( + N <= this->size(), + "ArrayRef: invalid slice, N = ", + N, + "; size = ", + this->size()); + return slice(N, this->size() - N); // should this slice be this->slice? + } + + /// @} + /// @name Operator Overloads + /// @{ + + /// Vector compatibility + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK + constexpr const T& at(size_t Index) const { + TORCH_CHECK( + Index < this->Length, + "ArrayRef: invalid index Index = ", + Index, + "; Length = ", + this->Length); + return this->Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + std::enable_if_t, ArrayRef>& operator=( + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. 
+ template + std::enable_if_t, ArrayRef>& operator=( + std::initializer_list) = delete; + + /// @} +}; + +/// Deduction guides for ArrayRef to support CTAD with inherited constructors +/// These mirror the constructors inherited from HeaderOnlyArrayRef +/// @{ + +// Single element constructor +template +ArrayRef(const T&) -> ArrayRef; + +// Pointer and length constructor +template +ArrayRef(const T*, size_t) -> ArrayRef; + +// Range constructor (begin, end) +template +ArrayRef(const T*, const T*) -> ArrayRef; + +// Generic container constructor (anything with .data() and .size()) +template +ArrayRef(const Container&) -> ArrayRef< + std::remove_pointer_t().data())>>; + +// std::vector constructor +template +ArrayRef(const std::vector&) -> ArrayRef; + +// std::array constructor +template +ArrayRef(const std::array&) -> ArrayRef; + +// C array constructor +template +ArrayRef(const T (&)[N]) -> ArrayRef; + +// std::initializer_list constructor +template +ArrayRef(const std::initializer_list&) -> ArrayRef; + +/// @} + +template +std::ostream& operator<<(std::ostream& out, ArrayRef list) { + int i = 0; + out << '['; + for (const auto& e : list) { + if (i++ > 0) + out << ", "; + out << e; + } + out << ']'; + return out; +} + +/// @name ArrayRef Convenience constructors +/// @{ + +/// Construct an ArrayRef from a single element. +template +ArrayRef makeArrayRef(const T& OneElt) { + return OneElt; +} + +/// Construct an ArrayRef from a pointer and length. +template +ArrayRef makeArrayRef(const T* data, size_t length) { + return ArrayRef(data, length); +} + +/// Construct an ArrayRef from a range. +template +ArrayRef makeArrayRef(const T* begin, const T* end) { + return ArrayRef(begin, end); +} + +/// Construct an ArrayRef from a SmallVector. +template +ArrayRef makeArrayRef(const SmallVectorImpl& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a SmallVector. 
+template +ArrayRef makeArrayRef(const SmallVector& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a std::vector. +template +ArrayRef makeArrayRef(const std::vector& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a std::array. +template +ArrayRef makeArrayRef(const std::array& Arr) { + return Arr; +} + +/// Construct an ArrayRef from an ArrayRef (no-op) (const) +template +ArrayRef makeArrayRef(const ArrayRef& Vec) { + return Vec; +} + +/// Construct an ArrayRef from an ArrayRef (no-op) +template +ArrayRef& makeArrayRef(ArrayRef& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a C array. +template +// NOLINTNEXTLINE(*c-arrays*) +ArrayRef makeArrayRef(const T (&Arr)[N]) { + return ArrayRef(Arr); +} + +// WARNING: Template instantiation will NOT be willing to do an implicit +// conversions to get you to an c10::ArrayRef, which is why we need so +// many overloads. + +template +bool operator==(c10::ArrayRef a1, c10::ArrayRef a2) { + return a1.equals(a2); +} + +template +bool operator!=(c10::ArrayRef a1, c10::ArrayRef a2) { + return !a1.equals(a2); +} + +template +bool operator==(const std::vector& a1, c10::ArrayRef a2) { + return c10::ArrayRef(a1).equals(a2); +} + +template +bool operator!=(const std::vector& a1, c10::ArrayRef a2) { + return !c10::ArrayRef(a1).equals(a2); +} + +template +bool operator==(c10::ArrayRef a1, const std::vector& a2) { + return a1.equals(c10::ArrayRef(a2)); +} + +template +bool operator!=(c10::ArrayRef a1, const std::vector& a2) { + return !a1.equals(c10::ArrayRef(a2)); +} + +using IntArrayRef = ArrayRef; + +using IntList [[deprecated( + "This alias is deprecated because it doesn't make ownership semantics obvious. Use IntArrayRef instead!")]] = + ArrayRef; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/BFloat16-inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/BFloat16-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..90ca6b677ab3740550f4700479497fd58c35536b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/BFloat16-inl.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Bitset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Bitset.h new file mode 100644 index 0000000000000000000000000000000000000000..1e01d94ea590ccd96414ec760b09a48419de9de8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Bitset.h @@ -0,0 +1,123 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if defined(_MSC_VER) +#include +#endif + +namespace c10::utils { + +/** + * This is a simple bitset class with sizeof(long long int) bits. + * You can set bits, unset bits, query bits by index, + * and query for the first set bit. + * Before using this class, please also take a look at std::bitset, + * which has more functionality and is more generic. It is probably + * a better fit for your use case. The sole reason for c10::utils::bitset + * to exist is that std::bitset misses a find_first_set() method. 
+ */ +struct bitset final { + private: +#if defined(_MSC_VER) + // MSVCs _BitScanForward64 expects int64_t + using bitset_type = int64_t; +#else + // POSIX ffsll expects long long int + using bitset_type = long long int; +#endif + public: + static constexpr size_t NUM_BITS() { + return 8 * sizeof(bitset_type); + } + + constexpr bitset() noexcept = default; + constexpr bitset(const bitset&) noexcept = default; + constexpr bitset(bitset&&) noexcept = default; + // there is an issue for gcc 5.3.0 when define default function as constexpr + // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754. + bitset& operator=(const bitset&) noexcept = default; + bitset& operator=(bitset&&) noexcept = default; + ~bitset() = default; + + constexpr void set(size_t index) noexcept { + bitset_ |= (static_cast(1) << index); + } + + constexpr void unset(size_t index) noexcept { + bitset_ &= ~(static_cast(1) << index); + } + + constexpr bool get(size_t index) const noexcept { + return bitset_ & (static_cast(1) << index); + } + + constexpr bool is_entirely_unset() const noexcept { + return 0 == bitset_; + } + + // Call the given functor with the index of each bit that is set + template + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + void for_each_set_bit(Func&& func) const { + bitset cur = *this; + size_t index = cur.find_first_set(); + while (0 != index) { + // -1 because find_first_set() is not one-indexed. + index -= 1; + func(index); + cur.unset(index); + index = cur.find_first_set(); + } + } + + private: + // Return the index of the first set bit. The returned index is one-indexed + // (i.e. if the very first bit is set, this function returns '1'), and a + // return of '0' means that there was no bit set. 
+ size_t find_first_set() const { +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) + unsigned long result; + bool has_bits_set = (0 != _BitScanForward64(&result, bitset_)); + if (!has_bits_set) { + return 0; + } + return result + 1; +#elif defined(_MSC_VER) && defined(_M_IX86) + unsigned long result; + if (static_cast(bitset_) != 0) { + bool has_bits_set = + (0 != _BitScanForward(&result, static_cast(bitset_))); + if (!has_bits_set) { + return 0; + } + return result + 1; + } else { + bool has_bits_set = + (0 != _BitScanForward(&result, static_cast(bitset_ >> 32))); + if (!has_bits_set) { + return 32; + } + return result + 33; + } +#else + return __builtin_ffsll(bitset_); +#endif + } + + friend bool operator==(bitset lhs, bitset rhs) noexcept { + return lhs.bitset_ == rhs.bitset_; + } + + bitset_type bitset_{0}; +}; + +inline bool operator!=(bitset lhs, bitset rhs) noexcept { + return !(lhs == rhs); +} + +} // namespace c10::utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/C++17.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/C++17.h new file mode 100644 index 0000000000000000000000000000000000000000..f9e010daa58b3e172456412c6478bfb1006b3e3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/C++17.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#ifndef C10_UTIL_CPP17_H_ +#define C10_UTIL_CPP17_H_ + +#include +#include +#include +#include +#include + +#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ + __GNUC__ < 9 +#error \ + "You're trying to build PyTorch with a too old version of GCC. We need GCC 9 or later." 
+#endif + +#if defined(__clang__) && __clang_major__ < 9 +#error \ + "You're trying to build PyTorch with a too old version of Clang. We need Clang 9 or later." +#endif + +#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \ + (!defined(_MSC_VER) && __cplusplus < 201703L) +#error You need C++17 to compile PyTorch +#endif + +#if defined(_WIN32) && (defined(min) || defined(max)) +#error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows +#endif + +/* + * This header adds some polyfills with C++17 functionality + */ + +namespace c10 { + +namespace guts { + +#if defined(__HIP__) + +// Implementation from http://en.cppreference.com/w/cpp/utility/apply (but +// modified) +// TODO This is an incomplete implementation of std::apply, not working for +// member functions. +namespace detail { +template +C10_HOST_DEVICE constexpr auto apply_impl( + F&& f, + Tuple&& t, + std::index_sequence) { + return std::forward(f)(std::get(std::forward(t))...); +} +} // namespace detail + +template +C10_HOST_DEVICE constexpr auto apply(F&& f, Tuple&& t) { + return detail::apply_impl( + std::forward(f), + std::forward(t), + std::make_index_sequence< + std::tuple_size>::value>{}); +} + +#endif + +} // namespace guts + +} // namespace c10 + +#endif // C10_UTIL_CPP17_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/CallOnce.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/CallOnce.h new file mode 100644 index 0000000000000000000000000000000000000000..0037755f64a8fce82ae816391559c2123e3ad1cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/CallOnce.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +// custom c10 call_once implementation to avoid the deadlock in std::call_once. +// The implementation here is a simplified version from folly and likely much +// much higher memory footprint. +template +inline void call_once(Flag& flag, F&& f, Args&&... args) { + if (C10_LIKELY(flag.test_once())) { + return; + } + flag.call_once_slow(std::forward(f), std::forward(args)...); +} + +class once_flag { + public: +#ifndef _WIN32 + // running into build error on MSVC. Can't seem to get a repro locally so I'm + // just avoiding constexpr + // + // C:/actions-runner/_work/pytorch/pytorch\c10/util/CallOnce.h(26): error: + // defaulted default constructor cannot be constexpr because the + // corresponding implicitly declared default constructor would not be + // constexpr 1 error detected in the compilation of + // "C:/actions-runner/_work/pytorch/pytorch/aten/src/ATen/cuda/cub.cu". + constexpr +#endif + once_flag() noexcept = default; + once_flag(const once_flag&) = delete; + once_flag& operator=(const once_flag&) = delete; + once_flag(once_flag&&) = delete; + once_flag& operator=(once_flag&&) = delete; + ~once_flag() = default; + bool test_once() { + return init_.load(std::memory_order_acquire); + } + + private: + template + friend void call_once(Flag& flag, F&& f, Args&&... 
args); + + template + void call_once_slow(F&& f, Args&&... args) { + std::lock_guard guard(mutex_); + if (init_.load(std::memory_order_relaxed)) { + return; + } + std::invoke(std::forward(f), std::forward(args)...); + init_.store(true, std::memory_order_release); + } + + void reset_once() { + init_.store(false, std::memory_order_release); + } + + private: + std::mutex mutex_; + std::atomic init_{false}; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ConstexprCrc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ConstexprCrc.h new file mode 100644 index 0000000000000000000000000000000000000000..56dd979ce833087e264e6e8faef8563019fa3ea5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ConstexprCrc.h @@ -0,0 +1,137 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10::util { + +namespace detail { +// NOLINTNEXTLINE(*c-arrays*) +constexpr uint64_t crc64_table[] = { + 0x0000000000000000, 0x7ad870c830358979, 0xf5b0e190606b12f2, + 0x8f689158505e9b8b, 0xc038e5739841b68f, 0xbae095bba8743ff6, + 0x358804e3f82aa47d, 0x4f50742bc81f2d04, 0xab28ecb46814fe75, + 0xd1f09c7c5821770c, 0x5e980d24087fec87, 0x24407dec384a65fe, + 0x6b1009c7f05548fa, 0x11c8790fc060c183, 0x9ea0e857903e5a08, + 0xe478989fa00bd371, 0x7d08ff3b88be6f81, 0x07d08ff3b88be6f8, + 0x88b81eabe8d57d73, 0xf2606e63d8e0f40a, 0xbd301a4810ffd90e, + 0xc7e86a8020ca5077, 0x4880fbd87094cbfc, 0x32588b1040a14285, + 0xd620138fe0aa91f4, 0xacf86347d09f188d, 0x2390f21f80c18306, + 0x594882d7b0f40a7f, 0x1618f6fc78eb277b, 0x6cc0863448deae02, + 0xe3a8176c18803589, 0x997067a428b5bcf0, 
0xfa11fe77117cdf02, + 0x80c98ebf2149567b, 0x0fa11fe77117cdf0, 0x75796f2f41224489, + 0x3a291b04893d698d, 0x40f16bccb908e0f4, 0xcf99fa94e9567b7f, + 0xb5418a5cd963f206, 0x513912c379682177, 0x2be1620b495da80e, + 0xa489f35319033385, 0xde51839b2936bafc, 0x9101f7b0e12997f8, + 0xebd98778d11c1e81, 0x64b116208142850a, 0x1e6966e8b1770c73, + 0x8719014c99c2b083, 0xfdc17184a9f739fa, 0x72a9e0dcf9a9a271, + 0x08719014c99c2b08, 0x4721e43f0183060c, 0x3df994f731b68f75, + 0xb29105af61e814fe, 0xc849756751dd9d87, 0x2c31edf8f1d64ef6, + 0x56e99d30c1e3c78f, 0xd9810c6891bd5c04, 0xa3597ca0a188d57d, + 0xec09088b6997f879, 0x96d1784359a27100, 0x19b9e91b09fcea8b, + 0x636199d339c963f2, 0xdf7adabd7a6e2d6f, 0xa5a2aa754a5ba416, + 0x2aca3b2d1a053f9d, 0x50124be52a30b6e4, 0x1f423fcee22f9be0, + 0x659a4f06d21a1299, 0xeaf2de5e82448912, 0x902aae96b271006b, + 0x74523609127ad31a, 0x0e8a46c1224f5a63, 0x81e2d7997211c1e8, + 0xfb3aa75142244891, 0xb46ad37a8a3b6595, 0xceb2a3b2ba0eecec, + 0x41da32eaea507767, 0x3b024222da65fe1e, 0xa2722586f2d042ee, + 0xd8aa554ec2e5cb97, 0x57c2c41692bb501c, 0x2d1ab4dea28ed965, + 0x624ac0f56a91f461, 0x1892b03d5aa47d18, 0x97fa21650afae693, + 0xed2251ad3acf6fea, 0x095ac9329ac4bc9b, 0x7382b9faaaf135e2, + 0xfcea28a2faafae69, 0x8632586aca9a2710, 0xc9622c4102850a14, + 0xb3ba5c8932b0836d, 0x3cd2cdd162ee18e6, 0x460abd1952db919f, + 0x256b24ca6b12f26d, 0x5fb354025b277b14, 0xd0dbc55a0b79e09f, + 0xaa03b5923b4c69e6, 0xe553c1b9f35344e2, 0x9f8bb171c366cd9b, + 0x10e3202993385610, 0x6a3b50e1a30ddf69, 0x8e43c87e03060c18, + 0xf49bb8b633338561, 0x7bf329ee636d1eea, 0x012b592653589793, + 0x4e7b2d0d9b47ba97, 0x34a35dc5ab7233ee, 0xbbcbcc9dfb2ca865, + 0xc113bc55cb19211c, 0x5863dbf1e3ac9dec, 0x22bbab39d3991495, + 0xadd33a6183c78f1e, 0xd70b4aa9b3f20667, 0x985b3e827bed2b63, + 0xe2834e4a4bd8a21a, 0x6debdf121b863991, 0x1733afda2bb3b0e8, + 0xf34b37458bb86399, 0x8993478dbb8deae0, 0x06fbd6d5ebd3716b, + 0x7c23a61ddbe6f812, 0x3373d23613f9d516, 0x49aba2fe23cc5c6f, + 0xc6c333a67392c7e4, 0xbc1b436e43a74e9d, 
0x95ac9329ac4bc9b5, + 0xef74e3e19c7e40cc, 0x601c72b9cc20db47, 0x1ac40271fc15523e, + 0x5594765a340a7f3a, 0x2f4c0692043ff643, 0xa02497ca54616dc8, + 0xdafce7026454e4b1, 0x3e847f9dc45f37c0, 0x445c0f55f46abeb9, + 0xcb349e0da4342532, 0xb1eceec59401ac4b, 0xfebc9aee5c1e814f, + 0x8464ea266c2b0836, 0x0b0c7b7e3c7593bd, 0x71d40bb60c401ac4, + 0xe8a46c1224f5a634, 0x927c1cda14c02f4d, 0x1d148d82449eb4c6, + 0x67ccfd4a74ab3dbf, 0x289c8961bcb410bb, 0x5244f9a98c8199c2, + 0xdd2c68f1dcdf0249, 0xa7f41839ecea8b30, 0x438c80a64ce15841, + 0x3954f06e7cd4d138, 0xb63c61362c8a4ab3, 0xcce411fe1cbfc3ca, + 0x83b465d5d4a0eece, 0xf96c151de49567b7, 0x76048445b4cbfc3c, + 0x0cdcf48d84fe7545, 0x6fbd6d5ebd3716b7, 0x15651d968d029fce, + 0x9a0d8ccedd5c0445, 0xe0d5fc06ed698d3c, 0xaf85882d2576a038, + 0xd55df8e515432941, 0x5a3569bd451db2ca, 0x20ed197575283bb3, + 0xc49581ead523e8c2, 0xbe4df122e51661bb, 0x3125607ab548fa30, + 0x4bfd10b2857d7349, 0x04ad64994d625e4d, 0x7e7514517d57d734, + 0xf11d85092d094cbf, 0x8bc5f5c11d3cc5c6, 0x12b5926535897936, + 0x686de2ad05bcf04f, 0xe70573f555e26bc4, 0x9ddd033d65d7e2bd, + 0xd28d7716adc8cfb9, 0xa85507de9dfd46c0, 0x273d9686cda3dd4b, + 0x5de5e64efd965432, 0xb99d7ed15d9d8743, 0xc3450e196da80e3a, + 0x4c2d9f413df695b1, 0x36f5ef890dc31cc8, 0x79a59ba2c5dc31cc, + 0x037deb6af5e9b8b5, 0x8c157a32a5b7233e, 0xf6cd0afa9582aa47, + 0x4ad64994d625e4da, 0x300e395ce6106da3, 0xbf66a804b64ef628, + 0xc5bed8cc867b7f51, 0x8aeeace74e645255, 0xf036dc2f7e51db2c, + 0x7f5e4d772e0f40a7, 0x05863dbf1e3ac9de, 0xe1fea520be311aaf, + 0x9b26d5e88e0493d6, 0x144e44b0de5a085d, 0x6e963478ee6f8124, + 0x21c640532670ac20, 0x5b1e309b16452559, 0xd476a1c3461bbed2, + 0xaeaed10b762e37ab, 0x37deb6af5e9b8b5b, 0x4d06c6676eae0222, + 0xc26e573f3ef099a9, 0xb8b627f70ec510d0, 0xf7e653dcc6da3dd4, + 0x8d3e2314f6efb4ad, 0x0256b24ca6b12f26, 0x788ec2849684a65f, + 0x9cf65a1b368f752e, 0xe62e2ad306bafc57, 0x6946bb8b56e467dc, + 0x139ecb4366d1eea5, 0x5ccebf68aecec3a1, 0x2616cfa09efb4ad8, + 0xa97e5ef8cea5d153, 0xd3a62e30fe90582a, 
0xb0c7b7e3c7593bd8, + 0xca1fc72bf76cb2a1, 0x45775673a732292a, 0x3faf26bb9707a053, + 0x70ff52905f188d57, 0x0a2722586f2d042e, 0x854fb3003f739fa5, + 0xff97c3c80f4616dc, 0x1bef5b57af4dc5ad, 0x61372b9f9f784cd4, + 0xee5fbac7cf26d75f, 0x9487ca0fff135e26, 0xdbd7be24370c7322, + 0xa10fceec0739fa5b, 0x2e675fb4576761d0, 0x54bf2f7c6752e8a9, + 0xcdcf48d84fe75459, 0xb71738107fd2dd20, 0x387fa9482f8c46ab, + 0x42a7d9801fb9cfd2, 0x0df7adabd7a6e2d6, 0x772fdd63e7936baf, + 0xf8474c3bb7cdf024, 0x829f3cf387f8795d, 0x66e7a46c27f3aa2c, + 0x1c3fd4a417c62355, 0x935745fc4798b8de, 0xe98f353477ad31a7, + 0xa6df411fbfb21ca3, 0xdc0731d78f8795da, 0x536fa08fdfd90e51, + 0x29b7d047efec8728, +}; + +inline constexpr uint64_t crc64impl( + uint64_t accumulator, + const char* data, + size_t size) { + for (size_t i = 0; i < size; ++i) { + accumulator = + crc64_table[(accumulator ^ data[i]) & 0xFF] ^ (accumulator >> 8); + } + return accumulator; +} +} // namespace detail + +struct crc64_t final : IdWrapper { + constexpr crc64_t(uint64_t checksum) : IdWrapper(checksum) {} + constexpr uint64_t checksum() const { + return this->underlyingId(); + } +}; + +// CRC64 with Jones coefficients and an init value of 0. +inline constexpr crc64_t crc64(const char* str, size_t size) { + return crc64_t{detail::crc64impl(0, str, size)}; +} + +inline constexpr crc64_t crc64(std::string_view str) { + return crc64(str.data(), str.size()); +} +} // namespace c10::util + +// Allow usage of crc64_t in std::unordered_set +C10_DEFINE_HASH_FOR_IDWRAPPER(c10::util::crc64_t) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Deprecated.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Deprecated.h new file mode 100644 index 0000000000000000000000000000000000000000..ccd1ac50400d3dcdc160c42e8745bac7139c8217 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Deprecated.h @@ -0,0 +1,7 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DimVector.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DimVector.h new file mode 100644 index 0000000000000000000000000000000000000000..682b8f364a2094c0feec2b6c19a8e2e54d296ee1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DimVector.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +constexpr size_t kDimVectorStaticSize = C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE; + +/// A container for sizes or strides +using DimVector = SmallVector; +using SymDimVector = SmallVector; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DynamicCounter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DynamicCounter.h new file mode 100644 index 0000000000000000000000000000000000000000..37e0af4319435c223442cc52d2b34d8e12e2715b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DynamicCounter.h @@ -0,0 +1,54 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include + +namespace c10::monitor { + +class C10_API DynamicCounter { + public: + using Callback = std::function; + + // Creates a dynamic counter that can be queried at any point in time by + // multiple backends. Only one counter with a given key can exist at any point + // in time. + // + // The callback is invoked every time the counter is queried. + // The callback must be thread-safe. + // The callback must not throw. + // The callback must not block. + DynamicCounter(std::string_view key, Callback getCounterCallback); + + // Unregisters the callback. + // Waits for all ongoing callback invocations to finish. + ~DynamicCounter(); + + private: + struct Guard; + std::unique_ptr guard_; +}; + +namespace detail { +class DynamicCounterBackendIf { + public: + virtual ~DynamicCounterBackendIf() = default; + + virtual void registerCounter( + std::string_view key, + DynamicCounter::Callback getCounterCallback) = 0; + // MUST wait for all ongoing callback invocations to finish + virtual void unregisterCounter(std::string_view key) = 0; +}; + +void C10_API registerDynamicCounterBackend( + std::unique_ptr /*backend*/); +} // namespace detail +} // namespace c10::monitor + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Exception.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Exception.h new file mode 100644 index 0000000000000000000000000000000000000000..c6b4a7fa25013fa413504a69fb177b0e1d6febcc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Exception.h @@ -0,0 +1,875 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef C10_UTIL_EXCEPTION_H_ +#define C10_UTIL_EXCEPTION_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace c10 { + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. +/// Don't throw this directly; use TORCH_CHECK/TORCH_INTERNAL_ASSERT instead. +/// +/// NB: c10::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class C10_API Error : public std::exception { + private: + // The actual error message. + std::string msg_; + + // Context for the message (in order of decreasing specificity). Context will + // be automatically formatted appropriately, so it is not necessary to add + // extra leading/trailing newlines to strings inside this vector + std::vector context_; + + // The C++ backtrace at the point when this exception was raised. This + // may be empty if there is no valid backtrace. (We don't use optional + // here to reduce the dependencies this file has.) 
+ Backtrace backtrace_; + + // These two are derived fields from msg_stack_ and backtrace_, but we need + // fields for the strings so that we can return a const char* (as the + // signature of std::exception requires). Currently, the invariant + // is that these fields are ALWAYS populated consistently with respect + // to msg_stack_ and backtrace_. + mutable OptimisticLazy what_; + std::string what_without_backtrace_; + + // This is a little debugging trick: you can stash a relevant pointer + // in caller, and then when you catch the exception, you can compare + // against pointers you have on hand to get more information about + // where the exception came from. In Caffe2, this is used to figure + // out which operator raised an exception. + const void* caller_; + + public: + // PyTorch-style Error constructor. NB: the implementation of this + // is actually in Logging.cpp + Error(SourceLocation source_location, std::string msg); + + // Caffe2-style error message + Error( + const char* file, + const uint32_t line, + const char* condition, + const std::string& msg, + Backtrace backtrace, + const void* caller = nullptr); + + // Base constructor + Error( + std::string msg, + Backtrace backtrace = nullptr, + const void* caller = nullptr); + + // Add some new context to the message stack. The last added context + // will be formatted at the end of the context list upon printing. + // WARNING: This method is O(n) in the size of the stack, so don't go + // wild adding a ridiculous amount of context to error messages. + void add_context(std::string msg); + + const std::string& msg() const { + return msg_; + } + + const std::vector& context() const { + return context_; + } + + const Backtrace& backtrace() const; + + /// Returns the complete error message, including the source location. + /// The returned pointer is invalidated if you call add_context() on + /// this object. 
+ const char* what() const noexcept override; + + const void* caller() const noexcept { + return caller_; + } + + /// Returns only the error message string, without source location. + /// The returned pointer is invalidated if you call add_context() on + /// this object. + virtual const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } + + private: + void refresh_what(); + std::string compute_what(bool include_backtrace) const; +}; + +class C10_API Warning { + public: + class C10_API UserWarning{}; + class C10_API DeprecationWarning{}; + + using warning_variant_t = std::variant; + + Warning( + warning_variant_t type, + const SourceLocation& source_location, + std::string msg, + bool verbatim); + + Warning( + warning_variant_t type, + SourceLocation source_location, + const char* msg, + bool verbatim); + + Warning( + warning_variant_t type, + SourceLocation source_location, + ::c10::detail::CompileTimeEmptyString msg, + bool verbatim); + + // Getters for members + warning_variant_t type() const; + const SourceLocation& source_location() const; + const std::string& msg() const; + bool verbatim() const; + + private: + // The type of warning + warning_variant_t type_; + + // Where the warning happened. + SourceLocation source_location_; + + // The actual warning message. + std::string msg_; + + // See note: [Verbatim Warnings] + bool verbatim_; +}; + +using UserWarning = Warning::UserWarning; +using DeprecationWarning = Warning::DeprecationWarning; + +// Issue a warning with a given message. Dispatched to the current +// warning handler. +void C10_API warn(const Warning& warning); + +class C10_API WarningHandler { + public: + virtual ~WarningHandler() = default; + /// The default warning handler. Prints the message to stderr. 
+ virtual void process(const Warning& warning); +}; + +namespace WarningUtils { + +// Note: [Verbatim Warnings] +// Warnings originating in C++ code can appear out-of-place to Python users: +// a user runs a line in Python, but the warning references a line in C++. +// Some parts of PyTorch, like the JIT, are cognizant of this mismatch +// and take care to map warnings back to the user's program, but most +// of PyTorch simply throws a context-free warning. To allow warning +// handlers to add context where appropriate, warn takes the +// "verbatim" flag. When this is false a warning handler might append +// the C++ warning to a Python warning message that relates the warning +// back to the user's program. Callers who have already accounted for +// context in their warnings should set verbatim to true so their warnings +// appear without modification. + +/// Sets the global warning handler. This is not thread-safe, so it should +/// generally be called once during initialization or while holding the GIL +/// for programs that use python. +/// User is responsible for keeping the WarningHandler alive until +/// it is not needed. +C10_API void set_warning_handler(WarningHandler* handler) noexcept(true); +/// Gets the global warning handler. +C10_API WarningHandler* get_warning_handler() noexcept(true); + +class C10_API WarningHandlerGuard { + WarningHandler* prev_handler_; + + public: + WarningHandlerGuard(WarningHandler* new_handler) + : prev_handler_(c10::WarningUtils::get_warning_handler()) { + c10::WarningUtils::set_warning_handler(new_handler); + } + WarningHandlerGuard(WarningHandlerGuard&& other) = delete; + WarningHandlerGuard(const WarningHandlerGuard&) = delete; + WarningHandlerGuard& operator=(const WarningHandlerGuard&) = delete; + WarningHandlerGuard& operator=(WarningHandlerGuard&&) = delete; + ~WarningHandlerGuard() { + c10::WarningUtils::set_warning_handler(prev_handler_); + } +}; + +/// The TORCH_WARN_ONCE macro is difficult to test for. 
Use +/// setWarnAlways(true) to turn it into TORCH_WARN, which can be +/// tested for more easily. +C10_API void set_warnAlways(bool /*setting*/) noexcept(true); +C10_API bool get_warnAlways() noexcept(true); + +// A RAII guard that sets warn_always (not thread-local) on +// construction, and sets it back to the original value upon destruction. +struct C10_API WarnAlways { + public: + explicit WarnAlways(bool setting = true); + ~WarnAlways(); + + private: + bool prev_setting; +}; + +} // namespace WarningUtils + +// Like Error, but we always report the C++ backtrace, instead of only +// reporting when TORCH_SHOW_CPP_STACKTRACES +class C10_API ErrorAlwaysShowCppStacktrace : public Error { + using Error::Error; + const char* what_without_backtrace() const noexcept override { + return what(); + } +}; + +// Used in ATen for out-of-bound indices that can reasonably only be detected +// lazily inside a kernel (See: advanced indexing). These turn into +// IndexError when they cross to Python. +class C10_API IndexError : public Error { + using Error::Error; +}; + +// Used in ATen for invalid values. These turn into +// ValueError when they cross to Python. +class C10_API ValueError : public Error { + using Error::Error; +}; + +// Used in ATen for invalid types. These turn into +// TypeError when they cross to Python. +class C10_API TypeError : public Error { + using Error::Error; +}; + +// Used in ATen for functionality that is not implemented. These turn into +// NotImplementedError when they cross to Python. +class C10_API NotImplementedError : public Error { + using Error::Error; +}; + +// Used in ATen for buffer-related errors, e.g. trying to create a DLPack of +// an unsupported device. These turn into BufferError when they cross to +// Python. +class C10_API BufferError : public Error { + using Error::Error; +}; + +// Used in ATen for non finite indices. These turn into +// ExitException when they cross to Python. 
+class C10_API EnforceFiniteError : public Error { + using Error::Error; +}; + +// Used in Onnxifi backend lowering. These turn into +// ExitException when they cross to Python. +class C10_API OnnxfiBackendSystemError : public Error { + using Error::Error; +}; + +// Used for numerical errors from the linalg module. These +// turn into LinAlgError when they cross into Python. +class C10_API LinAlgError : public Error { + using Error::Error; +}; + +class C10_API OutOfMemoryError : public Error { + using Error::Error; +}; + +// Used for handling syntactic errors in input arguments. +// These turn into SyntaxError when the cross into Python. +class C10_API SyntaxError : public Error { + using Error::Error; +}; + +// Raised when accelerator API call hits an error. +// These turn into AcceleratorError when the cross into Python +class C10_API AcceleratorError : public Error { + int32_t error_code; + + public: + AcceleratorError(SourceLocation loc, int32_t code, const std::string& msg) + : Error(loc, msg), error_code(code) {} + int32_t get_error_code() const { + return error_code; + } +}; + +// Base error type for all distributed errors. +// These turn into DistError when they cross into Python. +class C10_API DistError : public Error { + using Error::Error; +}; + +// Used for collective communication library errors from the distributed module. +// These turn into DistBackendError when they cross into Python. +class C10_API DistBackendError : public DistError { + using DistError::DistError; +}; + +// Used for errors originating from the store. +// These turn into DistStoreError when they cross into Python. +class C10_API DistStoreError : public DistError { + using DistError::DistError; +}; + +// Used for errors originating from the TCP/IP stack and not from collective +// libraries. These turn into DistNetworkError when they cross into Python. 
+class C10_API DistNetworkError : public DistError { + using DistError::DistError; +}; + +// Raised when a queue is empty and a non-blocking pop is called. +// Translated to torch.distributed.QueueEmptyError in Python +class C10_API DistQueueEmptyError : public DistStoreError { + using DistStoreError::DistStoreError; +}; + +// A utility function to return an exception std::string by prepending its +// exception type before its what() content +C10_API std::string GetExceptionString(const std::exception& e); + +} // namespace c10 + +// Private helper macro for implementing TORCH_INTERNAL_ASSERT and TORCH_CHECK +// +// Note: In the debug build With MSVC, __LINE__ might be of long type (a.k.a +// int32_t), which is different from the definition of `SourceLocation` that +// requires unsigned int (a.k.a uint32_t) and may cause a compile error with the +// message: error C2397: conversion from 'long' to 'uint32_t' requires a +// narrowing conversion Here the static cast is used to pass the build. if this +// is used inside a lambda the __func__ macro expands to operator(), which isn't +// very useful, but hard to fix in a macro so suppressing the warning. +#define C10_THROW_ERROR(err_type, msg) \ + throw ::c10::err_type( \ + {__func__, __FILE__, static_cast(__LINE__)}, msg) + +#define C10_BUILD_ERROR(err_type, msg) \ + ::c10::err_type({__func__, __FILE__, static_cast(__LINE__)}, msg) + +// Private helper macro for workaround MSVC misexpansion of nested macro +// invocations involving __VA_ARGS__. See +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define C10_EXPAND_MSVC_WORKAROUND(x) x + +#include + +// ---------------------------------------------------------------------------- +// Error reporting macros +// ---------------------------------------------------------------------------- + +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_RETHROW(e, ...) 
\ + do { \ + (void)e; /* Suppress unused variable warning */ \ + throw; \ + } while (false) +#else +#define TORCH_RETHROW(e, ...) \ + do { \ + e.add_context(::c10::str(__VA_ARGS__)); \ + throw; \ + } while (false) +#endif + +// A utility macro to provide assert()-like functionality; that is, enforcement +// of internal invariants in code. It supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the assert +// failure message using operator<< (this is useful to print some variables +// which may be useful for debugging.) +// +// Usage: +// TORCH_INTERNAL_ASSERT(should_be_true); +// TORCH_INTERNAL_ASSERT(x == 0, "x = ", x); +// +// Assuming no bugs in PyTorch, the conditions tested by this macro should +// always be true; e.g., it should be possible to disable all of these +// conditions without changing observable user behavior. If you would like to +// do error reporting for user input, please use TORCH_CHECK instead. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike assert()). +// +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__)); \ + } +#else +// It would be nice if we could build a combined string literal out of +// the TORCH_INTERNAL_ASSERT prefix and a user-provided string literal +// as the first argument, but there doesn't seem to be any good way to +// do that while still supporting having a first argument that isn't a +// string literal. +#define TORCH_INTERNAL_ASSERT(cond, ...) 
\ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchInternalAssertFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond \ + " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \ + __LINE__) ", please report a bug to PyTorch. ", \ + c10::str(__VA_ARGS__)); \ + } +#endif + +// A utility macro to make it easier to test for error conditions from user +// input. Like TORCH_INTERNAL_ASSERT, it supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the error +// message using operator<< (e.g., you can pass any object which has +// operator<< defined. Most objects in PyTorch have these definitions!) +// +// Usage: +// TORCH_CHECK(should_be_true); // A default error message will be provided +// // in this case; but we recommend writing an +// // explicit error message, as it is more +// // user friendly. +// TORCH_CHECK(x == 0, "Expected x to be 0, but got ", x); +// +// On failure, this macro will raise an exception. If this exception propagates +// to Python, it will convert into a Python RuntimeError. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike CHECK() from glog.) +// +#define TORCH_CHECK_WITH(error_t, cond, ...) \ + TORCH_CHECK_WITH_MSG(error_t, cond, "", __VA_ARGS__) + +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_CHECK_MSG(cond, type, ...) \ + (#cond #type " CHECK FAILED at " C10_STRINGIZE(__FILE__)) +#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + C10_THROW_ERROR(Error, TORCH_CHECK_MSG(cond, type, __VA_ARGS__)); \ + } +#else + +namespace c10::detail { +template +auto torchCheckMsgImpl(const char* /*msg*/, const Args&... 
args) { + return ::c10::str(args...); +} +inline C10_API const char* torchCheckMsgImpl(const char* msg) { + return msg; +} +// If there is just 1 user-provided C-string argument, use it. +inline C10_API const char* torchCheckMsgImpl( + const char* /*msg*/, + const char* args) { + return args; +} +} // namespace c10::detail + +#define TORCH_CHECK_MSG(cond, type, ...) \ + (::c10::detail::torchCheckMsgImpl( \ + "Expected " #cond \ + " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)", \ + ##__VA_ARGS__)) +#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + C10_THROW_ERROR(error_t, TORCH_CHECK_MSG(cond, type, __VA_ARGS__)); \ + } +#endif + +namespace c10::detail { + +[[noreturn]] C10_API void torchCheckFail( + const char* func, + const char* file, + uint32_t line, + const std::string& msg); +[[noreturn]] C10_API void torchCheckFail( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +// The c10::str() call that creates userMsg can have 1 of 3 return +// types depending on the number and types of arguments passed to +// TORCH_INTERNAL_ASSERT. 0 arguments will get a +// CompileTimeEmptyString, 1 const char * will be passed straight +// through, and anything else will get converted to std::string. 
+[[noreturn]] C10_API void torchInternalAssertFail( + const char* func, + const char* file, + uint32_t line, + const char* condMsg, + const char* userMsg); +[[noreturn]] inline C10_API void torchInternalAssertFail( + const char* func, + const char* file, + uint32_t line, + const char* condMsg, + ::c10::detail::CompileTimeEmptyString /*userMsg*/) { + torchCheckFail(func, file, line, condMsg); +} +[[noreturn]] C10_API void torchInternalAssertFail( + const char* func, + const char* file, + uint32_t line, + const char* condMsg, + const std::string& userMsg); + +} // namespace c10::detail + +#ifdef STANDALONE_TORCH_HEADER + +// TORCH_CHECK throws std::runtime_error instead of c10::Error which is +// useful when certain headers are used in a libtorch-independent way, +// e.g. when Vectorized is used in AOTInductor generated code. +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_CHECK(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + throw std::runtime_error(TORCH_CHECK_MSG( \ + cond, \ + "", \ + __func__, \ + ", ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ", ", \ + __VA_ARGS__)); \ + } +#else +#define TORCH_CHECK(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + throw std::runtime_error(TORCH_CHECK_MSG( \ + cond, \ + "", \ + __func__, \ + ", ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ", ", \ + ##__VA_ARGS__)); \ + } +#endif + +#else + +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_CHECK(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + TORCH_CHECK_MSG(cond, "", __VA_ARGS__)); \ + } +#else +#define TORCH_CHECK(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__)); \ + } +#endif + +#endif + +// An utility macro that does what `TORCH_CHECK` does if compiled in the host +// code, otherwise does nothing. 
Supposed to be used in the code shared between +// host and device code as an alternative for `TORCH_CHECK`. +#if defined(__CUDACC__) || defined(__HIPCC__) +#define TORCH_CHECK_IF_NOT_ON_CUDA(cond, ...) +#else +#define TORCH_CHECK_IF_NOT_ON_CUDA(cond, ...) TORCH_CHECK(cond, ##__VA_ARGS__) +#endif + +// Debug only version of TORCH_INTERNAL_ASSERT. This macro only checks in debug +// build, and does nothing in release build. It is appropriate to use +// in situations where you want to add an assert to a hotpath, but it is +// too expensive to run this assert on production builds. +#ifdef NDEBUG +// Optimized version - generates no code. +#define TORCH_INTERNAL_ASSERT_DEBUG_ONLY(...) \ + while (false) \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)) +#else +#define TORCH_INTERNAL_ASSERT_DEBUG_ONLY(...) \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)) +#endif + +// TODO: We're going to get a lot of similar looking string literals +// this way; check if this actually affects binary size. + +// Like TORCH_CHECK, but raises LinAlgError instead of Error. +#define TORCH_CHECK_LINALG(cond, ...) \ + TORCH_CHECK_WITH_MSG(LinAlgError, cond, "LINALG", __VA_ARGS__) + +// Like TORCH_CHECK, but raises IndexErrors instead of Errors. +#define TORCH_CHECK_INDEX(cond, ...) \ + TORCH_CHECK_WITH_MSG(IndexError, cond, "INDEX", __VA_ARGS__) + +// Like TORCH_CHECK, but raises ValueErrors instead of Errors. +#define TORCH_CHECK_VALUE(cond, ...) \ + TORCH_CHECK_WITH_MSG(ValueError, cond, "VALUE", __VA_ARGS__) + +// Like TORCH_CHECK, but raises TypeErrors instead of Errors. +#define TORCH_CHECK_TYPE(cond, ...) \ + TORCH_CHECK_WITH_MSG(TypeError, cond, "TYPE", __VA_ARGS__) + +// Like TORCH_CHECK, but raises NotImplementedErrors instead of Errors. +#define TORCH_CHECK_NOT_IMPLEMENTED(cond, ...) \ + TORCH_CHECK_WITH_MSG(NotImplementedError, cond, "TYPE", __VA_ARGS__) + +// Like TORCH_CHECK, but raises BufferError instead of Errors. 
+#define TORCH_CHECK_BUFFER(cond, ...) \ + TORCH_CHECK_WITH_MSG(BufferError, cond, "TYPE", __VA_ARGS__) + +#define TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE(cond, ...) \ + TORCH_CHECK_WITH_MSG( \ + ErrorAlwaysShowCppStacktrace, cond, "TYPE", ##__VA_ARGS__) + +#ifdef STRIP_ERROR_MESSAGES +#define WARNING_MESSAGE_STRING(...) \ + ::c10::detail::CompileTimeEmptyString {} +#else +#define WARNING_MESSAGE_STRING(...) ::c10::str(__VA_ARGS__) +#endif + +// Report a warning to the user. Accepts an arbitrary number of extra +// arguments which are concatenated into the warning message using operator<< +// +#ifdef DISABLE_WARN +#define _TORCH_WARN_WITH(...) ((void)0); +#else +#define _TORCH_WARN_WITH(warning_t, ...) \ + ::c10::warn(::c10::Warning( \ + warning_t(), \ + {__func__, __FILE__, static_cast(__LINE__)}, \ + WARNING_MESSAGE_STRING(__VA_ARGS__), \ + false)); +#endif + +#define TORCH_WARN(...) _TORCH_WARN_WITH(::c10::UserWarning, __VA_ARGS__); + +#define TORCH_WARN_DEPRECATION(...) \ + _TORCH_WARN_WITH(::c10::DeprecationWarning, __VA_ARGS__); + +// Report a warning to the user only once. Accepts an arbitrary number of extra +// arguments which are concatenated into the warning message using operator<< +// +#define _TORCH_WARN_ONCE(...) \ + [[maybe_unused]] static const auto C10_ANONYMOUS_VARIABLE( \ + torch_warn_once_) = [&] { \ + TORCH_WARN(__VA_ARGS__); \ + return true; \ + }() + +#ifdef DISABLE_WARN +#define TORCH_WARN_ONCE(...) ((void)0); +#else +#define TORCH_WARN_ONCE(...) \ + if (::c10::WarningUtils::get_warnAlways()) { \ + TORCH_WARN(__VA_ARGS__); \ + } else { \ + _TORCH_WARN_ONCE(__VA_ARGS__); \ + } +#endif + +// Report an error with a specific argument +// NOTE: using the argument name in TORCH_CHECK's message is preferred +#define TORCH_CHECK_ARG(cond, argN, ...) \ + TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__) + +#ifndef FATAL_IF +#ifdef C10_USE_GLOG +#define FATAL_IF(condition) \ + condition ? 
(void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \ + .stream() +#else +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream() +#endif +#endif + +#ifndef NON_FATAL_IF +#ifdef C10_USE_GLOG +#define NON_FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger( \ + __FILE__, __LINE__, ::google::GLOG_FATAL, false) \ + .stream() +#else +#define NON_FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \ + .stream() +#endif +#endif + +// Binary comparison check macros +#define TORCH_CHECK_OP(val1, val2, op) \ + NON_FATAL_IF(((val1)op(val2))) \ + << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \ + << (val2) << "). " + +#define TORCH_DCHECK_OP(val1, val2, op) \ + FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \ + << (val1) << " vs. " << (val2) << "). 
" + +#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==) +#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=) +#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=) +#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <) +#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=) +#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >) + +// Debug versions of TORCH_CHECK_OP macros +#ifndef NDEBUG +#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >) +#else // !NDEBUG +// Optimized versions - generate no code +#define TORCH_DCHECK_EQ(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >) +#endif // NDEBUG + +// Null pointer check macro +#define TORCH_CHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false) + +#ifndef NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true) +#else // !NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + while (false) \ + TORCH_CHECK_NOTNULL(val) +#endif // NDEBUG + +// ---------------------------------------------------------------------------- +// Deprecated macros +// 
---------------------------------------------------------------------------- + +namespace c10::detail { + +/* +// Deprecation disabled until we fix sites in our codebase +[[deprecated("AT_ERROR(msg) is deprecated, use TORCH_CHECK(false, msg) +instead.")]] +*/ +inline void deprecated_AT_ERROR() {} + +/* +// Deprecation disabled until we fix sites in our codebase +[[deprecated("AT_ASSERT is deprecated, if you mean to indicate an +internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user +error checking, use " \ "TORCH_CHECK. See +https://github.com/pytorch/pytorch/issues/20287 for more details.")]] +*/ +inline void deprecated_AT_ASSERT() {} + +/* +// Deprecation disabled until we fix sites in our codebase +[[deprecated("AT_ASSERTM is deprecated, if you mean to indicate an +internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user +error checking, use " \ "TORCH_CHECK. See +https://github.com/pytorch/pytorch/issues/20287 for more details.")]] +*/ +inline void deprecated_AT_ASSERTM() {} + +} // namespace c10::detail + +// Deprecated alias; this alias was deprecated because people kept mistakenly +// using it for user error checking. Use TORCH_INTERNAL_ASSERT or TORCH_CHECK +// instead. See https://github.com/pytorch/pytorch/issues/20287 for more +// details. +#define AT_ASSERT(...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERT(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)); \ + } while (false) + +// Deprecated alias, like AT_ASSERT. The new TORCH_INTERNAL_ASSERT macro +// supports both 0-ary and variadic calls, so having a separate +// message-accepting macro is not necessary. +// +// NB: we MUST include cond explicitly here, as MSVC will miscompile the macro +// expansion, shunting all of __VA_ARGS__ to cond. An alternate workaround +// can be seen at +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define AT_ASSERTM(cond, ...) 
\ + do { \ + ::c10::detail::deprecated_AT_ASSERTM(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__)); \ + } while (false) + +// Deprecated alias; this alias was deprecated because it represents extra API +// surface that makes it hard for people to understand what macro to use. +// Use TORCH_CHECK(false, ...) or TORCH_INTERNAL_ASSERT(false, ...) to +// unconditionally fail at a line of code. +#define AT_ERROR(...) \ + do { \ + ::c10::detail::deprecated_AT_ERROR(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \ + } while (false) + +#endif // C10_UTIL_EXCEPTION_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ExclusivelyOwned.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ExclusivelyOwned.h new file mode 100644 index 0000000000000000000000000000000000000000..24cdba8d3ea3d9850b673974971c9eca37ff365f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ExclusivelyOwned.h @@ -0,0 +1,145 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +// See example implementation in TensorBase.h and TensorBody.h. +// Synopsis: +// +// repr_type -- type to use to store an owned T in ExclusivelyOwned. +// +// pointer_type -- pointer-esque type to return from +// ExclusivelyOwned's get() and operator*() methods. +// +// const_pointer_type -- similar to pointer_type, used for the const methods. +// +// static repr_type nullRepr() -- return a null instance of repr_type. +// +// template +// static repr_type createInPlace(Args&&... args) -- used by the in-place +// ExclusivelyOwned constructor. 
+// +// static repr_type moveToRepr(T&& x) -- move the given x into an +// instance of repr_type. used by the ExclusivelyOwned(T&&) +// constructor. +// +// static void destroyOwned(repr_type x) -- free memory for a +// known-exclusively-owned instance of x. Replaces calling repr_type's +// destructor. Being able to implement this more efficiently than +// repr_type's destructor is the main reason to use ExclusivelyOwned +// for a type. +// +// static T take(repr_type&) -- move out of the given repr_type into an owned T. +// +// static pointer_type getImpl(const repr_type&) -- return a pointer +// to the given repr_type. May take repr_type by value if that is more +// efficient. +template +struct ExclusivelyOwnedTraits; + +/// ExclusivelyOwned is a smart-pointer-like wrapper around an +/// exclusively-owned instance of some type T that normally has +/// mandatory reference counting (currently just Tensor). If you have +/// an isolated piece of code that knows that it has sole ownership of +/// an object of one of these types (i.e., because you created it +/// directly or using a factory function) and that object will not +/// escape from that isolated piece of code, then moving the object +/// into an ExclusivelyOwned will avoid an atomic reference count +/// decrement at destruction time. +/// +/// If you directly create the Tensor in the first +/// place, you can use the in_place constructor of ExclusivelyOwned to +/// additionally avoid doing any stores to initialize the refcount & +/// weakcount. +template +class ExclusivelyOwned { + using EOT = ExclusivelyOwnedTraits; + typename ExclusivelyOwnedTraits::repr_type repr_; + + public: + ExclusivelyOwned() : repr_(EOT::nullRepr()) {} + + explicit ExclusivelyOwned(T&& t) : repr_(EOT::moveToRepr(std::move(t))) {} + + template + explicit ExclusivelyOwned(std::in_place_t /*unused*/, Args&&... 
args) + : repr_(EOT::createInPlace(std::forward(args)...)) {} + + ExclusivelyOwned(const ExclusivelyOwned&) = delete; + + ExclusivelyOwned(ExclusivelyOwned&& rhs) noexcept + : repr_(std::move(rhs.repr_)) { + rhs.repr_ = EOT::nullRepr(); + } + + ExclusivelyOwned& operator=(const ExclusivelyOwned&) = delete; + + ExclusivelyOwned& operator=(ExclusivelyOwned&& rhs) noexcept { + EOT::destroyOwned(repr_); + repr_ = std::move(rhs.repr_); + rhs.repr_ = EOT::nullRepr(); + return *this; + } + + ExclusivelyOwned& operator=(T&& rhs) noexcept { + EOT::destroyOwned(repr_); + repr_ = EOT::moveToRepr(std::move(rhs)); + return *this; + } + + ~ExclusivelyOwned() { + EOT::destroyOwned(repr_); + // Don't bother to call the destructor of repr_, since we already + // did specialized destruction for the exclusively-owned case in + // destroyOwned! + } + + // We don't provide this because it would require us to be able to + // differentiate an owned-but-empty T from a lack of T. This is + // particularly problematic for Tensor, which wants to use an + // undefined Tensor as its null state. + explicit operator bool() const noexcept = delete; + + operator T() && { + return take(); + } + + // NOTE: the equivalent operation on MaybeOwned is a moving + // operator*. For ExclusivelyOwned, take() and operator*() may well + // have different return types, so they are different functions. 
+ T take() && { + return EOT::take(repr_); + } + + typename EOT::const_pointer_type operator->() const { + return get(); + } + + typename EOT::const_pointer_type get() const { + return EOT::getImpl(repr_); + } + + typename EOT::pointer_type operator->() { + return get(); + } + + typename EOT::pointer_type get() { + return EOT::getImpl(repr_); + } + + std::remove_pointer_t& operator*() const { + return *get(); + } + + std::remove_pointer_t& operator*() { + return *get(); + } +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/FileSystem.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/FileSystem.h new file mode 100644 index 0000000000000000000000000000000000000000..964c57668f629d342576a50f173247192e9f6c4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/FileSystem.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Shim header for filesystem for compilers that are too old to have it not +// in the experimental namespace + +#if __has_include() +#include +#elif __has_include() +#include +#else +#error "Neither nor is available." +#endif + +namespace c10 { + +#if __has_include() +// NOLINTNEXTLINE(misc-unused-alias-decls) +namespace filesystem = std::filesystem; +#elif __has_include() +// NOLINTNEXTLINE(misc-unused-alias-decls) +namespace filesystem = std::experimental::filesystem; +#endif + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Flags.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Flags.h new file mode 100644 index 0000000000000000000000000000000000000000..c2485bfdebae3a17f0fc8131cfcf24c01052c2a9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Flags.h @@ -0,0 +1,247 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef C10_UTIL_FLAGS_H_ +#define C10_UTIL_FLAGS_H_ + +/* Commandline flags support for C10. + * + * This is a portable commandline flags tool for c10, so we can optionally + * choose to use gflags or a lightweight custom implementation if gflags is + * not possible on a certain platform. If you have gflags installed, set the + * macro C10_USE_GFLAGS will seamlessly route everything to gflags. + * + * To define a flag foo of type bool default to true, do the following in the + * *global* namespace: + * C10_DEFINE_bool(foo, true, "An example."); + * + * To use it in another .cc file, you can use C10_DECLARE_* as follows: + * C10_DECLARE_bool(foo); + * + * In both cases, you can then access the flag via FLAGS_foo. + * + * It is recommended that you build with gflags. To learn more about the flags + * usage, refer to the gflags page here: + * + * https://gflags.github.io/gflags/ + * + * Note about Python users / devs: gflags is initiated from a C++ function + * ParseCommandLineFlags, and is usually done in native binaries in the main + * function. As Python does not have a modifiable main function, it is usually + * difficult to change the flags after Python starts. Hence, it is recommended + * that one sets the default value of the flags to one that's acceptable in + * general - that will allow Python to run without wrong flags. 
+ */ + +#include +#include + +#include + +namespace c10 { +/** + * Sets the usage message when a commandline tool is called with "--help". + */ +C10_API void SetUsageMessage(const std::string& str); + +/** + * Returns the usage message for the commandline tool set by SetUsageMessage. + */ +C10_API const char* UsageMessage(); + +/** + * Parses the commandline flags. + * + * This command parses all the commandline arguments passed in via pargc + * and argv. Once it is finished, partc and argv will contain the remaining + * commandline args that c10 does not deal with. Note that following + * convention, argv[0] contains the binary name and is not parsed. + */ +C10_API bool ParseCommandLineFlags(int* pargc, char*** pargv); + +/** + * Checks if the commandline flags has already been passed. + */ +C10_API bool CommandLineFlagsHasBeenParsed(); + +} // namespace c10 + +//////////////////////////////////////////////////////////////////////////////// +// Below are gflags and non-gflags specific implementations. +// In general, they define the following macros for one to declare (use +// C10_DECLARE) or define (use C10_DEFINE) flags: +// C10_{DECLARE,DEFINE}_{int,int64,double,bool,string} +//////////////////////////////////////////////////////////////////////////////// + +#ifdef C10_USE_GFLAGS + +//////////////////////////////////////////////////////////////////////////////// +// Begin gflags section: most functions are basically rerouted to gflags. +//////////////////////////////////////////////////////////////////////////////// +#include + +// C10 uses hidden visibility by default. However, in gflags, it only uses +// export on Windows platform (with dllexport) but not on linux/mac (with +// default visibility). As a result, to ensure that we are always exporting +// global variables, we will redefine the GFLAGS_DLL_DEFINE_FLAG macro if we +// are building C10 as a shared library. 
+// This has to be done after the inclusion of gflags, because some early +// versions of gflags.h (e.g. 2.0 on ubuntu 14.04) directly defines the +// macros, so we need to do definition after gflags is done. +#ifdef GFLAGS_DLL_DEFINE_FLAG +#undef GFLAGS_DLL_DEFINE_FLAG +#endif // GFLAGS_DLL_DEFINE_FLAG +#ifdef GFLAGS_DLL_DECLARE_FLAG +#undef GFLAGS_DLL_DECLARE_FLAG +#endif // GFLAGS_DLL_DECLARE_FLAG +#define GFLAGS_DLL_DEFINE_FLAG C10_EXPORT +#define GFLAGS_DLL_DECLARE_FLAG C10_IMPORT + +// gflags before 2.0 uses namespace google and after 2.1 uses namespace gflags. +// Using GFLAGS_GFLAGS_H_ to capture this change. +#ifndef GFLAGS_GFLAGS_H_ +namespace gflags = google; +#endif // GFLAGS_GFLAGS_H_ + +// Motivation about the gflags wrapper: +// (1) We would need to make sure that the gflags version and the non-gflags +// version of C10 are going to expose the same flags abstraction. One should +// explicitly use FLAGS_flag_name to access the flags. +// (2) For flag names, it is recommended to start with c10_ to distinguish it +// from regular gflags flags. For example, do +// C10_DEFINE_BOOL(c10_my_flag, true, "An example"); +// to allow one to use FLAGS_c10_my_flag. +// (3) Gflags has a design issue that does not properly expose the global flags, +// if one builds the library with -fvisibility=hidden. The current gflags (as of +// Aug 2018) only deals with the Windows case using dllexport, and not the Linux +// counterparts. As a result, we will explicitly use C10_EXPORT to export the +// flags defined in C10. This is done via a global reference, so the flag +// itself is not duplicated - under the hood it is the same global gflags flag. 
+#define C10_GFLAGS_DEF_WRAPPER(type, real_type, name, default_value, help_str) \ + DEFINE_##type(name, default_value, help_str); + +#define C10_DEFINE_int(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(int32, gflags::int32, name, default_value, help_str) +#define C10_DEFINE_int32(name, default_value, help_str) \ + C10_DEFINE_int(name, default_value, help_str) +#define C10_DEFINE_int64(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(int64, gflags::int64, name, default_value, help_str) +#define C10_DEFINE_double(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(double, double, name, default_value, help_str) +#define C10_DEFINE_bool(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(bool, bool, name, default_value, help_str) +#define C10_DEFINE_string(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(string, ::fLS::clstring, name, default_value, help_str) + +// DECLARE_typed_var should be used in header files and in the global namespace. +#define C10_GFLAGS_DECLARE_WRAPPER(type, real_type, name) DECLARE_##type(name); + +#define C10_DECLARE_int(name) \ + C10_GFLAGS_DECLARE_WRAPPER(int32, gflags::int32, name) +#define C10_DECLARE_int32(name) C10_DECLARE_int(name) +#define C10_DECLARE_int64(name) \ + C10_GFLAGS_DECLARE_WRAPPER(int64, gflags::int64, name) +#define C10_DECLARE_double(name) \ + C10_GFLAGS_DECLARE_WRAPPER(double, double, name) +#define C10_DECLARE_bool(name) C10_GFLAGS_DECLARE_WRAPPER(bool, bool, name) +#define C10_DECLARE_string(name) \ + C10_GFLAGS_DECLARE_WRAPPER(string, ::fLS::clstring, name) + +#define TORCH_DECLARE_int(name) C10_DECLARE_int(name) +#define TORCH_DECLARE_int32(name) C10_DECLARE_int32(name) +#define TORCH_DECLARE_int64(name) C10_DECLARE_int64(name) +#define TORCH_DECLARE_double(name) C10_DECLARE_double(name) +#define TORCH_DECLARE_bool(name) C10_DECLARE_bool(name) +#define TORCH_DECLARE_string(name) C10_DECLARE_string(name) + 
+//////////////////////////////////////////////////////////////////////////////// +// End gflags section. +//////////////////////////////////////////////////////////////////////////////// + +#else // C10_USE_GFLAGS + +//////////////////////////////////////////////////////////////////////////////// +// Begin non-gflags section: providing equivalent functionality. +//////////////////////////////////////////////////////////////////////////////// + +namespace c10 { + +class C10_API C10FlagParser { + public: + bool success() { + return success_; + } + + protected: + template + bool Parse(const std::string& content, T* value); + bool success_{false}; +}; + +C10_DECLARE_REGISTRY(C10FlagsRegistry, C10FlagParser, const std::string&); + +} // namespace c10 + +// The macros are defined outside the c10 namespace. In your code, you should +// write the C10_DEFINE_* and C10_DECLARE_* macros outside any namespace +// as well. + +#define C10_DEFINE_typed_var(type, name, default_value, help_str) \ + C10_EXPORT type FLAGS_##name = default_value; \ + namespace c10 { \ + namespace { \ + class C10FlagParser_##name : public C10FlagParser { \ + public: \ + explicit C10FlagParser_##name(const std::string& content) { \ + success_ = C10FlagParser::Parse(content, &FLAGS_##name); \ + } \ + }; \ + RegistererC10FlagsRegistry g_C10FlagsRegistry_##name( \ + #name, \ + C10FlagsRegistry(), \ + RegistererC10FlagsRegistry::DefaultCreator, \ + "(" #type ", default " #default_value ") " help_str); \ + } \ + } + +#define C10_DEFINE_int(name, default_value, help_str) \ + C10_DEFINE_typed_var(int, name, default_value, help_str) +#define C10_DEFINE_int32(name, default_value, help_str) \ + C10_DEFINE_int(name, default_value, help_str) +#define C10_DEFINE_int64(name, default_value, help_str) \ + C10_DEFINE_typed_var(int64_t, name, default_value, help_str) +#define C10_DEFINE_double(name, default_value, help_str) \ + C10_DEFINE_typed_var(double, name, default_value, help_str) +#define C10_DEFINE_bool(name, 
default_value, help_str) \ + C10_DEFINE_typed_var(bool, name, default_value, help_str) +#define C10_DEFINE_string(name, default_value, help_str) \ + C10_DEFINE_typed_var(std::string, name, default_value, help_str) + +// DECLARE_typed_var should be used in header files and in the global namespace. +#define C10_DECLARE_typed_var(type, name) C10_API extern type FLAGS_##name + +#define C10_DECLARE_int(name) C10_DECLARE_typed_var(int, name) +#define C10_DECLARE_int32(name) C10_DECLARE_int(name) +#define C10_DECLARE_int64(name) C10_DECLARE_typed_var(int64_t, name) +#define C10_DECLARE_double(name) C10_DECLARE_typed_var(double, name) +#define C10_DECLARE_bool(name) C10_DECLARE_typed_var(bool, name) +#define C10_DECLARE_string(name) C10_DECLARE_typed_var(std::string, name) + +#define TORCH_DECLARE_typed_var(type, name) TORCH_API extern type FLAGS_##name + +#define TORCH_DECLARE_int(name) TORCH_DECLARE_typed_var(int, name) +#define TORCH_DECLARE_int32(name) TORCH_DECLARE_int(name) +#define TORCH_DECLARE_int64(name) TORCH_DECLARE_typed_var(int64_t, name) +#define TORCH_DECLARE_double(name) TORCH_DECLARE_typed_var(double, name) +#define TORCH_DECLARE_bool(name) TORCH_DECLARE_typed_var(bool, name) +#define TORCH_DECLARE_string(name) TORCH_DECLARE_typed_var(std::string, name) + +//////////////////////////////////////////////////////////////////////////////// +// End non-gflags section. +//////////////////////////////////////////////////////////////////////////////// + +#endif // C10_USE_GFLAGS + +#endif // C10_UTIL_FLAGS_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float4_e2m1fn_x2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float4_e2m1fn_x2.h new file mode 100644 index 0000000000000000000000000000000000000000..fd690e5aa345ac097a2b4022b6e5a42677e403f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float4_e2m1fn_x2.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fn.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fn.h new file mode 100644 index 0000000000000000000000000000000000000000..ed07b955168f7ab08b4a20657d8f36ea7cd4123c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fn.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h new file mode 100644 index 0000000000000000000000000000000000000000..30481a62430fdf08f2107bc1ab50e811314767f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2-inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..f4e0802e2f7b1a6712f95dea5b82267d8a8498dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2-inl.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2.h new file mode 100644 index 0000000000000000000000000000000000000000..f4e0802e2f7b1a6712f95dea5b82267d8a8498dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h new file mode 100644 index 0000000000000000000000000000000000000000..f3e8c25099a630204f3c4ee345fd2a3653c14116 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e8m0fnu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e8m0fnu.h new file mode 100644 index 0000000000000000000000000000000000000000..030b23d64750b7378c8fc281c96d2fe662e38d88 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e8m0fnu.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Half-inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Half-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..78c3d37c1698db15f05b3b3367765075be2d9046 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Half-inl.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IdWrapper.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IdWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..b985cd3e51c325b50dd5ee368c216689888123d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IdWrapper.h @@ -0,0 +1,82 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +/** + * This template simplifies generation of simple classes that wrap an id + * in a typesafe way. Namely, you can use it to create a very lightweight + * type that only offers equality comparators and hashing. Example: + * + * struct MyIdType final : IdWrapper { + * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} + * }; + * + * Then in the global top level namespace: + * + * C10_DEFINE_HASH_FOR_IDWRAPPER(MyIdType); + * + * That's it - equality operators and hash functions are automatically defined + * for you, given the underlying type supports it. + */ +template +class IdWrapper { + public: + using underlying_type = UnderlyingType; + using concrete_type = ConcreteType; + + protected: + constexpr explicit IdWrapper(underlying_type id) noexcept( + noexcept(underlying_type(std::declval()))) + : id_(id) {} + + constexpr underlying_type underlyingId() const + noexcept(noexcept(underlying_type(std::declval()))) { + return id_; + } + + private: + friend size_t hash_value(const concrete_type& v) { + return std::hash()(v.id_); + } + + // TODO Making operator== noexcept if underlying type is noexcept equality + // comparable doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. 
+ friend constexpr bool operator==( + const concrete_type& lhs, + const concrete_type& rhs) noexcept { + return lhs.id_ == rhs.id_; + } + + // TODO Making operator!= noexcept if operator== is noexcept doesn't work with + // GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator!=( + const concrete_type& lhs, + const concrete_type& rhs) noexcept { + return !(lhs == rhs); + } + + underlying_type id_; +}; + +} // namespace c10 + +#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ + namespace std { \ + template <> \ + struct hash { \ + size_t operator()(ClassName x) const { \ + return hash_value(x); \ + } \ + }; \ + } + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IntrusiveList.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IntrusiveList.h new file mode 100644 index 0000000000000000000000000000000000000000..a28803082f7b641b92dae8acf320b7b9be348d74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IntrusiveList.h @@ -0,0 +1,211 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +template +class IntrusiveList; + +class IntrusiveListHook { + template + friend class ListIterator; + + template + friend class IntrusiveList; + + IntrusiveListHook* next_{nullptr}; + IntrusiveListHook* prev_{nullptr}; + + void link_before(IntrusiveListHook* next_node) { + next_ = next_node; + prev_ = next_node->prev_; + next_node->prev_ = this; + prev_->next_ = this; + } + + public: + IntrusiveListHook() : next_(this), prev_(this) {} + + IntrusiveListHook(const IntrusiveListHook&) = delete; + IntrusiveListHook& operator=(const IntrusiveListHook&) = delete; + 
IntrusiveListHook(IntrusiveListHook&&) = delete; + IntrusiveListHook& operator=(IntrusiveListHook&&) = delete; + + void unlink() { + TORCH_CHECK(is_linked()); + next_->prev_ = prev_; + prev_->next_ = next_; + next_ = this; + prev_ = this; + } + + ~IntrusiveListHook() { + if (is_linked()) { + unlink(); + } + } + + bool is_linked() const { + return next_ != this; + } +}; + +template +class ListIterator { + static_assert(std::is_same_v, IntrusiveListHook>); + static_assert(std::is_base_of_v); + P* ptr_; + + friend class IntrusiveList; + + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = std::conditional_t, const T, T>; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + explicit ListIterator(P* ptr) : ptr_(ptr) {} + ~ListIterator() = default; + + ListIterator(const ListIterator&) = default; + ListIterator& operator=(const ListIterator&) = default; + ListIterator(ListIterator&&) = default; + ListIterator& operator=(ListIterator&&) = default; + + template < + typename Q, + class = std::enable_if_t && !std::is_const_v>> + ListIterator(const ListIterator& rhs) : ptr_(rhs.ptr_) {} + + template < + typename Q, + class = std::enable_if_t && !std::is_const_v>> + ListIterator& operator=(const ListIterator& rhs) { + ptr_ = rhs.ptr_; + return *this; + } + + template + bool operator==(const ListIterator& other) const { + return ptr_ == other.ptr_; + } + + template + bool operator!=(const ListIterator& other) const { + return !(*this == other); + } + + auto& operator*() const { + return static_cast(*ptr_); + } + + ListIterator& operator++() { + TORCH_CHECK(ptr_); + ptr_ = ptr_->next_; + return *this; + } + + ListIterator& operator--() { + TORCH_CHECK(ptr_); + ptr_ = ptr_->prev_; + return *this; + } + + auto* operator->() const { + return static_cast(ptr_); + } +}; + +template +class IntrusiveList { + static_assert(std::is_base_of_v); + + public: + IntrusiveList() = default; + 
IntrusiveList(const std::initializer_list>& items) { + for (auto& item : items) { + insert(this->end(), item); + } + } + ~IntrusiveList() { + while (head_.is_linked()) { + head_.next_->unlink(); + } + } + IntrusiveList(const IntrusiveList&) = delete; + IntrusiveList& operator=(const IntrusiveList&) = delete; + IntrusiveList(IntrusiveList&&) = delete; + IntrusiveList& operator=(IntrusiveList&&) = delete; + + using iterator = ListIterator; + using const_iterator = ListIterator; + + auto begin() const { + return ++const_iterator{&head_}; + } + + auto begin() { + return ++iterator{&head_}; + } + + auto end() const { + return const_iterator{&head_}; + } + + auto end() { + return iterator{&head_}; + } + + auto rbegin() const { + return std::reverse_iterator{end()}; + } + + auto rbegin() { + return std::reverse_iterator{end()}; + } + + auto rend() const { + return std::reverse_iterator{begin()}; + } + + auto rend() { + return std::reverse_iterator{begin()}; + } + + auto iterator_to(const T& n) const { + return const_iterator{&n}; + } + + auto iterator_to(T& n) { + return iterator{&n}; + } + + iterator insert(iterator pos, T& n) { + n.link_before(pos.ptr_); + return iterator{&n}; + } + + size_t size() const { + size_t ret = 0; + for ([[maybe_unused]] auto& _ : *this) { + ret++; + } + return ret; + } + + bool empty() const { + return !head_.is_linked(); + } + + private: + IntrusiveListHook head_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/MathConstants.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/MathConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..f3e86ce2e1da5bc4d1d40ad22e4f31280ac16c2e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/MathConstants.h @@ -0,0 +1,147 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +namespace c10 { +// TODO: Replace me with inline constexpr variable when C++17 becomes available +namespace detail { +template +C10_HOST_DEVICE inline constexpr T e() { + return static_cast(2.718281828459045235360287471352662); +} + +template +C10_HOST_DEVICE inline constexpr T euler() { + return static_cast(0.577215664901532860606512090082402); +} + +template +C10_HOST_DEVICE inline constexpr T frac_1_pi() { + return static_cast(0.318309886183790671537767526745028); +} + +template +C10_HOST_DEVICE inline constexpr T frac_1_sqrt_pi() { + return static_cast(0.564189583547756286948079451560772); +} + +template +C10_HOST_DEVICE inline constexpr T frac_sqrt_2() { + return static_cast(0.707106781186547524400844362104849); +} + +template +C10_HOST_DEVICE inline constexpr T frac_sqrt_3() { + return static_cast(0.577350269189625764509148780501957); +} + +template +C10_HOST_DEVICE inline constexpr T golden_ratio() { + return static_cast(1.618033988749894848204586834365638); +} + +template +C10_HOST_DEVICE inline constexpr T ln_10() { + return static_cast(2.302585092994045684017991454684364); +} + +template +C10_HOST_DEVICE inline constexpr T ln_2() { + return 
static_cast(0.693147180559945309417232121458176); +} + +template +C10_HOST_DEVICE inline constexpr T log_10_e() { + return static_cast(0.434294481903251827651128918916605); +} + +template +C10_HOST_DEVICE inline constexpr T log_2_e() { + return static_cast(1.442695040888963407359924681001892); +} + +template +C10_HOST_DEVICE inline constexpr T pi() { + return static_cast(3.141592653589793238462643383279502); +} + +template +C10_HOST_DEVICE inline constexpr T sqrt_2() { + return static_cast(1.414213562373095048801688724209698); +} + +template +C10_HOST_DEVICE inline constexpr T sqrt_3() { + return static_cast(1.732050807568877293527446341505872); +} + +template <> +C10_HOST_DEVICE inline constexpr BFloat16 pi() { + // According to + // https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Special_values + // pi is encoded as 4049 + return BFloat16(0x4049, BFloat16::from_bits()); +} + +template <> +C10_HOST_DEVICE inline constexpr Half pi() { + return Half(0x4248, Half::from_bits()); +} +} // namespace detail + +template +constexpr T e = c10::detail::e(); + +template +constexpr T euler = c10::detail::euler(); + +template +constexpr T frac_1_pi = c10::detail::frac_1_pi(); + +template +constexpr T frac_1_sqrt_pi = c10::detail::frac_1_sqrt_pi(); + +template +constexpr T frac_sqrt_2 = c10::detail::frac_sqrt_2(); + +template +constexpr T frac_sqrt_3 = c10::detail::frac_sqrt_3(); + +template +constexpr T golden_ratio = c10::detail::golden_ratio(); + +template +constexpr T ln_10 = c10::detail::ln_10(); + +template +constexpr T ln_2 = c10::detail::ln_2(); + +template +constexpr T log_10_e = c10::detail::log_10_e(); + +template +constexpr T log_2_e = c10::detail::log_2_e(); + +template +constexpr T pi = c10::detail::pi(); + +template +constexpr T sqrt_2 = c10::detail::sqrt_2(); + +template +constexpr T sqrt_3 = c10::detail::sqrt_3(); +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Optional.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Optional.h new file mode 100644 index 0000000000000000000000000000000000000000..55c4697368c60f86b69db1b1bc65cf0cb2e99404 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Optional.h @@ -0,0 +1,65 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef C10_UTIL_OPTIONAL_H_ +#define C10_UTIL_OPTIONAL_H_ + +#include +#include + +// Macros.h is not needed, but it does namespace shenanigans that lots +// of downstream code seems to rely on. Feel free to remove it and fix +// up builds. + +namespace c10 { + +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::bad_optional_access; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::make_optional; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::nullopt; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::nullopt_t; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::optional; +#endif + +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) + +namespace detail_ { +// the call to convert(b) has return type A and converts b to type A iff b +// decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { + return v; +} +} // namespace detail_ +template +[[deprecated( + "Please use std::optional::value_or instead of c10::value_or_else")]] constexpr T +value_or_else(const std::optional& v, F&& func) { + static_assert( + std::is_convertible_v, T>, + "func parameters must be a callable that returns a type convertible to the value stored in the optional"); + return v.has_value() ? 
*v : detail_::convert(std::forward(func)()); +} + +template +[[deprecated( + "Please use std::optional::value_or instead of c10::value_or_else")]] constexpr T +value_or_else(std::optional&& v, F&& func) { + static_assert( + std::is_convertible_v, T>, + "func parameters must be a callable that returns a type convertible to the value stored in the optional"); + return v.has_value() ? constexpr_move(std::move(v).contained_val()) + : detail_::convert(std::forward(func)()); +} + +#endif + +} // namespace c10 +#endif // C10_UTIL_OPTIONAL_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ScopeExit.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ScopeExit.h new file mode 100644 index 0000000000000000000000000000000000000000..fa4eaaceadd2588bbe53fcd51d3cbffde5d3b220 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ScopeExit.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { + +/** + * Mostly copied from https://llvm.org/doxygen/ScopeExit_8h_source.html + */ +template +class scope_exit { + Callable ExitFunction; + bool Engaged = true; // False once moved-from or release()d. 
+ + public: + template + // NOLINTNEXTLINE(bugprone-forwarding-reference-overload) + explicit scope_exit(Fp&& F) : ExitFunction(std::forward(F)) {} + + scope_exit(scope_exit&& Rhs) noexcept + : ExitFunction(std::move(Rhs.ExitFunction)), Engaged(Rhs.Engaged) { + Rhs.release(); + } + scope_exit(const scope_exit&) = delete; + scope_exit& operator=(scope_exit&&) = delete; + scope_exit& operator=(const scope_exit&) = delete; + + void release() { + Engaged = false; + } + + ~scope_exit() { + if (Engaged) { + ExitFunction(); + } + } +}; + +// Keeps the callable object that is passed in, and execute it at the +// destruction of the returned object (usually at the scope exit where the +// returned object is kept). +// +// Interface is specified by p0052r2. +template +scope_exit> make_scope_exit(Callable&& F) { + return scope_exit>(std::forward(F)); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallBuffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..1c40d21a692f0470d02d25bc8794f1b8d58c55a0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallBuffer.h @@ -0,0 +1,92 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +/** Helper class for allocating temporary fixed size arrays with SBO. 
+ * + * This is intentionally much simpler than SmallVector, to improve performance + * at the expense of many features: + * - No zero-initialization for numeric types + * - No resizing after construction + * - No copy/move + * - No non-trivial types + */ + +namespace c10 { + +template +class SmallBuffer { + static_assert(std::is_trivial_v, "SmallBuffer is intended for POD types"); + + std::array storage_; + size_t size_{}; + T* data_{}; + + public: + SmallBuffer(size_t size) : size_(size) { + if (size > N) { + data_ = new T[size]; + } else { + data_ = &storage_[0]; + } + } + + SmallBuffer(const SmallBuffer&) = delete; + SmallBuffer& operator=(const SmallBuffer&) = delete; + + // move constructor is needed in function return + SmallBuffer(SmallBuffer&& rhs) noexcept : size_{rhs.size_} { + rhs.size_ = 0; + if (size_ > N) { + data_ = rhs.data_; + rhs.data_ = nullptr; + } else { + storage_ = std::move(rhs.storage_); + data_ = &storage_[0]; + } + } + + SmallBuffer& operator=(SmallBuffer&&) = delete; + + ~SmallBuffer() { + if (size_ > N) { + delete[] data_; + } + } + T& operator[](size_t idx) { + return data()[idx]; + } + const T& operator[](size_t idx) const { + return data()[idx]; + } + T* data() { + return data_; + } + const T* data() const { + return data_; + } + size_t size() const { + return size_; + } + T* begin() { + return data_; + } + const T* begin() const { + return data_; + } + T* end() { + return data_ + size_; + } + const T* end() const { + return data_ + size_; + } +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallVector.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallVector.h new file mode 100644 index 0000000000000000000000000000000000000000..b2a4dbb0f92f530cd21dc8a63ee48f82f430393d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallVector.h @@ -0,0 +1,1472 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// used std::is_trivially_{copy,move}_constructible +// replaced iterator_range constructor with inline Container&& constructor +// replaced LLVM_NODISCARD, LLVM_LIKELY, and LLVM_UNLIKELY with c10 equivalents +// removed LLVM_GSL_OWNER +// added SmallVector::at +// added operator<< for std::ostream +// added C10_API to export SmallVectorBase + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/// This is all the stuff common to all SmallVectors. +/// +/// The template parameter specifies the type which should be used to hold the +/// Size and Capacity of the SmallVector, so it can be adjusted. +/// Using 32 bit size is desirable to shrink the size of the SmallVector. 
+/// Using 64 bit size is desirable for cases like SmallVector, where a +/// 32 bit size would limit the vector to ~4GB. SmallVectors are used for +/// buffering bitcode output - which can exceed 4GB. +template +class C10_API SmallVectorBase { + protected: + void* BeginX; + Size_T Size = 0, Capacity; + + /// The maximum value of the Size_T used. + static constexpr size_t SizeTypeMax() { + return std::numeric_limits::max(); + } + + SmallVectorBase(void* FirstEl, size_t TotalCapacity) + : BeginX(FirstEl), Capacity(TotalCapacity) {} + + /// This is a helper for \a grow() that's out of line to reduce code + /// duplication. This function will report a fatal error if it can't grow at + /// least to \p MinSize. + void* mallocForGrow(size_t MinSize, size_t TSize, size_t& NewCapacity); + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + /// This function will report a fatal error if it cannot increase capacity. + void grow_pod(const void* FirstEl, size_t MinSize, size_t TSize); + + public: + SmallVectorBase() = delete; + size_t size() const { + return Size; + } + size_t capacity() const { + return Capacity; + } + + [[nodiscard]] bool empty() const { + return !Size; + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_t N) { + assert(N <= capacity()); + Size = N; + } +}; + +template +using SmallVectorSizeType = + std::conditional_t= 8, uint64_t, uint32_t>; + +/// Figure out the offset of the first element. 
+template +struct SmallVectorAlignmentAndSize { + // NOLINTNEXTLINE(*c-arrays*) + alignas(SmallVectorBase>) char Base[sizeof( + SmallVectorBase>)]; + // NOLINTNEXTLINE(*c-arrays*) + alignas(T) char FirstEl[sizeof(T)]; +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon + : public SmallVectorBase> { + using Base = SmallVectorBase>; + + /// Find the address of the first element. For this pointer math to be valid + /// with small-size of 0 for T with lots of alignment, it's important that + /// SmallVectorStorage is properly-aligned even for small-size of 0. + void* getFirstEl() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) + + offsetof(SmallVectorAlignmentAndSize, FirstEl))); + } + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {} + + void grow_pod(size_t MinSize, size_t TSize) { + Base::grow_pod(getFirstEl(), MinSize, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return this->BeginX == getFirstEl(); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + this->BeginX = getFirstEl(); + this->Size = this->Capacity = 0; // FIXME: Setting Capacity to 0 is suspect. + } + + /// Return true if V is an internal reference to the given range. + bool isReferenceToRange(const void* V, const void* First, const void* Last) + const { + // Use std::less to avoid UB. + std::less<> LessThan; + return !LessThan(V, First) && LessThan(V, Last); + } + + /// Return true if V is an internal reference to this vector. 
+ bool isReferenceToStorage(const void* V) const { + return isReferenceToRange(V, this->begin(), this->end()); + } + + /// Return true if First and Last form a valid (possibly empty) range in this + /// vector's storage. + bool isRangeInStorage(const void* First, const void* Last) const { + // Use std::less to avoid UB. + std::less<> LessThan; + return !LessThan(First, this->begin()) && !LessThan(Last, First) && + !LessThan(this->end(), Last); + } + + /// Return true unless Elt will be invalidated by resizing the vector to + /// NewSize. + bool isSafeToReferenceAfterResize(const void* Elt, size_t NewSize) { + // Past the end. + if (C10_LIKELY(!isReferenceToStorage(Elt))) + return true; + + // Return false if Elt will be destroyed by shrinking. + if (NewSize <= this->size()) + return Elt < this->begin() + NewSize; + + // Return false if we need to grow. + return NewSize <= this->capacity(); + } + + /// Check whether Elt will be invalidated by resizing the vector to NewSize. + void assertSafeToReferenceAfterResize(const void* Elt, size_t NewSize) { + (void)Elt; // Suppress unused variable warning + (void)NewSize; // Suppress unused variable warning + assert( + isSafeToReferenceAfterResize(Elt, NewSize) && + "Attempting to reference an element of the vector in an operation " + "that invalidates it"); + } + + /// Check whether Elt will be invalidated by increasing the size of the + /// vector by N. + void assertSafeToAdd(const void* Elt, size_t N = 1) { + this->assertSafeToReferenceAfterResize(Elt, this->size() + N); + } + + /// Check whether any part of the range will be invalidated by clearing. 
+ void assertSafeToReferenceAfterClear(const T* From, const T* To) { + if (From == To) + return; + this->assertSafeToReferenceAfterResize(From, 0); + this->assertSafeToReferenceAfterResize(To - 1, 0); + } + template < + class ItTy, + std::enable_if_t, T*>, bool> = + false> + void assertSafeToReferenceAfterClear(ItTy /*unused*/, ItTy /*unused*/) {} + + /// Check whether any part of the range will be invalidated by growing. + void assertSafeToAddRange(const T* From, const T* To) { + if (From == To) + return; + this->assertSafeToAdd(From, To - From); + this->assertSafeToAdd(To - 1, To - From); + } + template < + class ItTy, + std::enable_if_t, T*>, bool> = + false> + void assertSafeToAddRange(ItTy /*unused*/, ItTy /*unused*/) {} + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + template + static const T* reserveForParamAndGetAddressImpl( + U* This, + const T& Elt, + size_t N) { + size_t NewSize = This->size() + N; + if (C10_LIKELY(NewSize <= This->capacity())) + return &Elt; + + bool ReferencesStorage = false; + int64_t Index = -1; + if constexpr (!U::TakesParamByValue) { + if (C10_UNLIKELY(This->isReferenceToStorage(&Elt))) { + ReferencesStorage = true; + Index = &Elt - This->begin(); + } + } + This->grow(NewSize); + return ReferencesStorage ? This->begin() + Index : &Elt; + } + + public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + + using Base::capacity; + using Base::empty; + using Base::size; + + // forward iterator creation methods. 
+ iterator begin() { + return (iterator)this->BeginX; + } + const_iterator begin() const { + return (const_iterator)this->BeginX; + } + iterator end() { + return begin() + size(); + } + const_iterator end() const { + return begin() + size(); + } + + // reverse iterator creation methods. + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + size_type size_in_bytes() const { + return size() * sizeof(T); + } + constexpr size_type max_size() const { + return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T)); + } + + size_t capacity_in_bytes() const { + return capacity() * sizeof(T); + } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { + return pointer(begin()); + } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { + return const_pointer(begin()); + } + + // SmallVector::at is NOT from LLVM. + reference at(size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference at(size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put +/// method implementations that are designed to work with non-trivial T's. 
+/// +/// We approximate is_trivially_copyable with trivial move/copy construction and +/// trivial destruction. While the standard doesn't specify that you're allowed +/// copy these types with memcpy, there is no way for the type to observe this. +/// This catches the important case of std::pair, which is not +/// trivially assignable. +/// +/// XXX: if build fails here fall back to C10_IS_TRIVIALLY_COPYABLE and make a +/// note +template < + typename T, + bool = (std::is_trivially_copy_constructible_v) && + (std::is_trivially_move_constructible_v) && + std::is_trivially_destructible_v> +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + friend class SmallVectorTemplateCommon; + + protected: + static constexpr bool TakesParamByValue = false; + using ValueParamT = const T&; + + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T* S, T* E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy( + std::make_move_iterator(I), std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + + /// Create a new allocation big enough for \p MinSize and pass back its size + /// in \p NewCapacity. This is the first section of \a grow(). 
+ T* mallocForGrow(size_t MinSize, size_t& NewCapacity) { + return static_cast( + SmallVectorBase>::mallocForGrow( + MinSize, sizeof(T), NewCapacity)); + } + + /// Move existing elements over to the new allocation \p NewElts, the middle + /// section of \a grow(). + void moveElementsForGrow(T* NewElts); + + /// Transfer ownership of the allocation, finishing up \a grow(). + void takeAllocationForGrow(T* NewElts, size_t NewCapacity); + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + const T* reserveForParamAndGetAddress(const T& Elt, size_t N = 1) { + return this->reserveForParamAndGetAddressImpl(this, Elt, N); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + T* reserveForParamAndGetAddress(T& Elt, size_t N = 1) { + return const_cast(this->reserveForParamAndGetAddressImpl(this, Elt, N)); + } + + static T&& forward_value_param(T&& V) { + return std::move(V); + } + static const T& forward_value_param(const T& V) { + return V; + } + + void growAndAssign(size_t NumElts, const T& Elt) { + // Grow manually in case Elt is an internal reference. + size_t NewCapacity = 0; + T* NewElts = mallocForGrow(NumElts, NewCapacity); + std::uninitialized_fill_n(NewElts, NumElts, Elt); + this->destroy_range(this->begin(), this->end()); + takeAllocationForGrow(NewElts, NewCapacity); + this->set_size(NumElts); + } + + template + T& growAndEmplaceBack(ArgTypes&&... Args) { + // Grow manually in case one of Args is an internal reference. 
+ size_t NewCapacity = 0; + T* NewElts = mallocForGrow(0, NewCapacity); + ::new ((void*)(NewElts + this->size())) T(std::forward(Args)...); + moveElementsForGrow(NewElts); + takeAllocationForGrow(NewElts, NewCapacity); + this->set_size(this->size() + 1); + return this->back(); + } + + public: + void push_back(const T& Elt) { + const T* EltPtr = reserveForParamAndGetAddress(Elt); + ::new ((void*)this->end()) T(*EltPtr); + this->set_size(this->size() + 1); + } + + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + void push_back(T&& Elt) { + T* EltPtr = reserveForParamAndGetAddress(Elt); + ::new ((void*)this->end()) T(::std::move(*EltPtr)); + this->set_size(this->size() + 1); + } + + void pop_back() { + this->set_size(this->size() - 1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t NewCapacity = 0; + T* NewElts = mallocForGrow(MinSize, NewCapacity); + moveElementsForGrow(NewElts); + takeAllocationForGrow(NewElts, NewCapacity); +} + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::moveElementsForGrow( + T* NewElts) { + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); +} + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::takeAllocationForGrow( + T* NewElts, + size_t NewCapacity) { + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->BeginX = NewElts; + this->Capacity = NewCapacity; +} + +/// SmallVectorTemplateBase - This is where we put +/// method implementations that are designed to work with trivially copyable +/// T's. 
This allows using memcpy in place of copy/move construction and +/// skipping destruction. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + friend class SmallVectorTemplateCommon; + + protected: + /// True if it's cheap enough to take parameters by value. Doing so avoids + /// overhead related to mitigations for reference invalidation. + static constexpr bool TakesParamByValue = sizeof(T) <= 2 * sizeof(void*); + + /// Either const T& or T, depending on whether it's cheap enough to take + /// parameters by value. + using ValueParamT = std::conditional_t; + + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T* /*unused*/, T* /*unused*/) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1* I, + T1* E, + T2* Dest, + std::enable_if_t, T2>>* /*unused*/ + = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. 
+ if (I != E) + memcpy(reinterpret_cast(Dest), I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize, sizeof(T)); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + const T* reserveForParamAndGetAddress(const T& Elt, size_t N = 1) { + return this->reserveForParamAndGetAddressImpl(this, Elt, N); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + T* reserveForParamAndGetAddress(T& Elt, size_t N = 1) { + return const_cast(this->reserveForParamAndGetAddressImpl(this, Elt, N)); + } + + /// Copy \p V or return a reference, depending on \a ValueParamT. + static ValueParamT forward_value_param(ValueParamT V) { + return V; + } + + void growAndAssign(size_t NumElts, T Elt) { + // Elt has been copied in case it's an internal reference, side-stepping + // reference invalidation problems without losing the realloc optimization. + this->set_size(0); + this->grow(NumElts); + std::uninitialized_fill_n(this->begin(), NumElts, Elt); + this->set_size(NumElts); + } + + template + T& growAndEmplaceBack(ArgTypes&&... Args) { + // Use push_back with a copy in case Args has an internal reference, + // side-stepping reference invalidation problems without losing the realloc + // optimization. 
+ push_back(T(std::forward(Args)...)); + return this->back(); + } + + public: + void push_back(ValueParamT Elt) { + const T* EltPtr = reserveForParamAndGetAddress(Elt); + memcpy(reinterpret_cast(this->end()), EltPtr, sizeof(T)); + this->set_size(this->size() + 1); + } + + void pop_back() { + this->set_size(this->size() - 1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. +template +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + + public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using reference = typename SuperClass::reference; + using size_type = typename SuperClass::size_type; + + protected: + using SmallVectorTemplateBase::TakesParamByValue; + using ValueParamT = typename SuperClass::ValueParamT; + + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) : SmallVectorTemplateBase(N) {} + + public: + SmallVectorImpl(const SmallVectorImpl&) = delete; + + ~SmallVectorImpl() { + // Subclass has already destructed this vector's elements. + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->Size = 0; + } + + private: + template + void resizeImpl(size_type N) { + if (N < this->size()) { + this->pop_back_n(this->size() - N); + } else if (N > this->size()) { + this->reserve(N); + for (auto I = this->end(), E = this->begin() + N; I != E; ++I) + if (ForOverwrite) + new (&*I) T; + else + new (&*I) T(); + this->set_size(N); + } + } + + public: + void resize(size_type N) { + resizeImpl(N); + } + + /// Like resize, but \ref T is POD, the new values won't be initialized. 
+ void resize_for_overwrite(size_type N) { + resizeImpl(N); + } + + void resize(size_type N, ValueParamT NV) { + if (N == this->size()) + return; + + if (N < this->size()) { + this->pop_back_n(this->size() - N); + return; + } + + // N > this->size(). Defer to append. + this->append(N - this->size(), NV); + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + void pop_back_n(size_type NumItems) { + assert(this->size() >= NumItems); + this->destroy_range(this->end() - NumItems, this->end()); + this->set_size(this->size() - NumItems); + } + + [[nodiscard]] T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl& RHS) noexcept; + + /// Add the specified range to the end of the SmallVector. + template < + typename in_iter, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + void append(in_iter in_start, in_iter in_end) { + this->assertSafeToAddRange(in_start, in_end); + size_type NumInputs = std::distance(in_start, in_end); + this->reserve(this->size() + NumInputs); + this->uninitialized_copy(in_start, in_end, this->end()); + this->set_size(this->size() + NumInputs); + } + + /// Append \p NumInputs copies of \p Elt to the end. + void append(size_type NumInputs, ValueParamT Elt) { + const T* EltPtr = this->reserveForParamAndGetAddress(Elt, NumInputs); + std::uninitialized_fill_n(this->end(), NumInputs, *EltPtr); + this->set_size(this->size() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + void append(const SmallVectorImpl& RHS) { + append(RHS.begin(), RHS.end()); + } + + void assign(size_type NumElts, ValueParamT Elt) { + // Note that Elt could be an internal reference. + if (NumElts > this->capacity()) { + this->growAndAssign(NumElts, Elt); + return; + } + + // Assign over existing elements. 
+ std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt); + if (NumElts > this->size()) + std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt); + else if (NumElts < this->size()) + this->destroy_range(this->begin() + NumElts, this->end()); + this->set_size(NumElts); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. + + template < + typename in_iter, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + void assign(in_iter in_start, in_iter in_end) { + this->assertSafeToReferenceAfterClear(in_start, in_end); + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + void assign(const SmallVectorImpl& RHS) { + assign(RHS.begin(), RHS.end()); + } + + iterator erase(iterator I) { + assert( + this->isReferenceToStorage(I) && "Iterator to erase is out of bounds."); + + iterator N = I; + // Shift all elts down one. + std::move(I + 1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return N; + } + + iterator erase(iterator S, iterator E) { + assert(this->isRangeInStorage(S, E) && "Range to erase is out of bounds."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->set_size(I - this->begin()); + return N; + } + + private: + template + iterator insert_one_impl(iterator I, ArgType&& Elt) { + // Callers ensure that ArgType is derived from T. + static_assert( + std::is_same>, T>:: + value, + "ArgType must be derived from T!"); + + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::forward(Elt)); + return this->end() - 1; + } + + assert( + this->isReferenceToStorage(I) && + "Insertion iterator is out of bounds."); + + // Grow if necessary. 
+ size_t Index = I - this->begin(); + std::remove_reference_t* EltPtr = + this->reserveForParamAndGetAddress(Elt); + I = this->begin() + Index; + + ::new ((void*)this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end() - 1, this->end()); + this->set_size(this->size() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + static_assert( + !TakesParamByValue || std::is_same_v, + "ArgType must be 'T' when taking by value!"); + if (!TakesParamByValue && this->isReferenceToRange(EltPtr, I, this->end())) + ++EltPtr; + + *I = ::std::forward(*EltPtr); + return I; + } + + public: + iterator insert(iterator I, T&& Elt) { + return insert_one_impl(I, this->forward_value_param(std::move(Elt))); + } + + iterator insert(iterator I, const T& Elt) { + return insert_one_impl(I, this->forward_value_param(Elt)); + } + + iterator insert(iterator I, size_type NumToInsert, ValueParamT Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin() + InsertElt; + } + + assert( + this->isReferenceToStorage(I) && + "Insertion iterator is out of bounds."); + + // Ensure there is enough space, and get the (maybe updated) address of + // Elt. + const T* EltPtr = this->reserveForParamAndGetAddress(Elt, NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. 
+ if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end()) + EltPtr += NumToInsert; + + std::fill_n(I, NumToInsert, *EltPtr); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->set_size(this->size() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end()) + EltPtr += NumToInsert; + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, *EltPtr); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, *EltPtr); + return I; + } + + template < + typename ItTy, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin() + InsertElt; + } + + assert( + this->isReferenceToStorage(I) && + "Insertion iterator is out of bounds."); + + // Check that the reserve that follows doesn't invalidate the iterators. 
+ this->assertSafeToAddRange(From, To); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->set_size(this->size() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // Replace the overwritten part. + for (T* J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; + ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template + reference emplace_back(ArgTypes&&... 
Args) { + if (C10_UNLIKELY(this->size() >= this->capacity())) + return this->growAndEmplaceBack(std::forward(Args)...); + + ::new ((void*)this->end()) T(std::forward(Args)...); + this->set_size(this->size() + 1); + return this->back(); + } + + SmallVectorImpl& operator=(const SmallVectorImpl& RHS); + + SmallVectorImpl& operator=(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_constructible_v && + std::is_nothrow_destructible_v); + + bool operator==(const SmallVectorImpl& RHS) const { + if (this->size() != RHS.size()) + return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl& RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl& RHS) const { + return std::lexicographical_compare( + this->begin(), this->end(), RHS.begin(), RHS.end()); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl& RHS) noexcept { + if (this == &RHS) + return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->Size, RHS.Size); + std::swap(this->Capacity, RHS.Capacity); + return; + } + this->reserve(RHS.size()); + RHS.reserve(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) + NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. 
+ if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end()); + RHS.set_size(RHS.size() + EltDiff); + this->destroy_range(this->begin() + NumShared, this->end()); + this->set_size(NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end()); + this->set_size(this->size() + EltDiff); + this->destroy_range(RHS.begin() + NumShared, RHS.end()); + RHS.set_size(NumShared); + } +} + +template +SmallVectorImpl& SmallVectorImpl::operator=( + const SmallVectorImpl& RHS) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin() + RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->set_size(RHSSize); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->clear(); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. 
+ this->set_size(RHSSize); + return *this; +} + +template +SmallVectorImpl& SmallVectorImpl:: +operator=(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_constructible_v && + std::is_nothrow_destructible_v) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) + free(this->begin()); + this->BeginX = RHS.BeginX; + this->Size = RHS.Size; + this->Capacity = RHS.Capacity; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->set_size(RHSSize); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->clear(); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. + this->set_size(RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements. This is specialized for the N=0 case +/// to avoid allocating unnecessary storage. 
+template +struct SmallVectorStorage { + alignas(T) char InlineElts[N * sizeof(T)]; +}; + +/// We need the storage to be properly aligned even for small-size of 0 so that +/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is +/// well-defined. +template +struct alignas(T) SmallVectorStorage {}; + +/// Forward declaration of SmallVector so that +/// calculateSmallVectorDefaultInlinedElements can reference +/// `sizeof(SmallVector)`. +template +class /* LLVM_GSL_OWNER */ SmallVector; + +/// Helper class for calculating the default number of inline elements for +/// `SmallVector`. +/// +/// This should be migrated to a constexpr function when our minimum +/// compiler support is enough for multi-statement constexpr functions. +template +struct CalculateSmallVectorDefaultInlinedElements { + // Parameter controlling the default number of inlined elements + // for `SmallVector`. + // + // The default number of inlined elements ensures that + // 1. There is at least one inlined element. + // 2. `sizeof(SmallVector) <= kPreferredSmallVectorSizeof` unless + // it contradicts 1. + static constexpr size_t kPreferredSmallVectorSizeof = 64; + + // static_assert that sizeof(T) is not "too big". + // + // Because our policy guarantees at least one inlined element, it is possible + // for an arbitrarily large inlined element to allocate an arbitrarily large + // amount of inline storage. We generally consider it an antipattern for a + // SmallVector to allocate an excessive amount of inline storage, so we want + // to call attention to these cases and make sure that users are making an + // intentional decision if they request a lot of inline storage. + // + // We want this assertion to trigger in pathological cases, but otherwise + // not be too easy to hit. 
To accomplish that, the cutoff is actually somewhat + // larger than kPreferredSmallVectorSizeof (otherwise, + // `SmallVector>` would be one easy way to trip it, and that + // pattern seems useful in practice). + // + // One wrinkle is that this assertion is in theory non-portable, since + // sizeof(T) is in general platform-dependent. However, we don't expect this + // to be much of an issue, because most LLVM development happens on 64-bit + // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for + // 32-bit hosts, dodging the issue. The reverse situation, where development + // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a + // 64-bit host, is expected to be very rare. + static_assert( + sizeof(T) <= 256, + "You are trying to use a default number of inlined elements for " + "`SmallVector` but `sizeof(T)` is really big! Please use an " + "explicit number of inlined elements with `SmallVector` to make " + "sure you really want that much inline storage."); + + // Discount the size of the header itself when calculating the maximum inline + // bytes. + static constexpr size_t PreferredInlineBytes = + kPreferredSmallVectorSizeof - sizeof(SmallVector); + static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T); + static constexpr size_t value = + NumElementsThatFit == 0 ? 1 : NumElementsThatFit; +}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// \note +/// In the absence of a well-motivated choice for the number of inlined +/// elements \p N, it is recommended to use \c SmallVector (that is, +/// omitting the \p N). 
This will choose a default number of inlined elements +/// reasonable for allocation on the stack (for example, trying to keep \c +/// sizeof(SmallVector) around 64 bytes). +/// +/// \warning This does not attempt to be exception safe. +/// +/// \see https://llvm.org/docs/ProgrammersManual.html#llvm-adt-smallvector-h +template < + typename T, + unsigned N = CalculateSmallVectorDefaultInlinedElements::value> +class /* LLVM_GSL_OWNER */ SmallVector : public SmallVectorImpl, + SmallVectorStorage { + public: + SmallVector() : SmallVectorImpl(N) {} + + ~SmallVector() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + } + + explicit SmallVector(size_t Size, const T& Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template < + typename ItTy, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + // note: The enable_if restricts Container to types that have a .begin() and + // .end() that return valid input iterators. 
+ template < + typename Container, + std::enable_if_t< + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .begin())>::iterator_category, + std::input_iterator_tag> && + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .end())>::iterator_category, + std::input_iterator_tag>, + int> = 0> + explicit SmallVector(Container&& c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + SmallVector& operator=(const SmallVector& RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector&& RHS) noexcept( + std::is_nothrow_move_assignable_v>) + : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + // note: The enable_if restricts Container to types that have a .begin() and + // .end() that return valid input iterators. 
+ template < + typename Container, + std::enable_if_t< + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .begin())>::iterator_category, + std::input_iterator_tag> && + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .end())>::iterator_category, + std::input_iterator_tag>, + int> = 0> + SmallVector& operator=(const Container& RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_assignable_v>) + : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + SmallVector& operator=(SmallVector&& RHS) noexcept( + std::is_nothrow_move_assignable_v>) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + SmallVector& operator=(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_constructible_v>) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + // note: The enable_if restricts Container to types that have a .begin() and + // .end() that return valid input iterators. 
+ template < + typename Container, + std::enable_if_t< + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .begin())>::iterator_category, + std::input_iterator_tag> && + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .end())>::iterator_category, + std::input_iterator_tag>, + int> = 0> + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + SmallVector& operator=(Container&& C) { + this->assign(C.begin(), C.end()); + return *this; + } + + SmallVector& operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector& X) { + return X.capacity_in_bytes(); +} + +template +std::ostream& operator<<(std::ostream& out, const SmallVector& list) { + int i = 0; + out << '['; + for (auto e : list) { + if (i++ > 0) + out << ", "; + out << e; + } + out << ']'; + return out; +} + +template +using ValueTypeFromRangeType = std::remove_const_t< + std::remove_reference_t()))>>; + +/// Given a range of type R, iterate the entire range and return a +/// SmallVector with elements of the vector. This is useful, for example, +/// when you want to iterate a range and then sort the results. +template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) +SmallVector, Size> to_vector(R&& Range) { + return {std::begin(Range), std::end(Range)}; +} +template +SmallVector< + ValueTypeFromRangeType, + CalculateSmallVectorDefaultInlinedElements< + ValueTypeFromRangeType>::value> +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) +to_vector(R&& Range) { + return {std::begin(Range), std::end(Range)}; +} + +} // end namespace c10 + +namespace std { + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap( + c10::SmallVectorImpl& LHS, + c10::SmallVectorImpl& RHS) noexcept { + LHS.swap(RHS); +} + +/// Implement std::swap in terms of SmallVector swap. 
+template +inline void swap( + c10::SmallVector& LHS, + c10::SmallVector& RHS) noexcept { + LHS.swap(RHS); +} + +} // end namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/StringUtil.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/StringUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..7c77905085305f5b2884985df2857a219a760c56 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/StringUtil.h @@ -0,0 +1,267 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef C10_UTIL_STRINGUTIL_H_ +#define C10_UTIL_STRINGUTIL_H_ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") +#endif + +namespace c10 { + +namespace detail { + +// Obtains the base name from a full path. 
+C10_API std::string StripBasename(const std::string& full_path); + +C10_API std::string ExcludeFileExtension(const std::string& full_path); + +struct CompileTimeEmptyString { + operator const std::string&() const { + static const std::string empty_string_literal; + return empty_string_literal; + } + operator const char*() const { + return ""; + } +}; + +template +struct CanonicalizeStrTypes { + using type = const T&; +}; + +template +// NOLINTNEXTLINE(*c-arrays*) +struct CanonicalizeStrTypes { + using type = const char*; +}; + +inline std::ostream& _str(std::ostream& ss) { + return ss; +} + +template +struct Streamable : std::false_type {}; + +template +struct Streamable() << T{})> + : std::true_type {}; + +template +inline std::ostream& _str(std::ostream& ss, const T& t) { + if constexpr (std::is_enum_v && !Streamable::value) { + // NOLINTNEXTLINE(modernize-type-traits) + return _str(ss, static_cast::type>(t)); + } else { + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + ss << t; + return ss; + } +} + +template +inline std::ostream& _str(std::ostream& ss, const std::optional& t) { + if (t.has_value()) { + return _str(ss, t.value()); + } + ss << "std::nullopt"; + return ss; +} +// Overloads of _str for wide types; forces narrowing. +C10_API std::ostream& _str(std::ostream& ss, const wchar_t* wCStr); +C10_API std::ostream& _str(std::ostream& ss, const wchar_t& wChar); +C10_API std::ostream& _str(std::ostream& ss, const std::wstring& wString); + +template <> +inline std::ostream& _str( + std::ostream& ss, + const CompileTimeEmptyString& /*unused*/) { + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +template +struct _str_wrapper final { + static std::string call(const Args&... args) { + std::ostringstream ss; + _str(ss, args...); + return ss.str(); + } +}; + +// Specializations for already-a-string types. 
+template <> +struct _str_wrapper final { + // return by reference to avoid the binary size of a string copy + static const std::string& call(const std::string& str) { + return str; + } +}; + +template <> +struct _str_wrapper final { + static const char* call(const char* str) { + return str; + } +}; + +// For c10::str() with an empty argument list (which is common in our assert +// macros), we don't want to pay the binary size for constructing and +// destructing a stringstream or even constructing a string. +template <> +struct _str_wrapper<> final { + static CompileTimeEmptyString call() { + return CompileTimeEmptyString(); + } +}; + +} // namespace detail + +// Convert a list of string-like arguments into a single string. +template +inline auto str(const Args&... args) { + return detail::_str_wrapper< + typename detail::CanonicalizeStrTypes::type...>::call(args...); +} + +template +inline std::string Join(const std::string& delimiter, const Container& v) { + std::stringstream s; + int cnt = static_cast(v.size()) - 1; + for (auto i = v.begin(); i != v.end(); ++i, --cnt) { + s << (*i) << (cnt ? delimiter : ""); + } + return std::move(s).str(); +} + +// Replace all occurrences of "from" substring to "to" string. +// Returns number of replacements +size_t C10_API +ReplaceAll(std::string& s, std::string_view from, std::string_view to); + +/// Represents a location in source code (for debugging). 
+struct C10_API SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +// unix isprint but insensitive to locale +inline bool isPrint(char s) { + return s > 0x1f && s < 0x7f; +} + +inline void printQuotedString(std::ostream& stmt, const std::string_view str) { + stmt << '"'; + for (auto s : str) { + switch (s) { + case '\\': + stmt << "\\\\"; + break; + case '\'': + stmt << "\\'"; + break; + case '\"': + stmt << "\\\""; + break; + case '\a': + stmt << "\\a"; + break; + case '\b': + stmt << "\\b"; + break; + case '\f': + stmt << "\\f"; + break; + case '\n': + stmt << "\\n"; + break; + case '\r': + stmt << "\\r"; + break; + case '\t': + stmt << "\\t"; + break; + case '\v': + stmt << "\\v"; + break; + default: + if (isPrint(s)) { + stmt << s; + } else { + // C++ io has stateful formatting settings. Messing with + // them is probably worse than doing this manually. + // NOLINTNEXTLINE(*c-arrays*) + char buf[4] = "000"; + // NOLINTNEXTLINE(*narrowing-conversions) + buf[2] += s % 8; + s /= 8; + // NOLINTNEXTLINE(*narrowing-conversions) + buf[1] += s % 8; + s /= 8; + // NOLINTNEXTLINE(*narrowing-conversions) + buf[0] += s; + stmt << "\\" << buf; + } + break; + } + } + stmt << '"'; +} + +template +std::optional tryToNumber(const char* symbol) = delete; +template +std::optional tryToNumber(const std::string& symbol) = delete; + +/* + * Convert a string to a 64 bit integer. Trailing whitespaces are not supported. + * Similarly, integer string with trailing characters like "123abc" will be + * rejected. + */ +template <> +C10_API std::optional tryToNumber(const char* symbol); +template <> +C10_API std::optional tryToNumber(const std::string& symbol); + +/* + * Convert a string to a double. Trailing whitespaces are not supported. + * Similarly, integer string with trailing characters like "123abc" will + * be rejected. 
+ */ +template <> +C10_API std::optional tryToNumber(const char* symbol); +template <> +C10_API std::optional tryToNumber(const std::string& symbol); + +C10_API std::vector split( + std::string_view target, + char delimiter); +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() + +#endif // C10_UTIL_STRINGUTIL_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Synchronized.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Synchronized.h new file mode 100644 index 0000000000000000000000000000000000000000..c78564263ebfe172abcb5c097a8c222606e8f019 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Synchronized.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +/** + * A very simple Synchronization class for error-free use of data + * in a multi-threaded context. See folly/docs/Synchronized.md for + * the inspiration of this class. + * + * Full URL: + * https://github.com/facebook/folly/blob/main/folly/docs/Synchronized.md + * + * This class implements a small subset of the generic functionality + * implemented by folly:Synchronized. Specifically, only withLock + * is implemented here since it's the smallest possible API that is + * able to cover a large surface area of functionality offered by + * folly::Synchronized. 
+ */ +template +class Synchronized final { + mutable std::mutex mutex_; + T data_; + + public: + Synchronized() = default; + Synchronized(T const& data) : data_(data) {} + Synchronized(T&& data) : data_(std::move(data)) {} + + // Don't permit copy construction, move, assignment, or + // move assignment, since the underlying std::mutex + // isn't necessarily copyable/moveable. + Synchronized(Synchronized const&) = delete; + Synchronized(Synchronized&&) = delete; + Synchronized operator=(Synchronized const&) = delete; + Synchronized operator=(Synchronized&&) = delete; + ~Synchronized() = default; + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by reference. Use the protected variable in the + * provided callback safely. + */ + template + auto withLock(CB&& cb) { + std::lock_guard guard(this->mutex_); + return std::forward(cb)(this->data_); + } + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by const reference. Use the protected variable in + * the provided callback safely. + */ + template + auto withLock(CB&& cb) const { + std::lock_guard guard(this->mutex_); + return std::forward(cb)(this->data_); + } +}; +} // end namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..03ba6f5b39ba567f65bfa375df66c413a88c171b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h @@ -0,0 +1,90 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include + +namespace c10 { + +enum class C10_API_ENUM DebugInfoKind : uint8_t { + PRODUCER_INFO = 0, + MOBILE_RUNTIME_INFO, + PROFILER_STATE, + INFERENCE_CONTEXT, // for inference usage + PARAM_COMMS_INFO, + + TEST_INFO, // used only in tests + TEST_INFO_2, // used only in tests +}; + +class C10_API DebugInfoBase { + public: + DebugInfoBase() = default; + virtual ~DebugInfoBase() = default; +}; + +// Thread local debug information is propagated across the forward +// (including async fork tasks) and backward passes and is supposed +// to be utilized by the user's code to pass extra information from +// the higher layers (e.g. model id) down to the lower levels +// (e.g. 
to the operator observers used for debugging, logging, +// profiling, etc) +class C10_API ThreadLocalDebugInfo { + public: + static DebugInfoBase* get(DebugInfoKind kind); + + // Get current ThreadLocalDebugInfo + static std::shared_ptr current(); + + // Internal, use DebugInfoGuard/ThreadLocalStateGuard + static void _forceCurrentDebugInfo( + std::shared_ptr info); + + // Push debug info struct of a given kind + static void _push(DebugInfoKind kind, std::shared_ptr info); + // Pop debug info, throws in case the last pushed + // debug info is not of a given kind + static std::shared_ptr _pop(DebugInfoKind kind); + // Peek debug info, throws in case the last pushed debug info is not of the + // given kind + static std::shared_ptr _peek(DebugInfoKind kind); + + private: + std::shared_ptr info_; + DebugInfoKind kind_; + std::shared_ptr parent_info_; + + friend class DebugInfoGuard; +}; + +// DebugInfoGuard is used to set debug information, +// ThreadLocalDebugInfo is semantically immutable, the values are set +// through the scope-based guard object. +// Nested DebugInfoGuard adds/overrides existing values in the scope, +// restoring the original values after exiting the scope. +// Users can access the values through the ThreadLocalDebugInfo::get() call; +class C10_API DebugInfoGuard { + public: + DebugInfoGuard(DebugInfoKind kind, std::shared_ptr info); + + explicit DebugInfoGuard(std::shared_ptr info); + + ~DebugInfoGuard(); + + DebugInfoGuard(const DebugInfoGuard&) = delete; + DebugInfoGuard(DebugInfoGuard&&) = delete; + DebugInfoGuard& operator=(const DebugInfoGuard&) = delete; + DebugInfoGuard& operator=(DebugInfoGuard&&) = delete; + + private: + bool active_ = false; + std::shared_ptr prev_info_ = nullptr; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeIndex.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeIndex.h new file mode 100644 index 0000000000000000000000000000000000000000..fe2282d2973c030f2abb788009acf8ce661f3fd8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeIndex.h @@ -0,0 +1,132 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) +#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1 +#define C10_TYPENAME_CONSTEXPR constexpr +#endif + +namespace c10::util { + +struct type_index final : IdWrapper { + constexpr explicit type_index(uint64_t checksum) : IdWrapper(checksum) {} + + // Allow usage in std::map / std::set + // TODO Disallow this and rather use std::unordered_map/set everywhere + friend constexpr bool operator<(type_index lhs, type_index rhs) noexcept { + return lhs.underlyingId() < rhs.underlyingId(); + } + + friend std::ostream& operator<<(std::ostream& stream, type_index typeId) { + return stream << typeId.underlyingId(); + } +}; + +namespace detail { + +template +inline constexpr c10::c10_string_view fully_qualified_type_name_impl() { +#if defined(_MSC_VER) && !defined(__clang__) + constexpr std::string_view fun_sig = __FUNCSIG__; +#if defined(__NVCC__) + constexpr std::string_view prefix = + "c10::basic_string_view c10::util::detail::fully_qualified_type_name_impl<"; + constexpr std::string_view suffix = ">()"; +#else + constexpr std::string_view prefix = + "class c10::basic_string_view __cdecl c10::util::detail::fully_qualified_type_name_impl<"; + constexpr std::string_view suffix = ">(void)"; +#endif +#elif defined(__clang__) + constexpr std::string_view fun_sig = 
__PRETTY_FUNCTION__; + constexpr std::string_view prefix = + "c10::c10_string_view c10::util::detail::fully_qualified_type_name_impl() [T = "; + constexpr std::string_view suffix = "]"; +#elif defined(__GNUC__) + constexpr std::string_view fun_sig = __PRETTY_FUNCTION__; + constexpr std::string_view prefix = + "constexpr c10::c10_string_view c10::util::detail::fully_qualified_type_name_impl() [with T = "; + constexpr std::string_view suffix = + "; c10::c10_string_view = c10::basic_string_view]"; +#endif +#if !defined(__CUDA_ARCH__) && !defined(__CUDA_ARCH_LIST__) + static_assert(c10::starts_with( + static_cast(fun_sig), + static_cast(prefix))); + static_assert(c10::ends_with( + static_cast(fun_sig), + static_cast(suffix))); +#endif + return fun_sig.substr( + prefix.size(), fun_sig.size() - prefix.size() - suffix.size()); +} + +#if !defined(__CUDA_ARCH__) && !defined(__CUDA_ARCH_LIST__) +template +inline constexpr uint64_t type_index_impl() { +// Idea: __PRETTY_FUNCTION__ (or __FUNCSIG__ on msvc) contains a qualified name +// of this function, including its template parameter, i.e. including the +// type we want an id for. We use this name and run crc64 on it to get a type +// id. +#if defined(_MSC_VER) && !defined(__clang__) + return crc64(__FUNCSIG__, sizeof(__FUNCSIG__)).checksum(); +#elif defined(__clang__) + return crc64(__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__)).checksum(); +#elif defined(__GNUC__) + return crc64(__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__)).checksum(); +#endif +} +#endif + +} // namespace detail + +template +inline constexpr type_index get_type_index() { +#if !defined(__CUDA_ARCH__) && !defined(__CUDA_ARCH_LIST__) + // To enforce that this is really computed at compile time, we pass the + // type index through std::integral_constant. 
+ return type_index{std::integral_constant< + uint64_t, + detail::type_index_impl>()>::value}; +#else + // There's nothing in theory preventing us from running this on device code + // except for nvcc throwing a compiler error if we enable it. + return (abort(), type_index(0)); +#endif +} + +#if !defined(TORCH_PEDANTIC) +// Use precomputed hashsum for std::string +// Needed to workaround ambiguity in class name resolution +// into __PRETTY_FUNCTION__ when abovementioned class is defined in inlined +// namespace. In multi-ABI C++ library, `std::string` is an alias to +// `std::__cxx11::basic_string` which depending on compiler flags can be +// resolved to `basic_string` either in `std` namespace or in +// `std::__cxx11` one (`__cxx11` is an inline namespace) +template <> +inline constexpr type_index get_type_index() { + // hashsum for std::basic_string + return type_index{4193213214807308375ULL}; +} +#endif + +template +inline constexpr std::string_view get_fully_qualified_type_name() noexcept { + return static_cast( + detail::fully_qualified_type_name_impl()); +} +} // namespace c10::util + +C10_DEFINE_HASH_FOR_IDWRAPPER(c10::util::type_index) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeSafeSignMath.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeSafeSignMath.h new file mode 100644 index 0000000000000000000000000000000000000000..f511333fc7d9ca2b9e29fd7512e4cd0cb8776b25 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeSafeSignMath.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeTraits.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeTraits.h new file mode 100644 index 0000000000000000000000000000000000000000..9d49c82cbd8948cdd7bb2b9fd758f7875e5dfdb7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeTraits.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Unicode.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Unicode.h new file mode 100644 index 0000000000000000000000000000000000000000..68d2c2ce7feac15b4fab16f4124e41633433a213 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Unicode.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#if defined(_WIN32) +#include +#include +#include +#endif + +namespace c10 { +#if defined(_WIN32) +C10_API std::wstring u8u16(const std::string& str); +C10_API std::string u16u8(const std::wstring& wstr); +#endif +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/WaitCounter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/WaitCounter.h new file mode 100644 index 0000000000000000000000000000000000000000..ccae4f78e54b38345cab1f41b97b70293d0a35a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/WaitCounter.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace c10::monitor { +namespace detail { +class WaitCounterImpl; + +class WaitCounterBackendIf { + public: + virtual ~WaitCounterBackendIf() = default; + + virtual intptr_t start( + std::chrono::steady_clock::time_point now) noexcept = 0; + virtual void stop( + std::chrono::steady_clock::time_point now, + intptr_t ctx) noexcept = 0; +}; + +class 
WaitCounterBackendFactoryIf { + public: + virtual ~WaitCounterBackendFactoryIf() = default; + + // May return nullptr. + // In this case the counter will be ignored by the given backend. + virtual std::unique_ptr create( + std::string_view key) noexcept = 0; +}; + +C10_API void registerWaitCounterBackend( + std::unique_ptr /*factory*/); + +C10_API std::vector> +getRegisteredWaitCounterBackends(); +} // namespace detail + +// A handle to a wait counter. +class C10_API WaitCounterHandle { + public: + explicit WaitCounterHandle(std::string_view key); + + class WaitGuard { + public: + WaitGuard(WaitGuard&& other) noexcept + : handle_{std::exchange(other.handle_, {})}, + ctxs_{std::move(other.ctxs_)} {} + WaitGuard(const WaitGuard&) = delete; + WaitGuard& operator=(const WaitGuard&) = delete; + WaitGuard& operator=(WaitGuard&&) = delete; + + ~WaitGuard() { + stop(); + } + + void stop() { + if (auto handle = std::exchange(handle_, nullptr)) { + handle->stop(ctxs_); + } + } + + private: + WaitGuard(WaitCounterHandle& handle, SmallVector&& ctxs) + : handle_{&handle}, ctxs_{std::move(ctxs)} {} + + friend class WaitCounterHandle; + + WaitCounterHandle* handle_; + SmallVector ctxs_; + }; + + // Starts a waiter + WaitGuard start(); + + private: + // Stops the waiter. Each start() call should be matched by exactly one stop() + // call. + void stop(const SmallVector& ctxs); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + detail::WaitCounterImpl& impl_; +}; +} // namespace c10::monitor + +#define STATIC_WAIT_COUNTER(_key) \ + []() -> ::c10::monitor::WaitCounterHandle& { \ + static ::c10::monitor::WaitCounterHandle handle(#_key); \ + return handle; \ + }() + +#define STATIC_SCOPED_WAIT_COUNTER(_name) \ + auto C10_ANONYMOUS_VARIABLE(SCOPE_GUARD) = STATIC_WAIT_COUNTER(_name).start(); + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bit_cast.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bit_cast.h new file mode 100644 index 0000000000000000000000000000000000000000..948d03d509175254b3f54c60a4b501dd62f870b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bit_cast.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bits.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bits.h new file mode 100644 index 0000000000000000000000000000000000000000..fe5b67c454490e06d88752b708d9543cda0ae6d1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bits.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_math.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_math.h new file mode 100644 index 0000000000000000000000000000000000000000..33da59051855d7e726fe83d19b9c39e8ab355317 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_math.h @@ -0,0 +1,411 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H) +#error \ + "c10/util/complex_math.h is not meant to be individually included. Include c10/util/complex.h instead." +#endif + +namespace c10_complex_math { + +// Exponential functions + +template +C10_HOST_DEVICE inline c10::complex exp(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::exp(static_cast>(x))); +#else + return static_cast>( + std::exp(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::log(static_cast>(x))); +#else + return static_cast>( + std::log(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log10(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::log10(static_cast>(x))); +#else + return static_cast>( + std::log10(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log2(const c10::complex& x) { + const c10::complex log2 = c10::complex(::log(2.0), 0.0); + return c10_complex_math::log(x) / log2; +} + +// Power functions +// +#if defined(_LIBCPP_VERSION) || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)) +namespace _detail { +C10_API c10::complex sqrt(const c10::complex& in); +C10_API c10::complex 
sqrt(const c10::complex& in); +C10_API c10::complex acos(const c10::complex& in); +C10_API c10::complex acos(const c10::complex& in); +} // namespace _detail +#endif + +template +C10_HOST_DEVICE inline c10::complex sqrt(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sqrt(static_cast>(x))); +#elif !( \ + defined(_LIBCPP_VERSION) || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))) + return static_cast>( + std::sqrt(static_cast>(x))); +#else + return _detail::sqrt(x); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::pow( + static_cast>(x), static_cast>(y))); +#else + return static_cast>(std::pow( + static_cast>(x), static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const T& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(static_cast>(x), y)); +#else + return static_cast>( + std::pow(static_cast>(x), y)); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const T& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(x, static_cast>(y))); +#else + return static_cast>( + std::pow(x, static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::pow( + static_cast>(x), static_cast>(y))); +#else + return static_cast>(std::pow( + static_cast>(x), static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const U& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(static_cast>(x), y)); +#else + return static_cast>( + std::pow(static_cast>(x), y)); 
+#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const T& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(x, static_cast>(y))); +#else + return static_cast>( + std::pow(x, static_cast>(y))); +#endif +} + +// Trigonometric functions + +template +C10_HOST_DEVICE inline c10::complex sin(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sin(static_cast>(x))); +#else + return static_cast>( + std::sin(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex cos(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::cos(static_cast>(x))); +#else + return static_cast>( + std::cos(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex tan(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::tan(static_cast>(x))); +#else + return static_cast>( + std::tan(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex asin(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::asin(static_cast>(x))); +#else + return static_cast>( + std::asin(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex acos(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::acos(static_cast>(x))); +#elif !defined(_LIBCPP_VERSION) + return static_cast>( + std::acos(static_cast>(x))); +#else + return _detail::acos(x); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex atan(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::atan(static_cast>(x))); +#else + return static_cast>( + std::atan(static_cast>(x))); +#endif +} + +// Hyperbolic functions + +template +C10_HOST_DEVICE inline c10::complex sinh(const c10::complex& 
x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sinh(static_cast>(x))); +#else + return static_cast>( + std::sinh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex cosh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::cosh(static_cast>(x))); +#else + return static_cast>( + std::cosh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex tanh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::tanh(static_cast>(x))); +#else + return static_cast>( + std::tanh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex asinh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::asinh(static_cast>(x))); +#else + return static_cast>( + std::asinh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex acosh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::acosh(static_cast>(x))); +#else + return static_cast>( + std::acosh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex atanh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::atanh(static_cast>(x))); +#else + return static_cast>( + std::atanh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log1p(const c10::complex& z) { +#if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \ + defined(__HIPCC__) + // For Mac, the new implementation yielded a high relative error. Falling back + // to the old version for now. 
+ // See https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 + // For CUDA we also use this one, as thrust::log(thrust::complex) takes + // *forever* to compile + + // log1p(z) = log(1 + z) + // Let's define 1 + z = r * e ^ (i * a), then we have + // log(r * e ^ (i * a)) = log(r) + i * a + // With z = x + iy, the term r can be written as + // r = ((1 + x) ^ 2 + y ^ 2) ^ 0.5 + // = (1 + x ^ 2 + 2 * x + y ^ 2) ^ 0.5 + // So, log(r) is + // log(r) = 0.5 * log(1 + x ^ 2 + 2 * x + y ^ 2) + // = 0.5 * log1p(x * (x + 2) + y ^ 2) + // we need to use the expression only on certain condition to avoid overflow + // and underflow from `(x * (x + 2) + y ^ 2)` + T x = z.real(); + T y = z.imag(); + T zabs = std::abs(z); + T theta = std::atan2(y, x + T(1)); + if (zabs < 0.5) { + T r = x * (T(2) + x) + y * y; + if (r == 0) { // handle underflow + return {x, theta}; + } + return {T(0.5) * std::log1p(r), theta}; + } else { + T z0 = std::hypot(x + 1, y); + return {std::log(z0), theta}; + } +#else + // CPU path + // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 + c10::complex u = z + T(1); + if (u == T(1)) { + return z; + } else { + auto log_u = log(u); + if (u - T(1) == z) { + return log_u; + } + return log_u * (z / (u - T(1))); + } +#endif +} + +template +C10_HOST_DEVICE inline c10::complex expm1(const c10::complex& z) { + // expm1(z) = exp(z) - 1 + // Define z = x + i * y + // f = e ^ (x + i * y) - 1 + // = e ^ x * e ^ (i * y) - 1 + // = (e ^ x * cos(y) - 1) + i * (e ^ x * sin(y)) + // = (e ^ x - 1) * cos(y) - (1 - cos(y)) + i * e ^ x * sin(y) + // = expm1(x) * cos(y) - 2 * sin(y / 2) ^ 2 + i * e ^ x * sin(y) + T x = z.real(); + T y = z.imag(); + T a = std::sin(y / 2); + T er = std::expm1(x) * std::cos(y) - T(2) * a * a; + T ei = std::exp(x) * std::sin(y); + return {er, ei}; +} + +} // namespace c10_complex_math + +using c10_complex_math::acos; +using c10_complex_math::acosh; +using c10_complex_math::asin; +using c10_complex_math::asinh; 
+using c10_complex_math::atan; +using c10_complex_math::atanh; +using c10_complex_math::cos; +using c10_complex_math::cosh; +using c10_complex_math::exp; +using c10_complex_math::expm1; +using c10_complex_math::log; +using c10_complex_math::log10; +using c10_complex_math::log1p; +using c10_complex_math::log2; +using c10_complex_math::pow; +using c10_complex_math::sin; +using c10_complex_math::sinh; +using c10_complex_math::sqrt; +using c10_complex_math::tan; +using c10_complex_math::tanh; + +namespace std { + +using c10_complex_math::acos; +using c10_complex_math::acosh; +using c10_complex_math::asin; +using c10_complex_math::asinh; +using c10_complex_math::atan; +using c10_complex_math::atanh; +using c10_complex_math::cos; +using c10_complex_math::cosh; +using c10_complex_math::exp; +using c10_complex_math::expm1; +using c10_complex_math::log; +using c10_complex_math::log10; +using c10_complex_math::log1p; +using c10_complex_math::log2; +using c10_complex_math::pow; +using c10_complex_math::sin; +using c10_complex_math::sinh; +using c10_complex_math::sqrt; +using c10_complex_math::tan; +using c10_complex_math::tanh; + +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..44152b72cb35b7df727ece02b089350be04a9f7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_utils.h @@ -0,0 +1,51 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H) +#error \ + "c10/util/complex_utils.h is not meant to be individually included. Include c10/util/complex.h instead." +#endif + +#include + +namespace c10 { + +template +struct is_complex : public std::false_type {}; + +template +struct is_complex> : public std::true_type {}; + +template +struct is_complex> : public std::true_type {}; + +// Extract double from std::complex; is identity otherwise +// TODO: Write in more idiomatic C++17 +template +struct scalar_value_type { + using type = T; +}; +template +struct scalar_value_type> { + using type = T; +}; +template +struct scalar_value_type> { + using type = T; +}; + +} // namespace c10 + +namespace std { + +template +class numeric_limits> : public numeric_limits {}; + +template +bool isnan(const c10::complex& v) { + return std::isnan(v.real()) || std::isnan(v.imag()); +} + +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/copysign.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/copysign.h new file mode 100644 index 0000000000000000000000000000000000000000..6bc7c7956f3986ca3c3f10252bd6eb06a7fd1104 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/copysign.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10 { + +// Note: Explicit implementation of copysign for Half and BFloat16 +// is needed to workaround g++-7/8 crash on aarch64, but also makes +// copysign faster for the half-precision types +template +inline auto copysign(const T& a, const U& b) { + return std::copysign(a, b); +} + +// Implement copysign for half precision floats using bit ops +// Sign is the most significant bit for both half and bfloat16 types +inline c10::Half copysign(c10::Half a, c10::Half b) { + return c10::Half((a.x & 0x7fff) | (b.x & 0x8000), c10::Half::from_bits()); +} + +inline c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) { + return c10::BFloat16( + (a.x & 0x7fff) | (b.x & 0x8000), c10::BFloat16::from_bits()); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/env.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/env.h new file mode 100644 index 0000000000000000000000000000000000000000..538a6e271f9d56564bcd8ff73071974991513009 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/env.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10::utils { + +// Set an environment variable. +C10_API void set_env( + const char* name, + const char* value, + bool overwrite = true); + +// Checks an environment variable is set. +C10_API bool has_env(const char* name) noexcept; + +// Reads an environment variable and returns +// - std::optional, if set equal to "1" +// - std::optional, if set equal to "0" +// - nullopt, otherwise +// +// NB: +// Issues a warning if the value of the environment variable is not 0 or 1. +C10_API std::optional check_env(const char* name); + +// Reads the value of an environment variable if it is set. +// However, check_env should be used if the value is assumed to be a flag. +C10_API std::optional get_env(const char* name) noexcept; + +} // namespace c10::utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/floating_point_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/floating_point_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b83f9c931e4cf13b648336b4331a6f33b0a6fda2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/floating_point_utils.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/int128.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/int128.h new file mode 100644 index 0000000000000000000000000000000000000000..73687a69d1bbc0bfe4a0d449cbf43f10437e29bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/int128.h @@ -0,0 +1,403 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// This file is based on the uint128 implementation of protobuf at +// https://github.com/protocolbuffers/protobuf/blob/1e88936fce10cf773cb72b44c6a7f48b38c7578b/src/google/protobuf/stubs/int128.h +// +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include +#include + +namespace c10 { + +struct uint128_pod; + +// TODO(xiaofeng): Define GOOGLE_PROTOBUF_HAS_CONSTEXPR when constexpr is +// available. +#ifdef GOOGLE_PROTOBUF_HAS_CONSTEXPR +#define UINT128_CONSTEXPR constexpr +#else +#define UINT128_CONSTEXPR +#endif + +class uint128; +inline uint128& operator<<=(uint128& self, int amount); + +// An unsigned 128-bit integer type. Thread-compatible. +class C10_API uint128 { + public: + UINT128_CONSTEXPR uint128(); // Sets to 0, but don't trust on this behavior. 
+ UINT128_CONSTEXPR uint128(uint64_t top, uint64_t bottom); +#ifndef SWIG + UINT128_CONSTEXPR uint128(int bottom); + UINT128_CONSTEXPR uint128(uint32_t bottom); // Top 96 bits = 0 +#endif + UINT128_CONSTEXPR uint128(uint64_t bottom); // hi_ = 0 + UINT128_CONSTEXPR uint128(const uint128_pod& val); + + // Trivial copy constructor, assignment operator and destructor. + + void Initialize(uint64_t top, uint64_t bottom); + + // Arithmetic operators. + uint128& operator+=(const uint128& b); + uint128& operator-=(const uint128& b); + uint128& operator*=(const uint128& b); + // Long division/modulo for uint128. + uint128& operator/=(const uint128& b); + uint128& operator%=(const uint128& b); + uint128 operator++(int); + uint128 operator--(int); + // Make msvc happy with using operator<<= from DivModImpl + // which is a static function, and linker complained about missing + // static version of this overload + friend uint128& operator<<=(uint128& /*self*/, int /*amount*/); + uint128& operator>>=(int /*amount*/); + uint128& operator&=(const uint128& b); + uint128& operator|=(const uint128& b); + uint128& operator^=(const uint128& b); + uint128& operator++(); + uint128& operator--(); + + friend uint64_t Uint128Low64(const uint128& v); + friend uint64_t Uint128High64(const uint128& v); + + // We add "std::" to avoid including all of port.h. + C10_API friend std::ostream& operator<<(std::ostream& o, const uint128& b); + + private: + static void DivModImpl( + uint128 dividend, + uint128 divisor, + uint128* quotient_ret, + uint128* remainder_ret); + + // Little-endian memory order optimizations can benefit from + // having lo_ first, hi_ last. + // See util/endian/endian.h and Load128/Store128 for storing a uint128. + uint64_t lo_; + uint64_t hi_; + + // Not implemented, just declared for catching automatic type conversions. 
+ uint128(uint8_t); + uint128(uint16_t); + uint128(float v); + uint128(double v); +}; + +// This is a POD form of uint128 which can be used for static variables which +// need to be operated on as uint128. +struct uint128_pod { + // Note: The ordering of fields is different than 'class uint128' but the + // same as its 2-arg constructor. This enables more obvious initialization + // of static instances, which is the primary reason for this struct in the + // first place. This does not seem to defeat any optimizations wrt + // operations involving this struct. + uint64_t hi; + uint64_t lo; +}; + +C10_API extern const uint128_pod kuint128max; + +// allow uint128 to be logged +C10_API extern std::ostream& operator<<(std::ostream& o, const uint128& b); + +// Methods to access low and high pieces of 128-bit value. +// Defined externally from uint128 to facilitate conversion +// to native 128-bit types when compilers support them. +inline uint64_t Uint128Low64(const uint128& v) { + return v.lo_; +} +inline uint64_t Uint128High64(const uint128& v) { + return v.hi_; +} + +// TODO: perhaps it would be nice to have int128, a signed 128-bit type? 
+ +// -------------------------------------------------------------------------- +// Implementation details follow +// -------------------------------------------------------------------------- +inline bool operator==(const uint128& lhs, const uint128& rhs) { + return ( + Uint128Low64(lhs) == Uint128Low64(rhs) && + Uint128High64(lhs) == Uint128High64(rhs)); +} +inline bool operator!=(const uint128& lhs, const uint128& rhs) { + return !(lhs == rhs); +} + +inline UINT128_CONSTEXPR uint128::uint128() : lo_(0), hi_(0) {} +inline UINT128_CONSTEXPR uint128::uint128(uint64_t top, uint64_t bottom) + : lo_(bottom), hi_(top) {} +inline UINT128_CONSTEXPR uint128::uint128(const uint128_pod& v) + : lo_(v.lo), hi_(v.hi) {} +inline UINT128_CONSTEXPR uint128::uint128(uint64_t bottom) + : lo_(bottom), hi_(0) {} +#ifndef SWIG +inline UINT128_CONSTEXPR uint128::uint128(uint32_t bottom) + : lo_(bottom), hi_(0) {} +inline UINT128_CONSTEXPR uint128::uint128(int bottom) + : lo_(bottom), hi_(static_cast((bottom < 0) ? -1 : 0)) {} +#endif + +#undef UINT128_CONSTEXPR + +inline void uint128::Initialize(uint64_t top, uint64_t bottom) { + hi_ = top; + lo_ = bottom; +} + +// Comparison operators. + +#define CMP128(op) \ + inline bool operator op(const uint128& lhs, const uint128& rhs) { \ + return (Uint128High64(lhs) == Uint128High64(rhs)) \ + ? (Uint128Low64(lhs) op Uint128Low64(rhs)) \ + : (Uint128High64(lhs) op Uint128High64(rhs)); \ + } + +CMP128(<) +CMP128(>) +CMP128(>=) +CMP128(<=) + +#undef CMP128 + +// Unary operators + +inline uint128 operator-(const uint128& val) { + const uint64_t hi_flip = ~Uint128High64(val); + const uint64_t lo_flip = ~Uint128Low64(val); + const uint64_t lo_add = lo_flip + 1; + if (lo_add < lo_flip) { + return uint128(hi_flip + 1, lo_add); + } + return uint128(hi_flip, lo_add); +} + +inline bool operator!(const uint128& val) { + return !Uint128High64(val) && !Uint128Low64(val); +} + +// Logical operators. 
+ +inline uint128 operator~(const uint128& val) { + return uint128(~Uint128High64(val), ~Uint128Low64(val)); +} + +#define LOGIC128(op) \ + inline uint128 operator op(const uint128& lhs, const uint128& rhs) { \ + return uint128( \ + Uint128High64(lhs) op Uint128High64(rhs), \ + Uint128Low64(lhs) op Uint128Low64(rhs)); \ + } + +LOGIC128(|) +LOGIC128(&) +LOGIC128(^) + +#undef LOGIC128 + +#define LOGICASSIGN128(op) \ + inline uint128& uint128::operator op(const uint128 & other) { \ + hi_ op other.hi_; \ + lo_ op other.lo_; \ + return *this; \ + } + +LOGICASSIGN128(|=) +LOGICASSIGN128(&=) +LOGICASSIGN128(^=) + +#undef LOGICASSIGN128 + +// Shift operators. + +inline uint128 operator<<(const uint128& val, int amount) { + // uint64_t shifts of >= 64 are undefined, so we will need some + // special-casing. + if (amount < 64) { + if (amount == 0) { + return val; + } + uint64_t new_hi = + (Uint128High64(val) << amount) | (Uint128Low64(val) >> (64 - amount)); + uint64_t new_lo = Uint128Low64(val) << amount; + return uint128(new_hi, new_lo); + } else if (amount < 128) { + return uint128(Uint128Low64(val) << (amount - 64), 0); + } else { + return uint128(0, 0); + } +} + +inline uint128 operator>>(const uint128& val, int amount) { + // uint64_t shifts of >= 64 are undefined, so we will need some + // special-casing. + if (amount < 64) { + if (amount == 0) { + return val; + } + uint64_t new_hi = Uint128High64(val) >> amount; + uint64_t new_lo = + (Uint128Low64(val) >> amount) | (Uint128High64(val) << (64 - amount)); + return uint128(new_hi, new_lo); + } else if (amount < 128) { + return uint128(0, Uint128High64(val) >> (amount - 64)); + } else { + return uint128(0, 0); + } +} + +inline uint128& operator<<=(uint128& self, int amount) { + // uint64_t shifts of >= 64 are undefined, so we will need some + // special-casing. 
+ if (amount < 64) { + if (amount != 0) { + self.hi_ = (self.hi_ << amount) | (self.lo_ >> (64 - amount)); + self.lo_ = self.lo_ << amount; + } + } else if (amount < 128) { + self.hi_ = self.lo_ << (amount - 64); + self.lo_ = 0; + } else { + self.hi_ = 0; + self.lo_ = 0; + } + return self; +} + +inline uint128& uint128::operator>>=(int amount) { + // uint64_t shifts of >= 64 are undefined, so we will need some + // special-casing. + if (amount < 64) { + if (amount != 0) { + lo_ = (lo_ >> amount) | (hi_ << (64 - amount)); + hi_ = hi_ >> amount; + } + } else if (amount < 128) { + lo_ = hi_ >> (amount - 64); + hi_ = 0; + } else { + lo_ = 0; + hi_ = 0; + } + return *this; +} + +inline uint128 operator+(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) += rhs; +} + +inline uint128 operator-(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) -= rhs; +} + +inline uint128 operator*(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) *= rhs; +} + +inline uint128 operator/(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) /= rhs; +} + +inline uint128 operator%(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) %= rhs; +} + +inline uint128& uint128::operator+=(const uint128& b) { + hi_ += b.hi_; + uint64_t lolo = lo_ + b.lo_; + if (lolo < lo_) + ++hi_; + lo_ = lolo; + return *this; +} + +inline uint128& uint128::operator-=(const uint128& b) { + hi_ -= b.hi_; + if (b.lo_ > lo_) + --hi_; + lo_ -= b.lo_; + return *this; +} + +inline uint128& uint128::operator*=(const uint128& b) { + uint64_t a96 = hi_ >> 32; + uint64_t a64 = hi_ & 0xffffffffu; + uint64_t a32 = lo_ >> 32; + uint64_t a00 = lo_ & 0xffffffffu; + uint64_t b96 = b.hi_ >> 32; + uint64_t b64 = b.hi_ & 0xffffffffu; + uint64_t b32 = b.lo_ >> 32; + uint64_t b00 = b.lo_ & 0xffffffffu; + // multiply [a96 .. a00] x [b96 .. 
b00] + // terms higher than c96 disappear off the high side + // terms c96 and c64 are safe to ignore carry bit + uint64_t c96 = a96 * b00 + a64 * b32 + a32 * b64 + a00 * b96; + uint64_t c64 = a64 * b00 + a32 * b32 + a00 * b64; + this->hi_ = (c96 << 32) + c64; + this->lo_ = 0; + // add terms after this one at a time to capture carry + *this += uint128(a32 * b00) << 32; + *this += uint128(a00 * b32) << 32; + *this += a00 * b00; + return *this; +} + +inline uint128 uint128::operator++(int) { + uint128 tmp(*this); + *this += 1; + return tmp; +} + +inline uint128 uint128::operator--(int) { + uint128 tmp(*this); + *this -= 1; + return tmp; +} + +inline uint128& uint128::operator++() { + *this += 1; + return *this; +} + +inline uint128& uint128::operator--() { + *this -= 1; + return *this; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/intrusive_ptr.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/intrusive_ptr.h new file mode 100644 index 0000000000000000000000000000000000000000..148a9bf4a20002de4396c9e0a26ea695b8ed1c98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/intrusive_ptr.h @@ -0,0 +1,1278 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace pybind11 { +template +class class_; +} + +namespace torch::utils { +class PyObjectPreservation; +} + +namespace c10 { +class intrusive_ptr_target; +namespace raw { +namespace weak_intrusive_ptr { +inline void incref(intrusive_ptr_target* self); +} +namespace intrusive_ptr { +inline void incref(intrusive_ptr_target* self); +} + +// constructor tag used by intrusive_ptr 
constructors +struct DontIncreaseRefcount {}; +} // namespace raw + +namespace detail { +constexpr uint64_t kImpracticallyHugeReferenceCount = 0x0FFFFFFF; +constexpr uint64_t kImpracticallyHugeWeakReferenceCount = + (kImpracticallyHugeReferenceCount << 32); +constexpr uint64_t kReferenceCountOne = 1; +constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32); +constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne); +// Indicates whether the object has a PyObject wrapper. +constexpr uint64_t kHasPyObject = (uint64_t(1) << 63); + +template +struct intrusive_target_default_null_type final { + static constexpr TTarget* singleton() noexcept { + return nullptr; + } +}; + +template +TTarget* assign_ptr_(TTarget* rhs) { + if (FromNullType::singleton() == rhs) { + return ToNullType::singleton(); + } else { + return rhs; + } +} + +inline uint32_t refcount(uint64_t combined_refcount) { + return static_cast(combined_refcount); +} + +inline uint32_t weakcount(uint64_t combined_refcount) { + return static_cast((combined_refcount & ~kHasPyObject) >> 32); +} + +inline bool has_pyobject(uint64_t combined_refcount) { + return (combined_refcount & kHasPyObject) != 0; +} + +inline bool is_uniquely_owned(uint64_t combined_refcount) { + return (combined_refcount & ~detail::kHasPyObject) == detail::kUniqueRef; +} + +// The only requirement for refcount increment is that it happens-before +// decrement, so no additional memory ordering is needed. 
+inline uint64_t atomic_combined_refcount_increment( + std::atomic& combined_refcount, + uint64_t inc) { + return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc; +} + +inline uint32_t atomic_weakcount_increment( + std::atomic& combined_refcount) { + return detail::weakcount(atomic_combined_refcount_increment( + combined_refcount, kWeakReferenceCountOne)); +} + +// The requirement is that all modifications to the managed object happen-before +// invocation of the managed object destructor, and that allocation of the +// managed object storage happens-before deallocation of the storage. +// +// To get this ordering, all non-final decrements must synchronize-with the +// final decrement. So all non-final decrements have to store-release while the +// final decrement has to load-acquire, either directly or with the help of +// fences. But it's easiest just to have all decrements be acq-rel. And it turns +// out, on modern architectures and chips, it's also fastest. +inline uint64_t atomic_combined_refcount_decrement( + std::atomic& combined_refcount, + uint64_t dec) { + return combined_refcount.fetch_sub(dec, std::memory_order_acq_rel) - dec; +} + +inline uint32_t atomic_weakcount_decrement( + std::atomic& combined_refcount) { + return detail::weakcount(atomic_combined_refcount_decrement( + combined_refcount, kWeakReferenceCountOne)); +} + +template +struct TargetTraits { + static constexpr bool can_have_pyobject = false; +}; + +} // namespace detail + +/** + * intrusive_ptr is an alternative to shared_ptr that has better + * performance because it does the refcounting intrusively + * (i.e. in a member of the object itself). + * Your class T needs to inherit from intrusive_ptr_target to allow it to be + * used in an intrusive_ptr. Your class's constructor should not allow + *`this` to escape to other threads or create an intrusive_ptr from `this`. 
+ */ + +// Note [Stack allocated intrusive_ptr_target safety] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// A well known problem with std::enable_shared_from_this is that it +// allows you to create a std::shared_ptr from a stack allocated object, +// which is totally bogus because the object will die once you return +// from the stack. In intrusive_ptr, we can detect that this has occurred, +// because we set the refcount/weakcount of objects which inherit from +// intrusive_ptr_target to zero, *unless* we can prove that the object +// was dynamically allocated (e.g., via make_intrusive). +// +// Thus, whenever you transmute a T* into a intrusive_ptr, we check +// and make sure that the refcount isn't zero (or, a more subtle +// test for weak_intrusive_ptr, for which the refcount may validly +// be zero, but the weak refcount better not be zero), because that +// tells us if the object was allocated by us. If it wasn't, no +// intrusive_ptr for you! + +// NOLINTNEXTLINE(cppcoreguidelines-virtual-class-destructor) +class C10_API intrusive_ptr_target { + // Note [Weak references for intrusive refcounting] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Here's the scheme: + // + // - refcount == number of strong references to the object + // weakcount == number of weak references to the object, + // plus one more if refcount > 0 + // An invariant: refcount > 0 => weakcount > 0 + // + // - c10::StorageImpl stays live as long as there are any strong + // or weak pointers to it (weakcount > 0, since strong + // references count as a +1 to weakcount) + // + // - finalizers are called and data_ptr is deallocated when refcount == 0 + // + // - Once refcount == 0, it can never again be > 0 (the transition + // from > 0 to == 0 is monotonic) + // + // - When you access c10::StorageImpl via a weak pointer, you must + // atomically increment the use count, if it is greater than 0. + // If it is not, you must report that the storage is dead. 
+ // + //.We use a single combined count for refcount and weakcount so that + // we can atomically operate on both at the same time for performance + // and defined behaviors. + // + // Note [PyObject preservation for Tensor and Storages] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // intrusive_ptr has special support for preserving PyObject wrappers + // for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of + // the combined_refcount_ is used to indicate whether the object has a + // PyObject wrapper. + // + // - The PyObject, if it exists, holds a strong reference to the + // intrusive_ptr_target. + // + // - When the refcount goes from 1 to 2, we incref the PyObject. + // + // - When the refcount goes from 2 to 1, we decref the PyObject. + // + // In other words, the intrusive_ptr keeps the PyObject alive as long as there + // are other C++ references to the intrusive_ptr_target. + + mutable std::atomic combined_refcount_; + static_assert(sizeof(std::atomic) == 8); + static_assert(alignof(std::atomic) == 8); + static_assert(std::atomic::is_always_lock_free); + + template + friend class intrusive_ptr; + friend inline void raw::intrusive_ptr::incref(intrusive_ptr_target* self); + + template + friend class weak_intrusive_ptr; + friend inline void raw::weak_intrusive_ptr::incref( + intrusive_ptr_target* self); + + template + friend struct ExclusivelyOwnedTensorTraits; + + friend class torch::utils::PyObjectPreservation; + + protected: + // protected destructor. We never want to destruct intrusive_ptr_target* + // directly. + virtual ~intrusive_ptr_target() { +// Disable -Wterminate and -Wexceptions so we're allowed to use assertions +// (i.e. throw exceptions) in a destructor. +// We also have to disable -Wunknown-warning-option and -Wpragmas, because +// some other compilers don't know about -Wterminate or -Wexceptions and +// will show a warning about unknown warning options otherwise. 
+#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning( \ + disable : 4297) // function assumed not to throw an exception but does +#else +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Wunknown-warning-option" +#pragma GCC diagnostic ignored "-Wterminate" +#pragma GCC diagnostic ignored "-Wexceptions" +#endif + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // Second condition is there to accommodate + // unsafe_adapt_non_heap_allocated: since we are doing our own + // deallocation in that case, it is correct for each + // expected_decref to have happened (some user code tried to + // decref and thus free the object, but it didn't happen right + // away) or not (no user code tried to free the object, and + // now it's getting destroyed through whatever mechanism the + // caller of unsafe_adapt_non_heap_allocated wanted to + // use). We choose our reference count such that the count + // will not dip below kImpracticallyHugeReferenceCount regardless. + refcount() == 0 || + refcount() >= detail::kImpracticallyHugeReferenceCount, + "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it; refcount was ", + refcount()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // See ~intrusive_ptr for optimization that will frequently result in 1 + // at destruction time. 
+ weakcount() == 1 || weakcount() == 0 || + weakcount() == detail::kImpracticallyHugeReferenceCount - 1 || + weakcount() == detail::kImpracticallyHugeReferenceCount, + "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it"); +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#else +#pragma GCC diagnostic pop +#endif + } + + constexpr intrusive_ptr_target() noexcept : combined_refcount_(0) {} + + // intrusive_ptr_target supports copy and move: but refcount and weakcount + // don't participate (since they are intrinsic properties of the memory + // location) + intrusive_ptr_target(intrusive_ptr_target&& /*other*/) noexcept + : intrusive_ptr_target() {} + + intrusive_ptr_target& operator=(intrusive_ptr_target&& /*other*/) noexcept { + return *this; + } + + intrusive_ptr_target(const intrusive_ptr_target& /*other*/) noexcept + : intrusive_ptr_target() {} + + intrusive_ptr_target& operator=( + const intrusive_ptr_target& /*other*/) noexcept { + return *this; + } + + private: + /** + * This is called when refcount reaches zero. + * You can override this to release expensive resources. + * There might still be weak references, so your object might not get + * destructed yet, but you can assume the object isn't used anymore, + * i.e. no more calls to methods or accesses to members (we just can't + * destruct it yet because we need the weakcount accessible). + * + * If there are no weak references (i.e. your class is about to be + * destructed), this function WILL NOT be called. + */ + virtual void release_resources() {} + + /** + * These two methods are called when the refcount transitions between one + * and two and the object has a PyObject wrapper. 
+ */ + virtual void incref_pyobject() const noexcept {} + virtual void decref_pyobject() const noexcept {} + virtual bool try_incref_pyobject() const noexcept { + return false; + } + + uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const { + return detail::refcount(combined_refcount_.load(order)); + } + + uint32_t weakcount( + std::memory_order order = std::memory_order_relaxed) const { + return detail::weakcount(combined_refcount_.load(order)); + } +}; + +namespace detail { + +#ifndef C10_MOBILE +template <> +struct TargetTraits { + // A generic intrusive_ptr may actually be a TensorImpl + // or StorageImpl, so we have to allow for PyObject support. + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + +template +class weak_intrusive_ptr; + +template < + class TTarget, + class NullType = detail::intrusive_target_default_null_type> +class intrusive_ptr final { + private: +// the following static assert would be nice to have but it requires +// the target class T to be fully defined when intrusive_ptr is instantiated +// this is a problem for classes that contain pointers to themselves +// static_assert( +// std::is_base_of_v, +// "intrusive_ptr can only be used for classes that inherit from +// intrusive_ptr_target."); +#ifndef _WIN32 + // This static_assert triggers on MSVC + // error C2131: expression did not evaluate to a constant + static_assert( + // NOLINTNEXTLINE(misc-redundant-expression) + NullType::singleton() == NullType::singleton(), + "NullType must have a constexpr singleton() method"); +#endif + static_assert( + std::is_base_of_v< + TTarget, + std::remove_pointer_t>, + "NullType::singleton() must return a element_type* pointer"); + + TTarget* target_; + + template + friend struct ExclusivelyOwnedTensorTraits; + template + friend class intrusive_ptr; + friend class weak_intrusive_ptr; + + // Make pybind11::class_ be a friend class of intrusive_ptr, so that custom + // smart holder in pybind11 
could access the private constructor of + // intrusive_ptr(T*) which took the ownership of the object. This is required + // by customer holder macro PYBIND11_DECLARE_HOLDER_TYPE, where it uses + // intrusive_ptr(TTarget*) to initialize and take ownership of the object. For + // details, see + // https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers + template + friend class pybind11::class_; + + void retain_() noexcept { + if (target_ != NullType::singleton()) { + uint64_t combined = detail::atomic_combined_refcount_increment( + target_->combined_refcount_, detail::kReferenceCountOne); + uint32_t new_refcount = detail::refcount(combined); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + new_refcount != 1, + "intrusive_ptr: Cannot increase refcount after it reached zero."); + + if constexpr (detail::TargetTraits::can_have_pyobject) { + // If the refcount transitioned from 1 to 2, we need to incref the + // PyObject. In other words, we need to ensure that the PyObject stays + // alive now that we have a C++ reference to this object in addition to + // the PyObject itself. + if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) { + target_->incref_pyobject(); + } + } else { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !detail::has_pyobject(combined), + "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set."); + } + } + } + + void reset_() noexcept { + if (target_ != NullType::singleton()) { + reset_not_null_(target_); + } + } + + // C10_NOINLINE to keep binary size a bit smaller. We pass TTarget* here + // to avoid an extra pointer dereference in the call from reset_(). + C10_NOINLINE static void reset_not_null_(TTarget* target) noexcept { + if (detail::is_uniquely_owned( + target->combined_refcount_.load(std::memory_order_acquire))) { + // Both counts are 1, so there are no weak references and + // we are releasing the last strong reference. 
No other + // threads can observe the effects of this target deletion + // call (e.g. calling use_count()) without a data race. + target->combined_refcount_.store(0, std::memory_order_relaxed); + delete target; + return; + } + + auto combined_refcount = detail::atomic_combined_refcount_decrement( + target->combined_refcount_, detail::kReferenceCountOne); + uint32_t new_refcount = detail::refcount(combined_refcount); + bool has_pyobject = detail::has_pyobject(combined_refcount); + if (new_refcount == 0) { + if (detail::weakcount(combined_refcount) == 1) { + delete target; + return; + } + // See comment above about weakcount. As long as refcount>0, + // weakcount is one larger than the actual number of weak references. + // So we need to decrement it here. + release_resources_and_decrement_weakrefs_(target); + } else if constexpr (detail::TargetTraits::can_have_pyobject) { + // If the refcount transitioned from 2 to 1, we need to decref the + // PyObject. In other words, we don't want to keep the PyObject alive if + // there are no C++ references to this object other than the PyObject + // itself. + if (has_pyobject && new_refcount == 1) { + target->decref_pyobject(); + } + } else { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !has_pyobject, + "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set."); + } + } + + C10_NOINLINE static void release_resources_and_decrement_weakrefs_( + TTarget* target) noexcept { + // justification for const_cast: release_resources is basically a + // destructor and a destructor always mutates the object, even for + // const objects. + const_cast*>(target)->release_resources(); + if (detail::atomic_weakcount_decrement(target->combined_refcount_) == 0) { + delete target; + } + } + + // raw pointer constructors are not public because we shouldn't make + // intrusive_ptr out of raw pointers except from inside the make_intrusive(), + // reclaim() and weak_intrusive_ptr::lock() implementations. 
+ + // This constructor will increase the ref counter for you. + // This constructor will be used by the make_intrusive(), and also pybind11, + // which wrap the intrusive_ptr holder around the raw pointer and incref + // correspondingly (pybind11 requires raw pointer constructor to incref by + // default). + explicit intrusive_ptr(TTarget* target) + : intrusive_ptr(target, raw::DontIncreaseRefcount{}) { + if (target_ != NullType::singleton()) { + // We just created result.target_, so we know no other thread has + // access to it, so we know we needn't care about memory ordering. + // (On x86_64, a store with memory_order_relaxed generates a plain old + // `mov`, whereas an atomic increment does a lock-prefixed `add`, which is + // much more expensive: https://godbolt.org/z/eKPzj8.) + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + target_->combined_refcount_.load(std::memory_order_relaxed) == 0, + "intrusive_ptr: Newly-created target had non-zero refcounts. Does its " + "constructor do something strange like incref or create an " + "intrusive_ptr from `this`?"); + target_->combined_refcount_.store( + detail::kUniqueRef, std::memory_order_relaxed); + } + } + + public: + using element_type = TTarget; + + intrusive_ptr() noexcept + : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {} + + /* implicit */ intrusive_ptr(std::nullptr_t) noexcept + : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {} + + // This constructor will not increase the ref counter for you. 
+ // We use the tagged dispatch mechanism to explicitly mark this constructor + // to not increase the refcount + explicit intrusive_ptr( + TTarget* target, + raw::DontIncreaseRefcount /*unused*/) noexcept + : target_(target) {} + + explicit intrusive_ptr(std::unique_ptr rhs) noexcept + : intrusive_ptr(rhs.release()) {} + + intrusive_ptr(intrusive_ptr&& rhs) noexcept : target_(rhs.target_) { + rhs.target_ = NullType::singleton(); + } + + template + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + /* implicit */ intrusive_ptr(intrusive_ptr&& rhs) noexcept + : target_( + detail::assign_ptr_(rhs.target_)) { + static_assert( + std::is_convertible_v, + "Type mismatch. intrusive_ptr move constructor got pointer of wrong type."); + rhs.target_ = FromNullType::singleton(); + } + + intrusive_ptr(const intrusive_ptr& rhs) : target_(rhs.target_) { + retain_(); + } + + template + /* implicit */ intrusive_ptr(const intrusive_ptr& rhs) + : target_( + detail::assign_ptr_(rhs.target_)) { + static_assert( + std::is_convertible_v, + "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type."); + retain_(); + } + + ~intrusive_ptr() noexcept { + reset_(); + } + + intrusive_ptr& operator=(intrusive_ptr&& rhs) & noexcept { + // NOLINTNEXTLINE(*assign*) + return this->template operator= (std::move(rhs)); + } + + template + intrusive_ptr& operator=(intrusive_ptr&& rhs) & noexcept { + static_assert( + std::is_convertible_v, + "Type mismatch. intrusive_ptr move assignment got pointer of wrong type."); + intrusive_ptr tmp = std::move(rhs); + swap(tmp); + return *this; + } + + // Assignment is implemented using copy and swap. That's safe for self + // assignment. 
+ // NOLINTNEXTLINE(bugprone-unhandled-self-assignment) + intrusive_ptr& operator=(const intrusive_ptr& rhs) & noexcept { + // NOLINTNEXTLINE(*assign-operator, *assignment-signature) + return this->template operator= (rhs); + } + + template + intrusive_ptr& operator=( + const intrusive_ptr& rhs) & noexcept { + static_assert( + std::is_convertible_v, + "Type mismatch. intrusive_ptr copy assignment got pointer of wrong type."); + intrusive_ptr tmp = rhs; + swap(tmp); + return *this; + } + + TTarget* get() const noexcept { + return target_; + } + + TTarget& operator*() const noexcept { + return *target_; + } + + TTarget* operator->() const noexcept { + return target_; + } + + operator bool() const noexcept { + return target_ != NullType::singleton(); + } + + void reset() noexcept { + reset_(); + target_ = NullType::singleton(); + } + + void swap(intrusive_ptr& rhs) noexcept { + std::swap(target_, rhs.target_); + } + + // We do a lot of null-pointer checks in our code, good to have this be cheap. + bool defined() const noexcept { + return target_ != NullType::singleton(); + } + + uint32_t use_count() const noexcept { + if (target_ == NullType::singleton()) { + return 0; + } + return target_->refcount(std::memory_order_relaxed); + } + + uint32_t weak_use_count() const noexcept { + if (target_ == NullType::singleton()) { + return 0; + } + return target_->weakcount(std::memory_order_relaxed); + } + + bool unique() const noexcept { + return use_count() == 1; + } + + /** + * Stronger than unique() in that it must not have any weakrefs as well. + */ + bool is_uniquely_owned() const noexcept { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton()); + return detail::is_uniquely_owned( + target_->combined_refcount_.load(std::memory_order_acquire)); + } + + /** + * Returns an owning (!) pointer to the underlying object and makes the + * intrusive_ptr instance invalid. That means the refcount is not decreased. 
+ * You *must* put the returned pointer back into a intrusive_ptr using + * intrusive_ptr::reclaim(ptr) to properly destruct it. + * This is helpful for C APIs. + */ + TTarget* release() noexcept { + // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign) + TTarget* result = target_; + target_ = NullType::singleton(); + return result; + } + + /** + * Takes an owning pointer to TTarget* and creates an intrusive_ptr that takes + * over ownership. That means the refcount is not increased. + * This is the counter-part to intrusive_ptr::release() and the pointer + * passed in *must* have been created using intrusive_ptr::release(). + */ + static intrusive_ptr reclaim(TTarget* owning_ptr) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + owning_ptr == NullType::singleton() || owning_ptr->refcount() == 0 || + owning_ptr->weakcount(), + "TTarget violates the invariant that refcount > 0 => weakcount > 0"); + return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{}); + } + + /** + * Takes an owning pointer to TTarget* and creates an intrusive_ptr + * representing a new reference, i.e. the raw pointer retains + * ownership. + */ + static intrusive_ptr reclaim_copy(TTarget* owning_ptr) { + auto ret = reclaim(owning_ptr); + ret.retain_(); + return ret; + } + + /** + * Allocate a heap object with args and wrap it inside a intrusive_ptr and + * incref. This is a helper function to let make_intrusive() access private + * intrusive_ptr constructors. + */ + template + static intrusive_ptr make(Args&&... args) { + return intrusive_ptr(new TTarget(std::forward(args)...)); + } + + /** + * Turn a new instance of TTarget (e.g., literally allocated + * using new TTarget(...) into an intrusive_ptr. If possible, + * use intrusive_ptr::make instead which statically guarantees + * that the allocation was done properly. 
+ * + * At the moment, the only reason this method exists is because + * pybind11 holder types expect to be able to allocate in + * this way (because pybind11 handles the new allocation itself). + */ + static intrusive_ptr unsafe_steal_from_new(TTarget* raw_ptr) { + return intrusive_ptr(raw_ptr); + } + + /** + * Turn an instance of TTarget that should not be reference counted + * (e.g., allocated into an arena with placement new) into an + * intrusive_ptr. This is gratuitously unsafe and should only be + * used if you can guarantee that the pointer will not escape and be + * refcounted as normal. + * + * `expected_decrefs` is a debugging parameter: it indicates the + * number of strong owners the intrusive_ptr_target in question is + * expected to get. In most use cases, this will likely be 1. + * + * The reason this method exists is for manually sharing + * StorageImpls across Tensors in the static runtime. It needs + * access to private intrusive_ptr members so that the refcounts can + * be initialized to custom values. + */ + static intrusive_ptr unsafe_adapt_non_heap_allocated( + TTarget* raw_ptr, + uint32_t expected_decrefs) { + intrusive_ptr result(raw_ptr, raw::DontIncreaseRefcount{}); + // kImpracticallyHugeReferenceCount is impractically huge for a reference + // count, while being in no danger of overflowing uint32_t. We actually only + // need to initialize the refcount to 2 -- we are just doing an unbalanced + // incref to prevent the non-heap-allocated target from being + // freed, and we are optimizing that incref by directly + // initializing the refcounts rather than doing an expensive + // atomic increment. The reason to use kImpracticallyHugeReferenceCount is + // to accommodate the debug assertions in ~intrusive_ptr_target. 
+#ifdef NDEBUG + expected_decrefs = 0; +#endif + result.target_->combined_refcount_.store( + detail::refcount( + detail::kImpracticallyHugeReferenceCount + expected_decrefs) | + detail::kImpracticallyHugeWeakReferenceCount, + std::memory_order_relaxed); + return result; + } + + /** + * Turn a **non-owning raw pointer** to an intrusive_ptr. It is + * the moral equivalent of enable_shared_from_this on a shared pointer. + * + * This method is only valid for objects that are already live. If + * you are looking for the moral equivalent of unique_ptr(T*) + * constructor, see steal_from_new. + * + * TODO: https://github.com/pytorch/pytorch/issues/56482 + */ + static intrusive_ptr unsafe_reclaim_from_nonowning(TTarget* raw_ptr) { + // See Note [Stack allocated intrusive_ptr_target safety] + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + raw_ptr == NullType::singleton() || raw_ptr->refcount() > 0, + "intrusive_ptr: Can only reclaim pointers that are owned by someone"); + auto ptr = reclaim(raw_ptr); // doesn't increase refcount + ptr.retain_(); + return ptr; + } +}; + +template < + class TTarget, + class NullType = detail::intrusive_target_default_null_type, + class... Args> +inline intrusive_ptr make_intrusive(Args&&... 
args) { + return intrusive_ptr::make(std::forward(args)...); +} + +template +inline void swap( + intrusive_ptr& lhs, + intrusive_ptr& rhs) noexcept { + lhs.swap(rhs); +} + +// To allow intrusive_ptr inside std::map or std::set, we need operator< +template +inline bool operator<( + const intrusive_ptr& lhs, + const intrusive_ptr& rhs) noexcept { + return lhs.get() < rhs.get(); +} + +template +inline bool operator==( + const intrusive_ptr& lhs, + const intrusive_ptr& rhs) noexcept { + return lhs.get() == rhs.get(); +} + +template +inline bool operator==( + const intrusive_ptr& lhs, + std::nullptr_t) noexcept { + return lhs.get() == nullptr; +} + +template +inline bool operator==( + std::nullptr_t, + const intrusive_ptr& rhs) noexcept { + return nullptr == rhs.get(); +} + +template +inline bool operator!=( + const intrusive_ptr& lhs, + const intrusive_ptr& rhs) noexcept { + return !operator==(lhs, rhs); +} + +template +inline bool operator!=( + const intrusive_ptr& lhs, + std::nullptr_t) noexcept { + return !operator==(lhs, nullptr); +} + +template +inline bool operator!=( + std::nullptr_t, + const intrusive_ptr& rhs) noexcept { + return !operator==(nullptr, rhs); +} +template +struct MaybeOwnedTraits> { + using owned_type = c10::intrusive_ptr; + using borrow_type = c10::intrusive_ptr; + + static borrow_type createBorrow(const owned_type& from) { + return borrow_type::reclaim(from.get()); + } + + static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) { + lhs.release(); + lhs = borrow_type::reclaim(rhs.get()); + } + + static void destroyBorrow(borrow_type& toDestroy) { + toDestroy.release(); + } + + static const owned_type& referenceFromBorrow( + const borrow_type& borrow) noexcept { + return borrow; + } + + static const owned_type* pointerFromBorrow( + const borrow_type& borrow) noexcept { + return &borrow; + } + + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) noexcept { + return true; + } +}; + +template < + typename TTarget, + class 
NullType = detail::intrusive_target_default_null_type> +class weak_intrusive_ptr final { + private: + static_assert( + std::is_base_of_v, + "intrusive_ptr can only be used for classes that inherit from intrusive_ptr_target."); +#ifndef _WIN32 + // This static_assert triggers on MSVC + // error C2131: expression did not evaluate to a constant + static_assert( + NullType::singleton() == NullType::singleton(), + "NullType must have a constexpr singleton() method"); +#endif + static_assert( + std::is_base_of_v< + TTarget, + std::remove_pointer_t>, + "NullType::singleton() must return a element_type* pointer"); + + TTarget* target_; + + template + friend class weak_intrusive_ptr; + + void retain_() { + if (target_ != NullType::singleton()) { + uint32_t new_weakcount = + detail::atomic_weakcount_increment(target_->combined_refcount_); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + new_weakcount != 1, + "weak_intrusive_ptr: Cannot increase weakcount after it reached zero."); + } + } + + void reset_() noexcept { + if (target_ != NullType::singleton() && + detail::atomic_weakcount_decrement(target_->combined_refcount_) == 0) { + // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDelete) + delete target_; + } + target_ = NullType::singleton(); + } + + constexpr explicit weak_intrusive_ptr(TTarget* target) : target_(target) {} + + public: + using element_type = TTarget; + + explicit weak_intrusive_ptr(const intrusive_ptr& ptr) + : weak_intrusive_ptr(ptr.get()) { + retain_(); + } + + weak_intrusive_ptr(weak_intrusive_ptr&& rhs) noexcept : target_(rhs.target_) { + rhs.target_ = NullType::singleton(); + } + + template + /* implicit */ weak_intrusive_ptr( + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + weak_intrusive_ptr&& rhs) noexcept + : target_( + detail::assign_ptr_(rhs.target_)) { + static_assert( + std::is_convertible_v, + "Type mismatch. 
weak_intrusive_ptr move constructor got pointer of wrong type."); + rhs.target_ = FromNullType::singleton(); + } + + weak_intrusive_ptr(const weak_intrusive_ptr& rhs) : target_(rhs.target_) { + retain_(); + } + + template + /* implicit */ weak_intrusive_ptr( + const weak_intrusive_ptr& rhs) + : target_( + detail::assign_ptr_(rhs.target_)) { + static_assert( + std::is_convertible_v, + "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type."); + retain_(); + } + + ~weak_intrusive_ptr() noexcept { + reset_(); + } + + weak_intrusive_ptr& operator=(weak_intrusive_ptr&& rhs) & noexcept { + // NOLINTNEXTLINE(*assign*) + return this->template operator= (std::move(rhs)); + } + + template + weak_intrusive_ptr& operator=( + weak_intrusive_ptr&& rhs) & noexcept { + static_assert( + std::is_convertible_v, + "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type."); + weak_intrusive_ptr tmp = std::move(rhs); + swap(tmp); + return *this; + } + + weak_intrusive_ptr& operator=(const weak_intrusive_ptr& rhs) & noexcept { + if (this == &rhs) { + return *this; + } + // NOLINTNEXTLINE(*assign*) + return this->template operator= (rhs); + } + + weak_intrusive_ptr& operator=( + const intrusive_ptr& rhs) & noexcept { + weak_intrusive_ptr tmp(rhs); + swap(tmp); + return *this; + } + + template + weak_intrusive_ptr& operator=( + const weak_intrusive_ptr& rhs) & noexcept { + static_assert( + std::is_convertible_v, + "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type."); + weak_intrusive_ptr tmp = rhs; + swap(tmp); + return *this; + } + + void reset() noexcept { + reset_(); + } + + void swap(weak_intrusive_ptr& rhs) noexcept { + TTarget* tmp = target_; + target_ = rhs.target_; + rhs.target_ = tmp; + } + + // NB: This should ONLY be used by the std::hash implementation + // for weak_intrusive_ptr. 
Another way you could do this is + // friend std::hash, but this triggers two + // bugs: + // + // (1) It triggers an nvcc bug, where std::hash in a friend class + // declaration gets preprocessed into hash, which then cannot + // actually be found. The error in this case looks like: + // + // error: no template named 'hash'; did you mean 'std::hash'? + // + // (2) On OS X, std::hash is declared as a struct, not a class. + // This twings: + // + // error: class 'hash' was previously declared as a struct + // [-Werror,-Wmismatched-tags] + // + // Both of these are work-aroundable, but on the whole, I decided + // it would be simpler and easier to make work if we just expose + // an unsafe getter for target_ + // + TTarget* _unsafe_get_target() const noexcept { + return target_; + } + + uint32_t use_count() const noexcept { + if (target_ == NullType::singleton()) { + return 0; + } + return target_->refcount( + std::memory_order_relaxed); // refcount, not weakcount! + } + + uint32_t weak_use_count() const noexcept { + if (target_ == NullType::singleton()) { + return 0; + } + return target_->weakcount(std::memory_order_relaxed); + } + + bool expired() const noexcept { + return use_count() == 0; + } + + intrusive_ptr lock() const noexcept { + if (target_ == NullType::singleton()) { + return intrusive_ptr(); + } else { + bool increfed = false; + auto combined_refcount = + target_->combined_refcount_.load(std::memory_order_relaxed); + do { + if (detail::refcount(combined_refcount) == 0) { + // Object already destructed, no strong references left anymore. + // Return nullptr. + return intrusive_ptr(); + } + if constexpr (detail::TargetTraits::can_have_pyobject) { + if (detail::has_pyobject(combined_refcount) && + detail::refcount(combined_refcount) == 1 && !increfed) { + // Object has a python wrapper with no other C++ references. 
+ // We need to to incref the Python object before we acquire a + // strong reference to the C++ object to avoid a situation + // where the Python object is deallocated concurrently. + if (!target_->try_incref_pyobject()) { + return intrusive_ptr(); + } + increfed = true; + } + } + } while (!target_->combined_refcount_.compare_exchange_weak( + combined_refcount, + combined_refcount + detail::kReferenceCountOne, + std::memory_order_acquire, + std::memory_order_relaxed)); + + if constexpr (detail::TargetTraits::can_have_pyobject) { + if (increfed && detail::refcount(combined_refcount) != 1) { + target_->decref_pyobject(); + } + } + + return intrusive_ptr( + target_, raw::DontIncreaseRefcount{}); + } + } + + /** + * Returns an owning (but still only weakly referenced) pointer to the + * underlying object and makes the weak_intrusive_ptr instance invalid. + * That means the weakcount is not decreased. + * You *must* put the returned pointer back into a weak_intrusive_ptr using + * weak_intrusive_ptr::reclaim(ptr) to properly destruct it. + * This is helpful for C APIs. + */ + TTarget* release() noexcept { + TTarget* result = target_; + target_ = NullType::singleton(); + return result; + } + + /** + * Takes an owning (but must be weakly referenced) pointer to TTarget* and + * creates a weak_intrusive_ptr that takes over ownership. + * This means that the weakcount is not increased. + * This is the counter-part to weak_intrusive_ptr::release() and the pointer + * passed in *must* have been created using weak_intrusive_ptr::release(). + */ + static weak_intrusive_ptr reclaim(TTarget* owning_weak_ptr) { + // See Note [Stack allocated intrusive_ptr_target safety] + // if refcount > 0, weakcount must be >1 for weak references to exist. + // see weak counting explanation at top of this file. + // if refcount == 0, weakcount only must be >0. 
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + owning_weak_ptr == NullType::singleton() || + owning_weak_ptr->weakcount() > 1 || + (owning_weak_ptr->refcount() == 0 && + owning_weak_ptr->weakcount() > 0), + "weak_intrusive_ptr: Can only weak_intrusive_ptr::reclaim() owning pointers that were created using weak_intrusive_ptr::release()."); + return weak_intrusive_ptr(owning_weak_ptr); + } + + /** + * Takes a pointer to TTarget* (may be weak or strong) and creates a + * new weak_intrusive_ptr representing a new weak reference, i.e. + * the raw pointer retains ownership. + */ + static weak_intrusive_ptr reclaim_copy(TTarget* owning_ptr) { + auto ret = reclaim(owning_ptr); + ret.retain_(); + return ret; + } + + template + friend bool operator<( + const weak_intrusive_ptr& lhs, + const weak_intrusive_ptr& rhs) noexcept; + template + friend bool operator==( + const weak_intrusive_ptr& lhs, + const weak_intrusive_ptr& rhs) noexcept; +}; + +template +inline void swap( + weak_intrusive_ptr& lhs, + weak_intrusive_ptr& rhs) noexcept { + lhs.swap(rhs); +} + +// To allow weak_intrusive_ptr inside std::map or std::set, we need operator< +template +inline bool operator<( + const weak_intrusive_ptr& lhs, + const weak_intrusive_ptr& rhs) noexcept { + return lhs.target_ < rhs.target_; +} + +template +inline bool operator==( + const weak_intrusive_ptr& lhs, + const weak_intrusive_ptr& rhs) noexcept { + return lhs.target_ == rhs.target_; +} + +template +inline bool operator!=( + const weak_intrusive_ptr& lhs, + const weak_intrusive_ptr& rhs) noexcept { + return !operator==(lhs, rhs); +} + +// Alias for documentary purposes, to more easily distinguish +// weak raw intrusive pointers from intrusive pointers. +using weak_intrusive_ptr_target = intrusive_ptr_target; + +// This namespace provides some methods for working with +// raw pointers that subclass intrusive_ptr_target. 
They are not provided +// as methods on intrusive_ptr_target, because ideally you would not need these +// methods at all (use smart pointers), but if you are dealing with legacy code +// that still needs to pass around raw pointers, you may find these quite +// useful. +// +// An important usage note: some functions are only valid if you have a +// strong raw pointer to the object, while others are only valid if you +// have a weak raw pointer to the object. ONLY call intrusive_ptr namespace +// functions on strong pointers, and weak_intrusive_ptr namespace functions +// on weak pointers. If you mix it up, you may get an assert failure. +namespace raw { + +namespace intrusive_ptr { + +// WARNING: Unlike the reclaim() API, it is NOT valid to pass +// NullType::singleton to this function +inline void incref(intrusive_ptr_target* self) { + if (self) { + uint64_t combined = detail::atomic_combined_refcount_increment( + self->combined_refcount_, detail::kReferenceCountOne); + +#ifndef C10_MOBILE + if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) { + self->incref_pyobject(); + } +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined)); +#endif + } +} + +// WARNING: Unlike the reclaim() API, it is NOT valid to pass +// NullType::singleton to this function +inline void decref(intrusive_ptr_target* self) { + // Let it die + c10::intrusive_ptr::reclaim(self); + // NB: Caller still has 'self' pointer, but it's now invalid. 
+ // If you want more safety, used the actual c10::intrusive_ptr class +} + +template +inline T* make_weak(T* self) { + // NB: 'this' is a strong pointer, but we return a weak pointer + auto ptr = c10::intrusive_ptr::reclaim(self); + c10::weak_intrusive_ptr wptr(ptr); + ptr.release(); + return wptr.release(); +} + +inline uint32_t use_count(intrusive_ptr_target* self) { + auto ptr = c10::intrusive_ptr::reclaim(self); + auto r = ptr.use_count(); + ptr.release(); + return r; +} + +} // namespace intrusive_ptr + +namespace weak_intrusive_ptr { + +inline void incref(weak_intrusive_ptr_target* self) { + detail::atomic_weakcount_increment(self->combined_refcount_); +} + +inline void decref(weak_intrusive_ptr_target* self) { + // Let it die + c10::weak_intrusive_ptr::reclaim(self); + // NB: You still "have" the 'self' pointer, but it's now invalid. + // If you want more safety, used the actual c10::weak_intrusive_ptr class +} + +template +inline T* lock(T* self) { + auto wptr = c10::weak_intrusive_ptr::reclaim(self); + auto ptr = wptr.lock(); + wptr.release(); + return ptr.release(); +} + +// This gives the STRONG refcount of a WEAK pointer +inline uint32_t use_count(weak_intrusive_ptr_target* self) { + auto wptr = c10::weak_intrusive_ptr::reclaim(self); + auto r = wptr.use_count(); + wptr.release(); + return r; +} + +} // namespace weak_intrusive_ptr + +} // namespace raw + +} // namespace c10 + +namespace std { +// To allow intrusive_ptr and weak_intrusive_ptr inside std::unordered_map or +// std::unordered_set, we need std::hash +template +struct hash> { + size_t operator()(const c10::intrusive_ptr& x) const { + return std::hash()(x.get()); + } +}; +template +struct hash> { + size_t operator()(const c10::weak_intrusive_ptr& x) const { + return std::hash()(x._unsafe_get_target()); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/irange.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/irange.h new file mode 100644 index 0000000000000000000000000000000000000000..bc2a018db397a56dee0199af77509fc23dfe405b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/irange.h @@ -0,0 +1,128 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright 2004-present Facebook. All Rights Reserved. + +#pragma once + +#include + +#include +#include +#include +#include + +namespace c10 { + +namespace detail { + +template < + typename I, + bool one_sided = false, + std::enable_if_t, int> = 0> +struct integer_iterator { + using iterator_category = std::input_iterator_tag; + using value_type = I; + using difference_type = std::ptrdiff_t; + using pointer = I*; + using reference = I&; + + explicit constexpr integer_iterator(I val) : value(val) {} + + constexpr I operator*() const { + return value; + } + + constexpr I const* operator->() const { + return &value; + } + + constexpr integer_iterator& operator++() { + ++value; + return *this; + } + + constexpr integer_iterator operator++(int) { + const auto copy = *this; + ++*this; + return copy; + } + + constexpr bool operator==(const integer_iterator& other) const { + if constexpr (one_sided) { + // Range-for loops' end test is `begin != end`, not `begin < + // end`. To handle `c10::irange(n)` where n < 0 (which should be + // empty), we just make `begin != end` fail whenever `end` is + // negative. 
+ return is_negative(other.value) || value == other.value; + } else { + return value == other.value; + } + // Suppress "warning: missing return statement at end of non-void function" + // which Nvidia's Robert Crovella confirms is an NVCC compiler error + // here https://stackoverflow.com/a/64561686/752843 on 2020-10-27 + // `__builtin_unreachable();` would be best here, but it's not + // available with all compilers. So we instead return an arbitrary + // value trusting that this line will, in fact, never be reached. + return false; // Horrible hack + } + + constexpr bool operator!=(const integer_iterator& other) const { + return !(*this == other); + } + + protected: + I value; +}; + +} // namespace detail + +template < + typename I, + bool one_sided = false, + std::enable_if_t, bool> = true> +struct integer_range { + public: + constexpr integer_range(I begin, I end) : begin_(begin), end_(end) {} + using iterator = detail::integer_iterator; + constexpr iterator begin() const { + return begin_; + } + constexpr iterator end() const { + return end_; + } + + private: + iterator begin_; + iterator end_; +}; + +/// Creates an integer range for the half-open interval [begin, end) +/// If end<=begin, then the range is empty. +/// The range has the type of the `end` integer; `begin` integer is +/// cast to this type. 
+template < + typename Integer1, + typename Integer2, + std::enable_if_t, bool> = true, + std::enable_if_t, bool> = true> +constexpr integer_range irange(Integer1 begin, Integer2 end) { + // If end<=begin then the range is empty; we can achieve this effect by + // choosing the larger of {begin, end} as the loop terminator + return { + static_cast(begin), + std::max(static_cast(begin), end)}; +} + +/// Creates an integer range for the half-open interval [0, end) +/// If end<=begin, then the range is empty +template < + typename Integer, + std::enable_if_t, bool> = true> +constexpr integer_range irange(Integer end) { + return {Integer(), end}; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/llvmMathExtras.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/llvmMathExtras.h new file mode 100644 index 0000000000000000000000000000000000000000..6884e20d112ace8886c69b10499f830c58c3703f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/llvmMathExtras.h @@ -0,0 +1,910 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains some functions that are useful for math stuff. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __ANDROID_NDK__ +#include +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#ifndef LLVM_GNUC_PREREQ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) +#define LLVM_GNUC_PREREQ(maj, min, patch) \ + ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \ + ((maj) << 20) + ((min) << 10) + (patch)) +#elif defined(__GNUC__) && defined(__GNUC_MINOR__) +#define LLVM_GNUC_PREREQ(maj, min, patch) \ + ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10)) +#else +#define LLVM_GNUC_PREREQ(maj, min, patch) 0 +#endif +#endif + +#ifdef _MSC_VER +// Declare these intrinsics manually rather including intrin.h. It's very +// expensive, and MathExtras.h is popular. +// #include +extern "C" { +unsigned char _BitScanForward(unsigned long* _Index, unsigned long _Mask); +unsigned char _BitScanForward64(unsigned long* _Index, unsigned __int64 _Mask); +unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask); +unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask); +} +#endif + +namespace c10::llvm { +/// The behavior an operation has on an input of 0. +enum ZeroBehavior { + /// The returned value is undefined. + ZB_Undefined, + /// The returned value is numeric_limits::max() + ZB_Max, + /// The returned value is numeric_limits::digits + ZB_Width +}; + +namespace detail { +template +struct TrailingZerosCounter { + static std::size_t count(T Val, ZeroBehavior /*unused*/) { + if (!Val) + return std::numeric_limits::digits; + if (Val & 0x1) + return 0; + + // Bisection method. 
+ std::size_t ZeroBits = 0; + T Shift = std::numeric_limits::digits >> 1; + T Mask = std::numeric_limits::max() >> Shift; + while (Shift) { + if ((Val & Mask) == 0) { + Val >>= Shift; + ZeroBits |= Shift; + } + Shift >>= 1; + Mask >>= Shift; + } + return ZeroBits; + } +}; + +#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(_MSC_VER) +template +struct TrailingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 32; + +#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_ctz(Val); +#elif defined(_MSC_VER) + unsigned long Index; + _BitScanForward(&Index, Val); + return Index; +#endif + } +}; + +#if !defined(_MSC_VER) || defined(_M_X64) +template +struct TrailingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 64; + +#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_ctzll(Val); +#elif defined(_MSC_VER) + unsigned long Index; + _BitScanForward64(&Index, Val); + return Index; +#endif + } +}; +#endif +#endif +} // namespace detail + +/// Count number of 0's from the least significant bit to the most +/// stopping at the first 1. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are +/// valid arguments. +template +std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { + static_assert( + std::numeric_limits::is_integer && !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return llvm::detail::TrailingZerosCounter::count(Val, ZB); +} + +namespace detail { +template +struct LeadingZerosCounter { + static std::size_t count(T Val, ZeroBehavior /*unused*/) { + if (!Val) + return std::numeric_limits::digits; + + // Bisection method. 
+ std::size_t ZeroBits = 0; + for (T Shift = std::numeric_limits::digits >> 1; Shift; Shift >>= 1) { + T Tmp = Val >> Shift; + if (Tmp) + Val = Tmp; + else + ZeroBits |= Shift; + } + return ZeroBits; + } +}; + +#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(_MSC_VER) +template +struct LeadingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 32; + +#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_clz(Val); +#elif defined(_MSC_VER) + unsigned long Index; + _BitScanReverse(&Index, Val); + return Index ^ 31; +#endif + } +}; + +#if !defined(_MSC_VER) || defined(_M_X64) +template +struct LeadingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 64; + +#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_clzll(Val); +#elif defined(_MSC_VER) + unsigned long Index; + _BitScanReverse64(&Index, Val); + return Index ^ 63; +#endif + } +}; +#endif +#endif +} // namespace detail + +/// Count number of 0's from the most significant bit to the least +/// stopping at the first 1. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are +/// valid arguments. +template +std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { + static_assert( + std::numeric_limits::is_integer && !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return llvm::detail::LeadingZerosCounter::count(Val, ZB); +} + +/// Get the index of the first set bit starting from the least +/// significant bit. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are +/// valid arguments. 
+template +T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { + if (ZB == ZB_Max && Val == 0) + return std::numeric_limits::max(); + + return countTrailingZeros(Val, ZB_Undefined); +} + +/// Create a bitmask with the N right-most bits set to 1, and all other +/// bits set to 0. Only unsigned types are allowed. +template +T maskTrailingOnes(unsigned N) { + static_assert(std::is_unsigned_v, "Invalid type!"); + const unsigned Bits = CHAR_BIT * sizeof(T); + assert(N <= Bits && "Invalid bit index"); + return N == 0 ? 0 : (T(-1) >> (Bits - N)); +} + +/// Create a bitmask with the N left-most bits set to 1, and all other +/// bits set to 0. Only unsigned types are allowed. +template +T maskLeadingOnes(unsigned N) { + return ~maskTrailingOnes(CHAR_BIT * sizeof(T) - N); +} + +/// Create a bitmask with the N right-most bits set to 0, and all other +/// bits set to 1. Only unsigned types are allowed. +template +T maskTrailingZeros(unsigned N) { + return maskLeadingOnes(CHAR_BIT * sizeof(T) - N); +} + +/// Create a bitmask with the N left-most bits set to 0, and all other +/// bits set to 1. Only unsigned types are allowed. +template +T maskLeadingZeros(unsigned N) { + return maskTrailingOnes(CHAR_BIT * sizeof(T) - N); +} + +/// Get the index of the last set bit starting from the least +/// significant bit. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are +/// valid arguments. +template +T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { + if (ZB == ZB_Max && Val == 0) + return std::numeric_limits::max(); + + // Use ^ instead of - because both gcc and llvm can remove the associated ^ + // in the __builtin_clz intrinsic on x86. + return countLeadingZeros(Val, ZB_Undefined) ^ + (std::numeric_limits::digits - 1); +} + +/// Macro compressed bit reversal table for 256 bits. 
+/// +/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +/// NOLINTNEXTLINE(*c-arrays*) +static constexpr unsigned char BitReverseTable256[256] = { +#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 +#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) +#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) + R6(0), + R6(2), + R6(1), + R6(3) +#undef R2 +#undef R4 +#undef R6 +}; + +/// Reverse the bits in \p Val. +template +T reverseBits(T Val) { + // NOLINTNEXTLINE(*c-arrays*) + unsigned char in[sizeof(Val)]; + // NOLINTNEXTLINE(*c-arrays*) + unsigned char out[sizeof(Val)]; + std::memcpy(in, &Val, sizeof(Val)); + for (unsigned i = 0; i < sizeof(Val); ++i) + out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; + std::memcpy(&Val, out, sizeof(Val)); + return Val; +} + +// NOTE: The following support functions use the _32/_64 extensions instead of +// type overloading so that signed and unsigned integers can be used without +// ambiguity. + +/// Return the high 32 bits of a 64 bit value. +constexpr inline uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); +} + +/// Return the low 32 bits of a 64 bit value. +constexpr inline uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); +} + +/// Make a 64-bit integer from a high / low pair of 32-bit integers. +constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { + return ((uint64_t)High << 32) | (uint64_t)Low; +} + +/// Checks if an integer fits into the given bit width. +template +constexpr inline bool isInt(int64_t x) { + return N >= 64 || + (-(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1))); +} +// Template specializations to get better code for common cases. 
+template <> +constexpr inline bool isInt<8>(int64_t x) { + return static_cast(x) == x; +} +template <> +constexpr inline bool isInt<16>(int64_t x) { + return static_cast(x) == x; +} +template <> +constexpr inline bool isInt<32>(int64_t x) { + return static_cast(x) == x; +} + +/// Checks if a signed integer is an N bit number shifted left by S. +template +constexpr inline bool isShiftedInt(int64_t x) { + static_assert( + N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); + static_assert(N + S <= 64, "isShiftedInt with N + S > 64 is too wide."); + return isInt(x) && (x % (UINT64_C(1) << S) == 0); +} + +/// Checks if an unsigned integer fits into the given bit width. +/// +/// This is written as two functions rather than as simply +/// +/// return N >= 64 || X < (UINT64_C(1) << N); +/// +/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting +/// left too many places. +template +constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { + static_assert(N > 0, "isUInt<0> doesn't make sense"); + return X < (UINT64_C(1) << N); +} +template +constexpr inline std::enable_if_t= 64, bool> isUInt(uint64_t /*X*/) { + return true; +} + +// Template specializations to get better code for common cases. +template <> +constexpr inline bool isUInt<8>(uint64_t x) { + return static_cast(x) == x; +} +template <> +constexpr inline bool isUInt<16>(uint64_t x) { + return static_cast(x) == x; +} +template <> +constexpr inline bool isUInt<32>(uint64_t x) { + return static_cast(x) == x; +} + +/// Checks if a unsigned integer is an N bit number shifted left by S. +template +constexpr inline bool isShiftedUInt(uint64_t x) { + static_assert( + N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); + static_assert( + N + S <= 64, "isShiftedUInt with N + S > 64 is too wide."); + // Per the two static_asserts above, S must be strictly less than 64. So + // 1 << S is not undefined behavior. 
+ return isUInt(x) && (x % (UINT64_C(1) << S) == 0); +} + +/// Gets the maximum value for a N-bit unsigned integer. +inline uint64_t maxUIntN(uint64_t N) { + assert(N > 0 && N <= 64 && "integer width out of range"); + + // uint64_t(1) << 64 is undefined behavior, so we can't do + // (uint64_t(1) << N) - 1 + // without checking first that N != 64. But this works and doesn't have a + // branch. + return UINT64_MAX >> (64 - N); +} + +// Ignore the false warning "Arithmetic overflow" for MSVC +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + +/// Gets the minimum value for a N-bit signed integer. +inline int64_t minIntN(int64_t N) { + assert(N > 0 && N <= 64 && "integer width out of range"); + // NOLINTNEXTLINE(*-narrowing-conversions) + return -(UINT64_C(1) << (N - 1)); +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/// Gets the maximum value for a N-bit signed integer. +inline int64_t maxIntN(int64_t N) { + assert(N > 0 && N <= 64 && "integer width out of range"); + + // This relies on two's complement wraparound when N == 64, so we convert to + // int64_t only at the very end to avoid UB. + // NOLINTNEXTLINE(*-narrowing-conversions) + return (UINT64_C(1) << (N - 1)) - 1; +} + +/// Checks if an unsigned integer fits into the given (dynamic) bit width. +inline bool isUIntN(unsigned N, uint64_t x) { + return N >= 64 || x <= maxUIntN(N); +} + +/// Checks if an signed integer fits into the given (dynamic) bit width. +inline bool isIntN(unsigned N, int64_t x) { + return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); +} + +/// Return true if the argument is a non-empty sequence of ones starting at the +/// least significant bit with the remainder zero (32 bit version). +/// Ex. isMask_32(0x0000FFFFU) == true. 
+constexpr inline bool isMask_32(uint32_t Value) { + return Value && ((Value + 1) & Value) == 0; +} + +/// Return true if the argument is a non-empty sequence of ones starting at the +/// least significant bit with the remainder zero (64 bit version). +constexpr inline bool isMask_64(uint64_t Value) { + return Value && ((Value + 1) & Value) == 0; +} + +/// Return true if the argument contains a non-empty sequence of ones with the +/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. +constexpr inline bool isShiftedMask_32(uint32_t Value) { + return Value && isMask_32((Value - 1) | Value); +} + +/// Return true if the argument contains a non-empty sequence of ones with the +/// remainder zero (64 bit version.) +constexpr inline bool isShiftedMask_64(uint64_t Value) { + return Value && isMask_64((Value - 1) | Value); +} + +/// Return true if the argument is a power of two > 0. +/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) +constexpr inline bool isPowerOf2_32(uint32_t Value) { + return Value && !(Value & (Value - 1)); +} + +/// Return true if the argument is a power of two > 0 (64 bit edition.) +constexpr inline bool isPowerOf2_64(uint64_t Value) { + return Value && !(Value & (Value - 1)); +} + +/// Count the number of ones from the most significant bit to the first +/// zero bit. +/// +/// Ex. countLeadingOnes(0xFF0FFF00) == 8. +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of all ones. Only ZB_Width and +/// ZB_Undefined are valid arguments. +template +std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { + static_assert( + std::numeric_limits::is_integer && !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return countLeadingZeros(~Value, ZB); +} + +/// Count the number of ones from the least significant bit to the first +/// zero bit. +/// +/// Ex. countTrailingOnes(0x00FF00FF) == 8. +/// Only unsigned integral types are allowed. 
+/// +/// \param ZB the behavior on an input of all ones. Only ZB_Width and +/// ZB_Undefined are valid arguments. +template +std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { + static_assert( + std::numeric_limits::is_integer && !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return countTrailingZeros(~Value, ZB); +} + +namespace detail { +template +struct PopulationCounter { + static unsigned count(T Value) { + // Generic version, forward to 32 bits. + static_assert(SizeOfT <= 4, "Not implemented!"); +#if defined(__GNUC__) && __GNUC__ >= 4 + return __builtin_popcount(Value); +#else + uint32_t v = Value; + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; +#endif + } +}; + +template +struct PopulationCounter { + static unsigned count(T Value) { +#if defined(__GNUC__) && __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value; + v = v - ((v >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); + v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif + } +}; +} // namespace detail + +/// Count the number of set bits in a value. +/// Ex. countPopulation(0xF000F000) = 8 +/// Returns 0 if the word is zero. +template +inline unsigned countPopulation(T Value) { + static_assert( + std::numeric_limits::is_integer && !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return detail::PopulationCounter::count(Value); +} + +/// Return the log base 2 of the specified value. +inline double Log2(double Value) { +#if defined(__ANDROID_API__) && __ANDROID_API__ < 18 + return __builtin_log(Value) / __builtin_log(2.0); +#else + return log2(Value); +#endif +} + +/// Return the floor log base 2 of the specified value, -1 if the value is zero. +/// (32 bit edition.) +/// Ex. 
Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 +inline unsigned Log2_32(uint32_t Value) { + return static_cast(31 - countLeadingZeros(Value)); +} + +/// Return the floor log base 2 of the specified value, -1 if the value is zero. +/// (64 bit edition.) +inline unsigned Log2_64(uint64_t Value) { + return static_cast(63 - countLeadingZeros(Value)); +} + +/// Return the ceil log base 2 of the specified value, 32 if the value is zero. +/// (32 bit edition). +/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 +inline unsigned Log2_32_Ceil(uint32_t Value) { + return static_cast(32 - countLeadingZeros(Value - 1)); +} + +/// Return the ceil log base 2 of the specified value, 64 if the value is zero. +/// (64 bit edition.) +inline unsigned Log2_64_Ceil(uint64_t Value) { + return static_cast(64 - countLeadingZeros(Value - 1)); +} + +/// Return the greatest common divisor of the values using Euclid's algorithm. +inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { + while (B) { + uint64_t T = B; + B = A % B; + A = T; + } + return A; +} + +/// This function takes a 64-bit integer and returns the bit equivalent double. +inline double BitsToDouble(uint64_t Bits) { + double D = 0; + static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); + memcpy(&D, &Bits, sizeof(Bits)); + return D; +} + +/// This function takes a 32-bit integer and returns the bit equivalent float. +inline float BitsToFloat(uint32_t Bits) { + // TODO: Use std::bit_cast once C++20 becomes available. + return c10::bit_cast(Bits); +} + +/// This function takes a double and returns the bit equivalent 64-bit integer. +/// Note that copying doubles around changes the bits of NaNs on some hosts, +/// notably x86, so this routine cannot be used if these bits are needed. 
+inline uint64_t DoubleToBits(double Double) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + uint64_t Bits; + static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); + memcpy(&Bits, &Double, sizeof(Double)); + return Bits; +} + +/// This function takes a float and returns the bit equivalent 32-bit integer. +/// Note that copying floats around changes the bits of NaNs on some hosts, +/// notably x86, so this routine cannot be used if these bits are needed. +inline uint32_t FloatToBits(float Float) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + uint32_t Bits; + static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); + memcpy(&Bits, &Float, sizeof(Float)); + return Bits; +} + +/// A and B are either alignments or offsets. Return the minimum alignment that +/// may be assumed after adding the two together. +constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { + // The largest power of 2 that divides both A and B. + // + // Replace "-Value" by "1+~Value" in the following commented code to avoid + // MSVC warning C4146 + // return (A | B) & -(A | B); + return (A | B) & (1 + ~(A | B)); +} + +/// Aligns \c Addr to \c Alignment bytes, rounding up. +/// +/// Alignment should be a power of two. This method rounds up, so +/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8. +inline uintptr_t alignAddr(const void* Addr, size_t Alignment) { + assert( + Alignment && isPowerOf2_64((uint64_t)Alignment) && + "Alignment is not a power of two!"); + + assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr); + + return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1)); +} + +/// Returns the necessary adjustment for aligning \c Ptr to \c Alignment +/// bytes, rounding up. +inline size_t alignmentAdjustment(const void* Ptr, size_t Alignment) { + return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr; +} + +/// Returns the next power of two (in 64-bits) that is strictly greater than A. 
+/// Returns zero on overflow. +inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +/// Returns the power of two which is less than or equal to the given value. +/// Essentially, it is a floor operation across the domain of powers of two. +inline uint64_t PowerOf2Floor(uint64_t A) { + if (!A) + return 0; + return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); +} + +/// Returns the power of two which is greater than or equal to the given value. +/// Essentially, it is a ceil operation across the domain of powers of two. +inline uint64_t PowerOf2Ceil(uint64_t A) { + if (!A) + return 0; + return NextPowerOf2(A - 1); +} + +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \p Align. \p Align must be non-zero. +/// +/// If non-zero \p Skew is specified, the return value will be a minimal +/// integer that is greater than or equal to \p Value and equal to +/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than +/// \p Align, its value is adjusted to '\p Skew mod \p Align'. +/// +/// Examples: +/// \code +/// alignTo(5, 8) = 8 +/// alignTo(17, 8) = 24 +/// alignTo(~0LL, 8) = 0 +/// alignTo(321, 255) = 510 +/// +/// alignTo(5, 8, 7) = 7 +/// alignTo(17, 8, 1) = 17 +/// alignTo(~0LL, 8, 3) = 3 +/// alignTo(321, 255, 42) = 552 +/// \endcode +inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { + assert(Align != 0u && "Align can't be 0."); + Skew %= Align; + return (Value + Align - 1 - Skew) / Align * Align + Skew; +} + +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \c Align. \c Align must be non-zero. 
+template +constexpr inline uint64_t alignTo(uint64_t Value) { + static_assert(Align != 0u, "Align must be non-zero"); + return (Value + Align - 1) / Align * Align; +} + +/// Returns the integer ceil(Numerator / Denominator). +inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { + return alignTo(Numerator, Denominator) / Denominator; +} + +/// \c alignTo for contexts where a constant expression is required. +/// \sa alignTo +/// +/// \todo FIXME: remove when \c constexpr becomes really \c constexpr +template +struct AlignTo { + static_assert(Align != 0u, "Align must be non-zero"); + template + struct from_value { + static const uint64_t value = (Value + Align - 1) / Align * Align; + }; +}; + +/// Returns the largest uint64_t less than or equal to \p Value and is +/// \p Skew mod \p Align. \p Align must be non-zero +inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { + assert(Align != 0u && "Align can't be 0."); + Skew %= Align; + return (Value - Skew) / Align * Align + Skew; +} + +/// Returns the offset to the next integer (mod 2**64) that is greater than +/// or equal to \p Value and is a multiple of \p Align. \p Align must be +/// non-zero. +inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) { + return alignTo(Value, Align) - Value; +} + +/// Sign-extend the number in the bottom B bits of X to a 32-bit integer. +/// Requires 0 < B <= 32. +template +constexpr inline int32_t SignExtend32(uint32_t X) { + static_assert(B > 0, "Bit width can't be 0."); + static_assert(B <= 32, "Bit width out of range."); + return int32_t(X << (32 - B)) >> (32 - B); +} + +/// Sign-extend the number in the bottom B bits of X to a 32-bit integer. +/// Requires 0 < B < 32. 
+inline int32_t SignExtend32(uint32_t X, unsigned B) { + assert(B > 0 && "Bit width can't be 0."); + assert(B <= 32 && "Bit width out of range."); + return int32_t(X << (32 - B)) >> (32 - B); +} + +/// Sign-extend the number in the bottom B bits of X to a 64-bit integer. +/// Requires 0 < B < 64. +template +constexpr inline int64_t SignExtend64(uint64_t x) { + static_assert(B > 0, "Bit width can't be 0."); + static_assert(B <= 64, "Bit width out of range."); + return int64_t(x << (64 - B)) >> (64 - B); +} + +/// Sign-extend the number in the bottom B bits of X to a 64-bit integer. +/// Requires 0 < B < 64. +inline int64_t SignExtend64(uint64_t X, unsigned B) { + assert(B > 0 && "Bit width can't be 0."); + assert(B <= 64 && "Bit width out of range."); + return int64_t(X << (64 - B)) >> (64 - B); +} + +/// Subtract two unsigned integers, X and Y, of type T and return the absolute +/// value of the result. +template +std::enable_if_t, T> AbsoluteDifference(T X, T Y) { + return std::max(X, Y) - std::min(X, Y); +} + +/// Add two unsigned integers, X and Y, of type T. Clamp the result to the +/// maximum representable value of T on overflow. ResultOverflowed indicates if +/// the result is larger than the maximum representable value of type T. +template +std::enable_if_t, T> SaturatingAdd( + T X, + T Y, + bool* ResultOverflowed = nullptr) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool Dummy; + bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + // Hacker's Delight, p. 29 + T Z = X + Y; + Overflowed = (Z < X || Z < Y); + if (Overflowed) + return std::numeric_limits::max(); + else + return Z; +} + +/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the +/// maximum representable value of T on overflow. ResultOverflowed indicates if +/// the result is larger than the maximum representable value of type T. 
+template +std::enable_if_t, T> SaturatingMultiply( + T X, + T Y, + bool* ResultOverflowed = nullptr) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool Dummy; + bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + + // Hacker's Delight, p. 30 has a different algorithm, but we don't use that + // because it fails for uint16_t (where multiplication can have undefined + // behavior due to promotion to int), and requires a division in addition + // to the multiplication. + + Overflowed = false; + + // Log2(Z) would be either Log2Z or Log2Z + 1. + // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z + // will necessarily be less than Log2Max as desired. + int Log2Z = Log2_64(X) + Log2_64(Y); + const T Max = std::numeric_limits::max(); + int Log2Max = Log2_64(Max); + if (Log2Z < Log2Max) { + return X * Y; + } + if (Log2Z > Log2Max) { + Overflowed = true; + return Max; + } + + // We're going to use the top bit, and maybe overflow one + // bit past it. Multiply all but the bottom bit then add + // that on at the end. + T Z = (X >> 1) * Y; + if (Z & ~(Max >> 1)) { + Overflowed = true; + return Max; + } + Z <<= 1; + if (X & 1) + return SaturatingAdd(Z, Y, ResultOverflowed); + + return Z; +} + +/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to +/// the product. Clamp the result to the maximum representable value of T on +/// overflow. ResultOverflowed indicates if the result is larger than the +/// maximum representable value of type T. +template +std::enable_if_t, T> SaturatingMultiplyAdd( + T X, + T Y, + T A, + bool* ResultOverflowed = nullptr) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool Dummy; + bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + + T Product = SaturatingMultiply(X, Y, &Overflowed); + if (Overflowed) + return Product; + + return SaturatingAdd(A, Product, &Overflowed); +} + +/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. 
+extern const float huge_valf; +} // namespace c10::llvm + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/numa.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/numa.h new file mode 100644 index 0000000000000000000000000000000000000000..4ae58609b5d56135e59075d5428e03a6c99ff230 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/numa.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +C10_DECLARE_bool(caffe2_cpu_numa_enabled); + +namespace c10 { + +/** + * Check whether NUMA is enabled + */ +C10_API bool IsNUMAEnabled(); + +/** + * Bind to a given NUMA node + */ +C10_API void NUMABind(int numa_node_id); + +/** + * Get the NUMA id for a given pointer `ptr` + */ +C10_API int GetNUMANode(const void* ptr); + +/** + * Get number of NUMA nodes + */ +C10_API int GetNumNUMANodes(); + +/** + * Move the memory pointed to by `ptr` of a given size to another NUMA node + */ +C10_API void NUMAMove(void* ptr, size_t size, int numa_node_id); + +/** + * Get the current NUMA node id + */ +C10_API int GetCurrentNUMANode(); + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/order_preserving_flat_hash_map.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/order_preserving_flat_hash_map.h new file mode 100644 index 0000000000000000000000000000000000000000..e991a567ec5eac9c967f4743255de1eb51c9338a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/order_preserving_flat_hash_map.h @@ -0,0 +1,2222 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Taken from +// https://github.com/skarupke/flat_hash_map/blob/2c4687431f978f02a3780e24b8b701d22aa32d9c/flat_hash_map.hpp +// with fixes applied: +// - https://github.com/skarupke/flat_hash_map/pull/25 +// - https://github.com/skarupke/flat_hash_map/pull/26 +// - replace size_t with uint64_t to fix it for 32bit +// - add "GCC diagnostic" pragma to ignore -Wshadow +// - make sherwood_v3_table::convertible_to_iterator public because GCC5 seems +// to have issues with it otherwise +// - fix compiler warnings in operator templated_iterator +// - make use of 'if constexpr' and eliminate AssignIfTrue template + +// Copyright Malte Skarupke 2017. +// Distributed under the Boost Software License, Version 1.0. +// (See http://www.boost.org/LICENSE_1_0.txt) + +// Modified to maintain insertion and deletion order through a doubly-linked +// list + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__ +#else +#define SKA_NOINLINE(...) 
__VA_ARGS__ __attribute__((noinline)) +#endif + +namespace ska_ordered { + +struct prime_number_hash_policy; +struct power_of_two_hash_policy; +struct fibonacci_hash_policy; + +namespace detailv3 { +template +struct functor_storage : Functor { + functor_storage() = default; + functor_storage(const Functor& functor) : Functor(functor) {} + template + Result operator()(Args&&... args) { + return static_cast(*this)(std::forward(args)...); + } + template + Result operator()(Args&&... args) const { + return static_cast(*this)(std::forward(args)...); + } +}; +template +struct functor_storage { + typedef Result (*function_ptr)(Args...); + function_ptr function; + functor_storage(function_ptr function) : function(function) {} + Result operator()(Args... args) const { + return function(std::forward(args)...); + } + operator function_ptr&() { + return function; + } + operator const function_ptr&() { + return function; + } +}; +template +struct KeyOrValueHasher : functor_storage { + typedef functor_storage hasher_storage; + KeyOrValueHasher() = default; + KeyOrValueHasher(const hasher& hash) : hasher_storage(hash) {} + uint64_t operator()(const key_type& key) { + return static_cast(*this)(key); + } + uint64_t operator()(const key_type& key) const { + return static_cast(*this)(key); + } + uint64_t operator()(const value_type& value) { + return static_cast(*this)(value.first); + } + uint64_t operator()(const value_type& value) const { + return static_cast(*this)(value.first); + } + template + uint64_t operator()(const std::pair& value) { + return static_cast(*this)(value.first); + } + template + uint64_t operator()(const std::pair& value) const { + return static_cast(*this)(value.first); + } +}; +template +struct KeyOrValueEquality : functor_storage { + typedef functor_storage equality_storage; + KeyOrValueEquality() = default; + KeyOrValueEquality(const key_equal& equality) : equality_storage(equality) {} + bool operator()(const key_type& lhs, const key_type& rhs) { + return 
static_cast(*this)(lhs, rhs); + } + bool operator()(const key_type& lhs, const value_type& rhs) { + return static_cast(*this)(lhs, rhs.first); + } + bool operator()(const value_type& lhs, const key_type& rhs) { + return static_cast(*this)(lhs.first, rhs); + } + bool operator()(const value_type& lhs, const value_type& rhs) { + return static_cast(*this)(lhs.first, rhs.first); + } + template + bool operator()(const key_type& lhs, const std::pair& rhs) { + return static_cast(*this)(lhs, rhs.first); + } + template + bool operator()(const std::pair& lhs, const key_type& rhs) { + return static_cast(*this)(lhs.first, rhs); + } + template + bool operator()(const value_type& lhs, const std::pair& rhs) { + return static_cast(*this)(lhs.first, rhs.first); + } + template + bool operator()(const std::pair& lhs, const value_type& rhs) { + return static_cast(*this)(lhs.first, rhs.first); + } + template + bool operator()(const std::pair& lhs, const std::pair& rhs) { + return static_cast(*this)(lhs.first, rhs.first); + } +}; +static constexpr int8_t min_lookups = 4; +template +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +struct sherwood_v3_entry { + // NOLINTNEXTLINE(modernize-use-equals-default) + sherwood_v3_entry() {} + sherwood_v3_entry(int8_t distance_from_desired) + : distance_from_desired(distance_from_desired) {} + // NOLINTNEXTLINE(modernize-use-equals-default) + ~sherwood_v3_entry() {} + + bool has_value() const { + return distance_from_desired >= 0; + } + bool is_empty() const { + return distance_from_desired < 0; + } + bool is_at_desired_position() const { + return distance_from_desired <= 0; + } + template + void emplace(int8_t distance, Args&&... 
args) { + new (std::addressof(value)) T(std::forward(args)...); + distance_from_desired = distance; + } + + void destroy_value() { + value.~T(); + distance_from_desired = -1; + } + + sherwood_v3_entry* prev = nullptr; + sherwood_v3_entry* next = nullptr; + int8_t distance_from_desired = -1; + static constexpr int8_t special_end_value = 0; + union { + T value; + }; +}; + +inline int8_t log2(uint64_t value) { + static constexpr std::array table = { + 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, 33, 42, 3, + 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, + 62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, + 56, 45, 25, 31, 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5}; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + value |= value >> 32; + return table[((value - (value >> 1)) * 0x07EDD5E59A4E28C2) >> 58]; +} + +inline uint64_t next_power_of_two(uint64_t i) { + --i; + i |= i >> 1; + i |= i >> 2; + i |= i >> 4; + i |= i >> 8; + i |= i >> 16; + i |= i >> 32; + ++i; + return i; +} + +// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t +// (it takes CWG1558 into account and also works for older compilers) +template +struct make_void { + typedef void type; +}; +template +using void_t = typename make_void::type; + +template +struct HashPolicySelector { + typedef fibonacci_hash_policy type; +}; +template +struct HashPolicySelector> { + typedef typename T::hash_policy type; +}; + +template < + typename T, + typename FindKey, + typename ArgumentHash, + typename Hasher, + typename ArgumentEqual, + typename Equal, + typename ArgumentAlloc, + typename EntryAlloc> +class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal { + using Entry = detailv3::sherwood_v3_entry; + using AllocatorTraits = std::allocator_traits; + using EntryPointer = typename AllocatorTraits::pointer; + + public: + struct convertible_to_iterator; + + using value_type = 
T; + using size_type = uint64_t; + using difference_type = std::ptrdiff_t; + using hasher = ArgumentHash; + using key_equal = ArgumentEqual; + using allocator_type = EntryAlloc; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + + sherwood_v3_table() = default; + explicit sherwood_v3_table( + size_type bucket_count, + const ArgumentHash& hash = ArgumentHash(), + const ArgumentEqual& equal = ArgumentEqual(), + const ArgumentAlloc& alloc = ArgumentAlloc()) + : EntryAlloc(alloc), Hasher(hash), Equal(equal) { + rehash(bucket_count); + } + sherwood_v3_table(size_type bucket_count, const ArgumentAlloc& alloc) + : sherwood_v3_table( + bucket_count, + ArgumentHash(), + ArgumentEqual(), + alloc) {} + sherwood_v3_table( + size_type bucket_count, + const ArgumentHash& hash, + const ArgumentAlloc& alloc) + : sherwood_v3_table(bucket_count, hash, ArgumentEqual(), alloc) {} + explicit sherwood_v3_table(const ArgumentAlloc& alloc) : EntryAlloc(alloc) {} + template + sherwood_v3_table( + It first, + It last, + size_type bucket_count = 0, + const ArgumentHash& hash = ArgumentHash(), + const ArgumentEqual& equal = ArgumentEqual(), + const ArgumentAlloc& alloc = ArgumentAlloc()) + : sherwood_v3_table(bucket_count, hash, equal, alloc) { + insert(first, last); + } + template + sherwood_v3_table( + It first, + It last, + size_type bucket_count, + const ArgumentAlloc& alloc) + : sherwood_v3_table( + first, + last, + bucket_count, + ArgumentHash(), + ArgumentEqual(), + alloc) {} + template + sherwood_v3_table( + It first, + It last, + size_type bucket_count, + const ArgumentHash& hash, + const ArgumentAlloc& alloc) + : sherwood_v3_table( + first, + last, + bucket_count, + hash, + ArgumentEqual(), + alloc) {} + sherwood_v3_table( + std::initializer_list il, + size_type bucket_count = 0, + const ArgumentHash& hash = ArgumentHash(), + const ArgumentEqual& equal = ArgumentEqual(), + const 
ArgumentAlloc& alloc = ArgumentAlloc()) + : sherwood_v3_table(bucket_count, hash, equal, alloc) { + if (bucket_count == 0) + rehash(il.size()); + insert(il.begin(), il.end()); + } + sherwood_v3_table( + std::initializer_list il, + size_type bucket_count, + const ArgumentAlloc& alloc) + : sherwood_v3_table( + il, + bucket_count, + ArgumentHash(), + ArgumentEqual(), + alloc) {} + sherwood_v3_table( + std::initializer_list il, + size_type bucket_count, + const ArgumentHash& hash, + const ArgumentAlloc& alloc) + : sherwood_v3_table(il, bucket_count, hash, ArgumentEqual(), alloc) {} + sherwood_v3_table(const sherwood_v3_table& other) + : sherwood_v3_table( + other, + AllocatorTraits::select_on_container_copy_construction( + other.get_allocator())) {} + sherwood_v3_table(const sherwood_v3_table& other, const ArgumentAlloc& alloc) + : EntryAlloc(alloc), + Hasher(other), + Equal(other), + _max_load_factor(other._max_load_factor) { + rehash_for_other_container(other); + try { + insert(other.begin(), other.end()); + } catch (...) 
{ + clear(); + deallocate_data(entries, num_slots_minus_one, max_lookups); + throw; + } + } + sherwood_v3_table(sherwood_v3_table&& other) noexcept + : EntryAlloc(std::move(other)), + Hasher(std::move(other)), + Equal(std::move(other)) { + swap_pointers(other); + } + sherwood_v3_table( + sherwood_v3_table&& other, + const ArgumentAlloc& alloc) noexcept + : EntryAlloc(alloc), Hasher(std::move(other)), Equal(std::move(other)) { + swap_pointers(other); + } + sherwood_v3_table& operator=(const sherwood_v3_table& other) { + if (this == std::addressof(other)) + return *this; + + clear(); + if constexpr (AllocatorTraits::propagate_on_container_copy_assignment:: + value) { + if (static_cast(*this) != + static_cast(other)) { + reset_to_empty_state(); + } + static_cast(*this) = other; + } + _max_load_factor = other._max_load_factor; + static_cast(*this) = other; + static_cast(*this) = other; + rehash_for_other_container(other); + insert(other.begin(), other.end()); + return *this; + } + sherwood_v3_table& operator=(sherwood_v3_table&& other) noexcept { + if (this == std::addressof(other)) + return *this; + else if constexpr (AllocatorTraits::propagate_on_container_move_assignment:: + value) { + clear(); + reset_to_empty_state(); + static_cast(*this) = std::move(other); + swap_pointers(other); + } else if ( + static_cast(*this) == static_cast(other)) { + swap_pointers(other); + } else { + clear(); + _max_load_factor = other._max_load_factor; + rehash_for_other_container(other); + for (T& elem : other) + emplace(std::move(elem)); + other.clear(); + } + static_cast(*this) = std::move(other); + static_cast(*this) = std::move(other); + return *this; + } + ~sherwood_v3_table() { + clear(); + deallocate_data(entries, num_slots_minus_one, max_lookups); + } + + const allocator_type& get_allocator() const { + return static_cast(*this); + } + const ArgumentEqual& key_eq() const { + return static_cast(*this); + } + const ArgumentHash& hash_function() const { + return static_cast(*this); 
+ } + + template + struct templated_iterator { + templated_iterator() = default; + templated_iterator(EntryPointer current) : current(current) {} + EntryPointer current = EntryPointer(); + + using iterator_category = std::forward_iterator_tag; + using value_type = ValueType; + using difference_type = ptrdiff_t; + using pointer = ValueType*; + using reference = ValueType&; + + friend bool operator==( + const templated_iterator& lhs, + const templated_iterator& rhs) { + return lhs.current == rhs.current; + } + friend bool operator!=( + const templated_iterator& lhs, + const templated_iterator& rhs) { + return !(lhs == rhs); + } + + templated_iterator& operator++() { + current = current->next; + return *this; + } + templated_iterator operator++(int) { + templated_iterator copy(*this); + ++*this; + return copy; + } + + ValueType& operator*() const { + return current->value; + } + ValueType* operator->() const { + return std::addressof(current->value); + } + + // the template automatically disables the operator when value_type is + // already const, because that would cause a lot of compiler warnings + // otherwise. 
+ template < + class target_type = const value_type, + class = std::enable_if_t< + std::is_same_v && + !std::is_same_v>> + operator templated_iterator() const { + return {current}; + } + }; + using iterator = templated_iterator; + using const_iterator = templated_iterator; + + iterator begin() { + return sentinel->next; + } + const_iterator begin() const { + return sentinel->next; + } + const_iterator cbegin() const { + return begin(); + } + iterator end() { + return sentinel; + } + const_iterator end() const { + return sentinel; + } + const_iterator cend() const { + return end(); + } + + iterator find(const FindKey& key) { + uint64_t index = + hash_policy.index_for_hash(hash_object(key), num_slots_minus_one); + EntryPointer it = entries + ptrdiff_t(index); + for (int8_t distance = 0; it->distance_from_desired >= distance; + ++distance, ++it) { + if (compares_equal(key, it->value)) + return {it}; + } + return end(); + } + const_iterator find(const FindKey& key) const { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return const_cast(this)->find(key); + } + uint64_t count(const FindKey& key) const { + return find(key) == end() ? 0 : 1; + } + std::pair equal_range(const FindKey& key) { + iterator found = find(key); + if (found == end()) + return {found, found}; + else + return {found, std::next(found)}; + } + std::pair equal_range( + const FindKey& key) const { + const_iterator found = find(key); + if (found == end()) + return {found, found}; + else + return {found, std::next(found)}; + } + + template + std::pair emplace(Key&& key, Args&&... 
args) { + uint64_t index = + hash_policy.index_for_hash(hash_object(key), num_slots_minus_one); + EntryPointer current_entry = entries + ptrdiff_t(index); + int8_t distance_from_desired = 0; + for (; current_entry->distance_from_desired >= distance_from_desired; + ++current_entry, ++distance_from_desired) { + // insertion of an existing key does not change ordering + if (compares_equal(key, current_entry->value)) + return {{current_entry}, false}; + } + return emplace_new_key( + distance_from_desired, + current_entry, + std::forward(key), + std::forward(args)...); + } + + std::pair insert(const value_type& value) { + return emplace(value); + } + std::pair insert(value_type&& value) { + return emplace(std::move(value)); + } + template + iterator emplace_hint(const_iterator /*unused*/, Args&&... args) { + return emplace(std::forward(args)...).first; + } + iterator insert(const_iterator /*unused*/, const value_type& value) { + return emplace(value).first; + } + iterator insert(const_iterator /*unused*/, value_type&& value) { + return emplace(std::move(value)).first; + } + + template + void insert(It begin, It end) { + for (; begin != end; ++begin) { + emplace(*begin); + } + } + void insert(std::initializer_list il) { + insert(il.begin(), il.end()); + } + + void rehash(uint64_t num_buckets) { + num_buckets = std::max( + num_buckets, + static_cast(std::ceil( + static_cast(num_elements) / + static_cast(_max_load_factor)))); + if (num_buckets == 0) { + reset_to_empty_state(); + return; + } + auto new_prime_index = hash_policy.next_size_over(num_buckets); + if (num_buckets == bucket_count()) + return; + int8_t new_max_lookups = compute_max_lookups(num_buckets); + EntryPointer new_buckets( + AllocatorTraits::allocate(*this, num_buckets + new_max_lookups)); + EntryPointer special_end_item = + new_buckets + static_cast(num_buckets + new_max_lookups - 1); + for (EntryPointer it = new_buckets; it != special_end_item; ++it) + it->distance_from_desired = -1; + 
special_end_item->distance_from_desired = Entry::special_end_value; + std::swap(entries, new_buckets); + std::swap(num_slots_minus_one, num_buckets); + --num_slots_minus_one; + hash_policy.commit(new_prime_index); + int8_t old_max_lookups = max_lookups; + max_lookups = new_max_lookups; + num_elements = 0; + + auto start = sentinel->next; + // point sentinel to itself; + reset_list(); + // reinsert list + for (EntryPointer it = start; it != sentinel;) { + auto next = it->next; + emplace(std::move(it->value)); + it->destroy_value(); + it = next; + } + + deallocate_data(new_buckets, num_buckets, old_max_lookups); + } + + void reserve(uint64_t num_elements_) { + uint64_t required_buckets = num_buckets_for_reserve(num_elements_); + if (required_buckets > bucket_count()) + rehash(required_buckets); + } + + void replace_linked_list_position( + EntryPointer to_be_replaced, + EntryPointer new_node) { + remove_from_list(new_node); + insert_after(new_node, to_be_replaced->prev); + remove_from_list(to_be_replaced); + } + + // the return value is a type that can be converted to an iterator + // the reason for doing this is that it's not free to find the + // iterator pointing at the next element. if you care about the + // next iterator, turn the return value into an iterator + convertible_to_iterator erase(const_iterator to_erase) { + EntryPointer current = to_erase.current; + remove_from_list(current); + current->destroy_value(); + --num_elements; + + for (EntryPointer next = current + ptrdiff_t(1); + !next->is_at_desired_position(); + ++current, ++next) { + // if an entry is being removed, and there are other entries with the + // same hash, the other entries get moved to their desired position by + // reinserting. 
+ current->emplace(next->distance_from_desired - 1, std::move(next->value)); + replace_linked_list_position(next, current); + next->destroy_value(); + } + return {to_erase.current}; + } + + iterator erase(const_iterator begin_it, const_iterator end_it) { + // whenever an entry is removed and there are other entries with the same + // hash, the other entries must get moved to their desired position. + // any reference to a moved entry is invalidated. + // here, we iterate through the range, and make sure that we update + // the pointer to our next entry in the list or the end of the iterator + // when it is invalidated. + + auto curr_iter = begin_it.current; + auto next_iter = curr_iter->next; + auto end_iter = end_it.current; + + while (curr_iter != end_iter) { + remove_from_list(curr_iter); + curr_iter->destroy_value(); + --num_elements; + + for (EntryPointer next_hash_slot = curr_iter + ptrdiff_t(1); + !next_hash_slot->is_at_desired_position(); + ++curr_iter, ++next_hash_slot) { + curr_iter->emplace( + next_hash_slot->distance_from_desired - 1, + std::move(next_hash_slot->value)); + replace_linked_list_position(next_hash_slot, curr_iter); + next_hash_slot->destroy_value(); + + // we are invalidating next_iter or end_iter + if (next_hash_slot == end_iter) { + end_iter = curr_iter; + } else if (next_hash_slot == next_iter) { + next_iter = curr_iter; + } + } + curr_iter = next_iter; + next_iter = curr_iter->next; + } + + return {end_iter}; + } + + uint64_t erase(const FindKey& key) { + auto found = find(key); + if (found == end()) + return 0; + else { + erase(found); + return 1; + } + } + + void clear() { + for (EntryPointer it = entries, + end = it + + static_cast(num_slots_minus_one + max_lookups); + it != end; + ++it) { + if (it->has_value()) + it->destroy_value(); + } + reset_list(); + num_elements = 0; + } + + void shrink_to_fit() { + rehash_for_other_container(*this); + } + + void swap(sherwood_v3_table& other) noexcept { + using std::swap; + 
swap_pointers(other); + swap(static_cast(*this), static_cast(other)); + swap( + static_cast(*this), static_cast(other)); + if (AllocatorTraits::propagate_on_container_swap::value) + swap(static_cast(*this), static_cast(other)); + } + + uint64_t size() const { + return num_elements; + } + uint64_t max_size() const { + return (AllocatorTraits::max_size(*this)) / sizeof(Entry); + } + uint64_t bucket_count() const { + return num_slots_minus_one ? num_slots_minus_one + 1 : 0; + } + size_type max_bucket_count() const { + return (AllocatorTraits::max_size(*this) - min_lookups) / sizeof(Entry); + } + uint64_t bucket(const FindKey& key) const { + return hash_policy.index_for_hash(hash_object(key), num_slots_minus_one); + } + float load_factor() const { + uint64_t buckets = bucket_count(); + if (buckets) + return static_cast(num_elements) / bucket_count(); + else + return 0; + } + void max_load_factor(float value) { + _max_load_factor = value; + } + float max_load_factor() const { + return _max_load_factor; + } + + bool empty() const { + return num_elements == 0; + } + + private: + EntryPointer entries = empty_default_table(); + uint64_t num_slots_minus_one = 0; + typename HashPolicySelector::type hash_policy; + int8_t max_lookups = detailv3::min_lookups - 1; + float _max_load_factor = 0.5f; + uint64_t num_elements = 0; + std::unique_ptr> sentinel_val; + + // head of doubly linked list + EntryPointer sentinel = initSentinel(); + + EntryPointer initSentinel() { + // needs to be a pointer so that hash map can be used with forward declared + // types + sentinel_val = std::make_unique>(); + sentinel = sentinel_val.get(); + reset_list(); + return sentinel; + } + + EntryPointer empty_default_table() { + EntryPointer result = + AllocatorTraits::allocate(*this, detailv3::min_lookups); + EntryPointer special_end_item = + result + static_cast(detailv3::min_lookups - 1); + for (EntryPointer it = result; it != special_end_item; ++it) + it->distance_from_desired = -1; + 
special_end_item->distance_from_desired = Entry::special_end_value; + return result; + } + + static int8_t compute_max_lookups(uint64_t num_buckets) { + int8_t desired = detailv3::log2(num_buckets); + return std::max(detailv3::min_lookups, desired); + } + + uint64_t num_buckets_for_reserve(uint64_t num_elements_) const { + return static_cast(std::ceil( + static_cast(num_elements_) / + std::min(0.5, static_cast(_max_load_factor)))); + } + void rehash_for_other_container(const sherwood_v3_table& other) { + rehash( + std::min(num_buckets_for_reserve(other.size()), other.bucket_count())); + } + + void swap_pointers(sherwood_v3_table& other) { + using std::swap; + swap(hash_policy, other.hash_policy); + swap(entries, other.entries); + swap(num_slots_minus_one, other.num_slots_minus_one); + swap(num_elements, other.num_elements); + swap(max_lookups, other.max_lookups); + swap(_max_load_factor, other._max_load_factor); + swap(sentinel, other.sentinel); + swap(sentinel_val, other.sentinel_val); + } + + void reset_list() { + sentinel->next = sentinel; + sentinel->prev = sentinel; + } + + void remove_from_list(EntryPointer elem) { + elem->prev->next = elem->next; + elem->next->prev = elem->prev; + } + + void insert_after(EntryPointer new_elem, EntryPointer prev) { + auto next = prev->next; + + prev->next = new_elem; + new_elem->prev = prev; + + new_elem->next = next; + next->prev = new_elem; + } + + void swap_adjacent_nodes(EntryPointer before, EntryPointer after) { + // sentinel stays constant, so before->prev cannot equal after + auto before_prev = before->prev; + auto after_next = after->next; + + before_prev->next = after; + after->prev = before_prev; + + after_next->prev = before; + before->next = after_next; + + before->prev = after; + after->next = before; + } + + void swap_positions(EntryPointer p1, EntryPointer p2) { + if (p1 == p2) { + return; + } + if (p1->next == p2) { + return swap_adjacent_nodes(p1, p2); + } else if (p2->next == p1) { + return 
swap_adjacent_nodes(p2, p1); + } + + auto p1_prev = p1->prev; + auto p1_next = p1->next; + + auto p2_prev = p2->prev; + auto p2_next = p2->next; + + p1_prev->next = p2; + p2->prev = p1_prev; + + p1_next->prev = p2; + p2->next = p1_next; + + p2_prev->next = p1; + p1->prev = p2_prev; + + p2_next->prev = p1; + p1->next = p2_next; + } + + void append_to_list(EntryPointer new_tail) { + insert_after(new_tail, sentinel->prev); + } + + template + SKA_NOINLINE(std::pair) + emplace_new_key( + int8_t distance_from_desired, + EntryPointer current_entry, + Key&& key, + Args&&... args) { + using std::swap; + if (num_slots_minus_one == 0 || distance_from_desired == max_lookups || + static_cast(num_elements + 1) > + static_cast(num_slots_minus_one + 1) * + static_cast(_max_load_factor)) { + grow(); + return emplace(std::forward(key), std::forward(args)...); + } else if (current_entry->is_empty()) { + current_entry->emplace( + distance_from_desired, + std::forward(key), + std::forward(args)...); + ++num_elements; + append_to_list(current_entry); + return {{current_entry}, true}; + } + value_type to_insert(std::forward(key), std::forward(args)...); + swap(distance_from_desired, current_entry->distance_from_desired); + // We maintain the invariant that: + // - result.current_entry contains the new value we're inserting + // and is in the LinkedList position of to_insert + // - to_insert contains the value that represents the position of + // result.current_entry + swap(to_insert, current_entry->value); + iterator result = {current_entry}; + for (++distance_from_desired, ++current_entry;; ++current_entry) { + if (current_entry->is_empty()) { + current_entry->emplace(distance_from_desired, std::move(to_insert)); + append_to_list(current_entry); + // now we can swap back the displaced value to its correct position, + // putting the new value we're inserting to the front of the list + swap_positions(current_entry, result.current); + ++num_elements; + return {result, true}; + } else if 
(current_entry->distance_from_desired < distance_from_desired) { + swap(distance_from_desired, current_entry->distance_from_desired); + swap(to_insert, current_entry->value); + // to maintain our invariants we need to swap positions + // of result.current & current_entry: + swap_positions(result.current, current_entry); + ++distance_from_desired; + } else { + ++distance_from_desired; + if (distance_from_desired == max_lookups) { + // the displaced element gets put back into its correct position + // we grow the hash table, and then try again to reinsert the new + // element + swap(to_insert, result.current->value); + grow(); + return emplace(std::move(to_insert)); + } + } + } + } + + void grow() { + rehash(std::max(uint64_t(4), 2 * bucket_count())); + } + + void deallocate_data( + EntryPointer begin, + uint64_t num_slots_minus_one_, + int8_t max_lookups_) { + AllocatorTraits::deallocate( + *this, begin, num_slots_minus_one_ + max_lookups_ + 1); + } + + void reset_to_empty_state() { + deallocate_data(entries, num_slots_minus_one, max_lookups); + entries = empty_default_table(); + num_slots_minus_one = 0; + hash_policy.reset(); + max_lookups = detailv3::min_lookups - 1; + } + + template + uint64_t hash_object(const U& key) { + return static_cast(*this)(key); + } + template + uint64_t hash_object(const U& key) const { + return static_cast(*this)(key); + } + template + bool compares_equal(const L& lhs, const R& rhs) { + return static_cast(*this)(lhs, rhs); + } + + public: + struct convertible_to_iterator { + EntryPointer it; + + operator iterator() { + if (it->has_value()) + return {it}; + else + return ++iterator{it}; + } + operator const_iterator() { + if (it->has_value()) + return {it}; + else + return ++const_iterator{it}; + } + }; +}; +} // namespace detailv3 + +struct prime_number_hash_policy { + static uint64_t mod0(uint64_t /*unused*/) { + return 0llu; + } + static uint64_t mod2(uint64_t hash) { + return hash % 2llu; + } + static uint64_t mod3(uint64_t hash) { 
+ return hash % 3llu; + } + static uint64_t mod5(uint64_t hash) { + return hash % 5llu; + } + static uint64_t mod7(uint64_t hash) { + return hash % 7llu; + } + static uint64_t mod11(uint64_t hash) { + return hash % 11llu; + } + static uint64_t mod13(uint64_t hash) { + return hash % 13llu; + } + static uint64_t mod17(uint64_t hash) { + return hash % 17llu; + } + static uint64_t mod23(uint64_t hash) { + return hash % 23llu; + } + static uint64_t mod29(uint64_t hash) { + return hash % 29llu; + } + static uint64_t mod37(uint64_t hash) { + return hash % 37llu; + } + static uint64_t mod47(uint64_t hash) { + return hash % 47llu; + } + static uint64_t mod59(uint64_t hash) { + return hash % 59llu; + } + static uint64_t mod73(uint64_t hash) { + return hash % 73llu; + } + static uint64_t mod97(uint64_t hash) { + return hash % 97llu; + } + static uint64_t mod127(uint64_t hash) { + return hash % 127llu; + } + static uint64_t mod151(uint64_t hash) { + return hash % 151llu; + } + static uint64_t mod197(uint64_t hash) { + return hash % 197llu; + } + static uint64_t mod251(uint64_t hash) { + return hash % 251llu; + } + static uint64_t mod313(uint64_t hash) { + return hash % 313llu; + } + static uint64_t mod397(uint64_t hash) { + return hash % 397llu; + } + static uint64_t mod499(uint64_t hash) { + return hash % 499llu; + } + static uint64_t mod631(uint64_t hash) { + return hash % 631llu; + } + static uint64_t mod797(uint64_t hash) { + return hash % 797llu; + } + static uint64_t mod1009(uint64_t hash) { + return hash % 1009llu; + } + static uint64_t mod1259(uint64_t hash) { + return hash % 1259llu; + } + static uint64_t mod1597(uint64_t hash) { + return hash % 1597llu; + } + static uint64_t mod2011(uint64_t hash) { + return hash % 2011llu; + } + static uint64_t mod2539(uint64_t hash) { + return hash % 2539llu; + } + static uint64_t mod3203(uint64_t hash) { + return hash % 3203llu; + } + static uint64_t mod4027(uint64_t hash) { + return hash % 4027llu; + } + static uint64_t 
mod5087(uint64_t hash) { + return hash % 5087llu; + } + static uint64_t mod6421(uint64_t hash) { + return hash % 6421llu; + } + static uint64_t mod8089(uint64_t hash) { + return hash % 8089llu; + } + static uint64_t mod10193(uint64_t hash) { + return hash % 10193llu; + } + static uint64_t mod12853(uint64_t hash) { + return hash % 12853llu; + } + static uint64_t mod16193(uint64_t hash) { + return hash % 16193llu; + } + static uint64_t mod20399(uint64_t hash) { + return hash % 20399llu; + } + static uint64_t mod25717(uint64_t hash) { + return hash % 25717llu; + } + static uint64_t mod32401(uint64_t hash) { + return hash % 32401llu; + } + static uint64_t mod40823(uint64_t hash) { + return hash % 40823llu; + } + static uint64_t mod51437(uint64_t hash) { + return hash % 51437llu; + } + static uint64_t mod64811(uint64_t hash) { + return hash % 64811llu; + } + static uint64_t mod81649(uint64_t hash) { + return hash % 81649llu; + } + static uint64_t mod102877(uint64_t hash) { + return hash % 102877llu; + } + static uint64_t mod129607(uint64_t hash) { + return hash % 129607llu; + } + static uint64_t mod163307(uint64_t hash) { + return hash % 163307llu; + } + static uint64_t mod205759(uint64_t hash) { + return hash % 205759llu; + } + static uint64_t mod259229(uint64_t hash) { + return hash % 259229llu; + } + static uint64_t mod326617(uint64_t hash) { + return hash % 326617llu; + } + static uint64_t mod411527(uint64_t hash) { + return hash % 411527llu; + } + static uint64_t mod518509(uint64_t hash) { + return hash % 518509llu; + } + static uint64_t mod653267(uint64_t hash) { + return hash % 653267llu; + } + static uint64_t mod823117(uint64_t hash) { + return hash % 823117llu; + } + static uint64_t mod1037059(uint64_t hash) { + return hash % 1037059llu; + } + static uint64_t mod1306601(uint64_t hash) { + return hash % 1306601llu; + } + static uint64_t mod1646237(uint64_t hash) { + return hash % 1646237llu; + } + static uint64_t mod2074129(uint64_t hash) { + return hash % 
2074129llu; + } + static uint64_t mod2613229(uint64_t hash) { + return hash % 2613229llu; + } + static uint64_t mod3292489(uint64_t hash) { + return hash % 3292489llu; + } + static uint64_t mod4148279(uint64_t hash) { + return hash % 4148279llu; + } + static uint64_t mod5226491(uint64_t hash) { + return hash % 5226491llu; + } + static uint64_t mod6584983(uint64_t hash) { + return hash % 6584983llu; + } + static uint64_t mod8296553(uint64_t hash) { + return hash % 8296553llu; + } + static uint64_t mod10453007(uint64_t hash) { + return hash % 10453007llu; + } + static uint64_t mod13169977(uint64_t hash) { + return hash % 13169977llu; + } + static uint64_t mod16593127(uint64_t hash) { + return hash % 16593127llu; + } + static uint64_t mod20906033(uint64_t hash) { + return hash % 20906033llu; + } + static uint64_t mod26339969(uint64_t hash) { + return hash % 26339969llu; + } + static uint64_t mod33186281(uint64_t hash) { + return hash % 33186281llu; + } + static uint64_t mod41812097(uint64_t hash) { + return hash % 41812097llu; + } + static uint64_t mod52679969(uint64_t hash) { + return hash % 52679969llu; + } + static uint64_t mod66372617(uint64_t hash) { + return hash % 66372617llu; + } + static uint64_t mod83624237(uint64_t hash) { + return hash % 83624237llu; + } + static uint64_t mod105359939(uint64_t hash) { + return hash % 105359939llu; + } + static uint64_t mod132745199(uint64_t hash) { + return hash % 132745199llu; + } + static uint64_t mod167248483(uint64_t hash) { + return hash % 167248483llu; + } + static uint64_t mod210719881(uint64_t hash) { + return hash % 210719881llu; + } + static uint64_t mod265490441(uint64_t hash) { + return hash % 265490441llu; + } + static uint64_t mod334496971(uint64_t hash) { + return hash % 334496971llu; + } + static uint64_t mod421439783(uint64_t hash) { + return hash % 421439783llu; + } + static uint64_t mod530980861(uint64_t hash) { + return hash % 530980861llu; + } + static uint64_t mod668993977(uint64_t hash) { + return 
hash % 668993977llu; + } + static uint64_t mod842879579(uint64_t hash) { + return hash % 842879579llu; + } + static uint64_t mod1061961721(uint64_t hash) { + return hash % 1061961721llu; + } + static uint64_t mod1337987929(uint64_t hash) { + return hash % 1337987929llu; + } + static uint64_t mod1685759167(uint64_t hash) { + return hash % 1685759167llu; + } + static uint64_t mod2123923447(uint64_t hash) { + return hash % 2123923447llu; + } + static uint64_t mod2675975881(uint64_t hash) { + return hash % 2675975881llu; + } + static uint64_t mod3371518343(uint64_t hash) { + return hash % 3371518343llu; + } + static uint64_t mod4247846927(uint64_t hash) { + return hash % 4247846927llu; + } + static uint64_t mod5351951779(uint64_t hash) { + return hash % 5351951779llu; + } + static uint64_t mod6743036717(uint64_t hash) { + return hash % 6743036717llu; + } + static uint64_t mod8495693897(uint64_t hash) { + return hash % 8495693897llu; + } + static uint64_t mod10703903591(uint64_t hash) { + return hash % 10703903591llu; + } + static uint64_t mod13486073473(uint64_t hash) { + return hash % 13486073473llu; + } + static uint64_t mod16991387857(uint64_t hash) { + return hash % 16991387857llu; + } + static uint64_t mod21407807219(uint64_t hash) { + return hash % 21407807219llu; + } + static uint64_t mod26972146961(uint64_t hash) { + return hash % 26972146961llu; + } + static uint64_t mod33982775741(uint64_t hash) { + return hash % 33982775741llu; + } + static uint64_t mod42815614441(uint64_t hash) { + return hash % 42815614441llu; + } + static uint64_t mod53944293929(uint64_t hash) { + return hash % 53944293929llu; + } + static uint64_t mod67965551447(uint64_t hash) { + return hash % 67965551447llu; + } + static uint64_t mod85631228929(uint64_t hash) { + return hash % 85631228929llu; + } + static uint64_t mod107888587883(uint64_t hash) { + return hash % 107888587883llu; + } + static uint64_t mod135931102921(uint64_t hash) { + return hash % 135931102921llu; + } + static 
uint64_t mod171262457903(uint64_t hash) { + return hash % 171262457903llu; + } + static uint64_t mod215777175787(uint64_t hash) { + return hash % 215777175787llu; + } + static uint64_t mod271862205833(uint64_t hash) { + return hash % 271862205833llu; + } + static uint64_t mod342524915839(uint64_t hash) { + return hash % 342524915839llu; + } + static uint64_t mod431554351609(uint64_t hash) { + return hash % 431554351609llu; + } + static uint64_t mod543724411781(uint64_t hash) { + return hash % 543724411781llu; + } + static uint64_t mod685049831731(uint64_t hash) { + return hash % 685049831731llu; + } + static uint64_t mod863108703229(uint64_t hash) { + return hash % 863108703229llu; + } + static uint64_t mod1087448823553(uint64_t hash) { + return hash % 1087448823553llu; + } + static uint64_t mod1370099663459(uint64_t hash) { + return hash % 1370099663459llu; + } + static uint64_t mod1726217406467(uint64_t hash) { + return hash % 1726217406467llu; + } + static uint64_t mod2174897647073(uint64_t hash) { + return hash % 2174897647073llu; + } + static uint64_t mod2740199326961(uint64_t hash) { + return hash % 2740199326961llu; + } + static uint64_t mod3452434812973(uint64_t hash) { + return hash % 3452434812973llu; + } + static uint64_t mod4349795294267(uint64_t hash) { + return hash % 4349795294267llu; + } + static uint64_t mod5480398654009(uint64_t hash) { + return hash % 5480398654009llu; + } + static uint64_t mod6904869625999(uint64_t hash) { + return hash % 6904869625999llu; + } + static uint64_t mod8699590588571(uint64_t hash) { + return hash % 8699590588571llu; + } + static uint64_t mod10960797308051(uint64_t hash) { + return hash % 10960797308051llu; + } + static uint64_t mod13809739252051(uint64_t hash) { + return hash % 13809739252051llu; + } + static uint64_t mod17399181177241(uint64_t hash) { + return hash % 17399181177241llu; + } + static uint64_t mod21921594616111(uint64_t hash) { + return hash % 21921594616111llu; + } + static uint64_t 
mod27619478504183(uint64_t hash) { + return hash % 27619478504183llu; + } + static uint64_t mod34798362354533(uint64_t hash) { + return hash % 34798362354533llu; + } + static uint64_t mod43843189232363(uint64_t hash) { + return hash % 43843189232363llu; + } + static uint64_t mod55238957008387(uint64_t hash) { + return hash % 55238957008387llu; + } + static uint64_t mod69596724709081(uint64_t hash) { + return hash % 69596724709081llu; + } + static uint64_t mod87686378464759(uint64_t hash) { + return hash % 87686378464759llu; + } + static uint64_t mod110477914016779(uint64_t hash) { + return hash % 110477914016779llu; + } + static uint64_t mod139193449418173(uint64_t hash) { + return hash % 139193449418173llu; + } + static uint64_t mod175372756929481(uint64_t hash) { + return hash % 175372756929481llu; + } + static uint64_t mod220955828033581(uint64_t hash) { + return hash % 220955828033581llu; + } + static uint64_t mod278386898836457(uint64_t hash) { + return hash % 278386898836457llu; + } + static uint64_t mod350745513859007(uint64_t hash) { + return hash % 350745513859007llu; + } + static uint64_t mod441911656067171(uint64_t hash) { + return hash % 441911656067171llu; + } + static uint64_t mod556773797672909(uint64_t hash) { + return hash % 556773797672909llu; + } + static uint64_t mod701491027718027(uint64_t hash) { + return hash % 701491027718027llu; + } + static uint64_t mod883823312134381(uint64_t hash) { + return hash % 883823312134381llu; + } + static uint64_t mod1113547595345903(uint64_t hash) { + return hash % 1113547595345903llu; + } + static uint64_t mod1402982055436147(uint64_t hash) { + return hash % 1402982055436147llu; + } + static uint64_t mod1767646624268779(uint64_t hash) { + return hash % 1767646624268779llu; + } + static uint64_t mod2227095190691797(uint64_t hash) { + return hash % 2227095190691797llu; + } + static uint64_t mod2805964110872297(uint64_t hash) { + return hash % 2805964110872297llu; + } + static uint64_t 
mod3535293248537579(uint64_t hash) { + return hash % 3535293248537579llu; + } + static uint64_t mod4454190381383713(uint64_t hash) { + return hash % 4454190381383713llu; + } + static uint64_t mod5611928221744609(uint64_t hash) { + return hash % 5611928221744609llu; + } + static uint64_t mod7070586497075177(uint64_t hash) { + return hash % 7070586497075177llu; + } + static uint64_t mod8908380762767489(uint64_t hash) { + return hash % 8908380762767489llu; + } + static uint64_t mod11223856443489329(uint64_t hash) { + return hash % 11223856443489329llu; + } + static uint64_t mod14141172994150357(uint64_t hash) { + return hash % 14141172994150357llu; + } + static uint64_t mod17816761525534927(uint64_t hash) { + return hash % 17816761525534927llu; + } + static uint64_t mod22447712886978529(uint64_t hash) { + return hash % 22447712886978529llu; + } + static uint64_t mod28282345988300791(uint64_t hash) { + return hash % 28282345988300791llu; + } + static uint64_t mod35633523051069991(uint64_t hash) { + return hash % 35633523051069991llu; + } + static uint64_t mod44895425773957261(uint64_t hash) { + return hash % 44895425773957261llu; + } + static uint64_t mod56564691976601587(uint64_t hash) { + return hash % 56564691976601587llu; + } + static uint64_t mod71267046102139967(uint64_t hash) { + return hash % 71267046102139967llu; + } + static uint64_t mod89790851547914507(uint64_t hash) { + return hash % 89790851547914507llu; + } + static uint64_t mod113129383953203213(uint64_t hash) { + return hash % 113129383953203213llu; + } + static uint64_t mod142534092204280003(uint64_t hash) { + return hash % 142534092204280003llu; + } + static uint64_t mod179581703095829107(uint64_t hash) { + return hash % 179581703095829107llu; + } + static uint64_t mod226258767906406483(uint64_t hash) { + return hash % 226258767906406483llu; + } + static uint64_t mod285068184408560057(uint64_t hash) { + return hash % 285068184408560057llu; + } + static uint64_t mod359163406191658253(uint64_t hash) { 
+ return hash % 359163406191658253llu; + } + static uint64_t mod452517535812813007(uint64_t hash) { + return hash % 452517535812813007llu; + } + static uint64_t mod570136368817120201(uint64_t hash) { + return hash % 570136368817120201llu; + } + static uint64_t mod718326812383316683(uint64_t hash) { + return hash % 718326812383316683llu; + } + static uint64_t mod905035071625626043(uint64_t hash) { + return hash % 905035071625626043llu; + } + static uint64_t mod1140272737634240411(uint64_t hash) { + return hash % 1140272737634240411llu; + } + static uint64_t mod1436653624766633509(uint64_t hash) { + return hash % 1436653624766633509llu; + } + static uint64_t mod1810070143251252131(uint64_t hash) { + return hash % 1810070143251252131llu; + } + static uint64_t mod2280545475268481167(uint64_t hash) { + return hash % 2280545475268481167llu; + } + static uint64_t mod2873307249533267101(uint64_t hash) { + return hash % 2873307249533267101llu; + } + static uint64_t mod3620140286502504283(uint64_t hash) { + return hash % 3620140286502504283llu; + } + static uint64_t mod4561090950536962147(uint64_t hash) { + return hash % 4561090950536962147llu; + } + static uint64_t mod5746614499066534157(uint64_t hash) { + return hash % 5746614499066534157llu; + } + static uint64_t mod7240280573005008577(uint64_t hash) { + return hash % 7240280573005008577llu; + } + static uint64_t mod9122181901073924329(uint64_t hash) { + return hash % 9122181901073924329llu; + } + static uint64_t mod11493228998133068689(uint64_t hash) { + return hash % 11493228998133068689llu; + } + static uint64_t mod14480561146010017169(uint64_t hash) { + return hash % 14480561146010017169llu; + } + static uint64_t mod18446744073709551557(uint64_t hash) { + return hash % 18446744073709551557llu; + } + + using mod_function = uint64_t (*)(uint64_t); + + mod_function next_size_over(uint64_t& size) const { + // prime numbers generated by the following method: + // 1. start with a prime p = 2 + // 2. 
go to wolfram alpha and get p = NextPrime(2 * p) + // 3. repeat 2. until you overflow 64 bits + // you now have large gaps which you would hit if somebody called reserve() + // with an unlucky number. + // 4. to fill the gaps for every prime p go to wolfram alpha and get + // ClosestPrime(p * 2^(1/3)) and ClosestPrime(p * 2^(2/3)) and put those in + // the gaps + // 5. get PrevPrime(2^64) and put it at the end + // NOLINTNEXTLINE(*c-array*) + static constexpr const uint64_t prime_list[] = { + 2llu, + 3llu, + 5llu, + 7llu, + 11llu, + 13llu, + 17llu, + 23llu, + 29llu, + 37llu, + 47llu, + 59llu, + 73llu, + 97llu, + 127llu, + 151llu, + 197llu, + 251llu, + 313llu, + 397llu, + 499llu, + 631llu, + 797llu, + 1009llu, + 1259llu, + 1597llu, + 2011llu, + 2539llu, + 3203llu, + 4027llu, + 5087llu, + 6421llu, + 8089llu, + 10193llu, + 12853llu, + 16193llu, + 20399llu, + 25717llu, + 32401llu, + 40823llu, + 51437llu, + 64811llu, + 81649llu, + 102877llu, + 129607llu, + 163307llu, + 205759llu, + 259229llu, + 326617llu, + 411527llu, + 518509llu, + 653267llu, + 823117llu, + 1037059llu, + 1306601llu, + 1646237llu, + 2074129llu, + 2613229llu, + 3292489llu, + 4148279llu, + 5226491llu, + 6584983llu, + 8296553llu, + 10453007llu, + 13169977llu, + 16593127llu, + 20906033llu, + 26339969llu, + 33186281llu, + 41812097llu, + 52679969llu, + 66372617llu, + 83624237llu, + 105359939llu, + 132745199llu, + 167248483llu, + 210719881llu, + 265490441llu, + 334496971llu, + 421439783llu, + 530980861llu, + 668993977llu, + 842879579llu, + 1061961721llu, + 1337987929llu, + 1685759167llu, + 2123923447llu, + 2675975881llu, + 3371518343llu, + 4247846927llu, + 5351951779llu, + 6743036717llu, + 8495693897llu, + 10703903591llu, + 13486073473llu, + 16991387857llu, + 21407807219llu, + 26972146961llu, + 33982775741llu, + 42815614441llu, + 53944293929llu, + 67965551447llu, + 85631228929llu, + 107888587883llu, + 135931102921llu, + 171262457903llu, + 215777175787llu, + 271862205833llu, + 342524915839llu, + 
431554351609llu, + 543724411781llu, + 685049831731llu, + 863108703229llu, + 1087448823553llu, + 1370099663459llu, + 1726217406467llu, + 2174897647073llu, + 2740199326961llu, + 3452434812973llu, + 4349795294267llu, + 5480398654009llu, + 6904869625999llu, + 8699590588571llu, + 10960797308051llu, + 13809739252051llu, + 17399181177241llu, + 21921594616111llu, + 27619478504183llu, + 34798362354533llu, + 43843189232363llu, + 55238957008387llu, + 69596724709081llu, + 87686378464759llu, + 110477914016779llu, + 139193449418173llu, + 175372756929481llu, + 220955828033581llu, + 278386898836457llu, + 350745513859007llu, + 441911656067171llu, + 556773797672909llu, + 701491027718027llu, + 883823312134381llu, + 1113547595345903llu, + 1402982055436147llu, + 1767646624268779llu, + 2227095190691797llu, + 2805964110872297llu, + 3535293248537579llu, + 4454190381383713llu, + 5611928221744609llu, + 7070586497075177llu, + 8908380762767489llu, + 11223856443489329llu, + 14141172994150357llu, + 17816761525534927llu, + 22447712886978529llu, + 28282345988300791llu, + 35633523051069991llu, + 44895425773957261llu, + 56564691976601587llu, + 71267046102139967llu, + 89790851547914507llu, + 113129383953203213llu, + 142534092204280003llu, + 179581703095829107llu, + 226258767906406483llu, + 285068184408560057llu, + 359163406191658253llu, + 452517535812813007llu, + 570136368817120201llu, + 718326812383316683llu, + 905035071625626043llu, + 1140272737634240411llu, + 1436653624766633509llu, + 1810070143251252131llu, + 2280545475268481167llu, + 2873307249533267101llu, + 3620140286502504283llu, + 4561090950536962147llu, + 5746614499066534157llu, + 7240280573005008577llu, + 9122181901073924329llu, + 11493228998133068689llu, + 14480561146010017169llu, + 18446744073709551557llu}; + // NOLINTNEXTLINE(*c-array*) + static constexpr uint64_t (*const mod_functions[])(uint64_t) = { + &mod0, + &mod2, + &mod3, + &mod5, + &mod7, + &mod11, + &mod13, + &mod17, + &mod23, + &mod29, + &mod37, + &mod47, + &mod59, + &mod73, 
+ &mod97, + &mod127, + &mod151, + &mod197, + &mod251, + &mod313, + &mod397, + &mod499, + &mod631, + &mod797, + &mod1009, + &mod1259, + &mod1597, + &mod2011, + &mod2539, + &mod3203, + &mod4027, + &mod5087, + &mod6421, + &mod8089, + &mod10193, + &mod12853, + &mod16193, + &mod20399, + &mod25717, + &mod32401, + &mod40823, + &mod51437, + &mod64811, + &mod81649, + &mod102877, + &mod129607, + &mod163307, + &mod205759, + &mod259229, + &mod326617, + &mod411527, + &mod518509, + &mod653267, + &mod823117, + &mod1037059, + &mod1306601, + &mod1646237, + &mod2074129, + &mod2613229, + &mod3292489, + &mod4148279, + &mod5226491, + &mod6584983, + &mod8296553, + &mod10453007, + &mod13169977, + &mod16593127, + &mod20906033, + &mod26339969, + &mod33186281, + &mod41812097, + &mod52679969, + &mod66372617, + &mod83624237, + &mod105359939, + &mod132745199, + &mod167248483, + &mod210719881, + &mod265490441, + &mod334496971, + &mod421439783, + &mod530980861, + &mod668993977, + &mod842879579, + &mod1061961721, + &mod1337987929, + &mod1685759167, + &mod2123923447, + &mod2675975881, + &mod3371518343, + &mod4247846927, + &mod5351951779, + &mod6743036717, + &mod8495693897, + &mod10703903591, + &mod13486073473, + &mod16991387857, + &mod21407807219, + &mod26972146961, + &mod33982775741, + &mod42815614441, + &mod53944293929, + &mod67965551447, + &mod85631228929, + &mod107888587883, + &mod135931102921, + &mod171262457903, + &mod215777175787, + &mod271862205833, + &mod342524915839, + &mod431554351609, + &mod543724411781, + &mod685049831731, + &mod863108703229, + &mod1087448823553, + &mod1370099663459, + &mod1726217406467, + &mod2174897647073, + &mod2740199326961, + &mod3452434812973, + &mod4349795294267, + &mod5480398654009, + &mod6904869625999, + &mod8699590588571, + &mod10960797308051, + &mod13809739252051, + &mod17399181177241, + &mod21921594616111, + &mod27619478504183, + &mod34798362354533, + &mod43843189232363, + &mod55238957008387, + &mod69596724709081, + &mod87686378464759, + 
&mod110477914016779, + &mod139193449418173, + &mod175372756929481, + &mod220955828033581, + &mod278386898836457, + &mod350745513859007, + &mod441911656067171, + &mod556773797672909, + &mod701491027718027, + &mod883823312134381, + &mod1113547595345903, + &mod1402982055436147, + &mod1767646624268779, + &mod2227095190691797, + &mod2805964110872297, + &mod3535293248537579, + &mod4454190381383713, + &mod5611928221744609, + &mod7070586497075177, + &mod8908380762767489, + &mod11223856443489329, + &mod14141172994150357, + &mod17816761525534927, + &mod22447712886978529, + &mod28282345988300791, + &mod35633523051069991, + &mod44895425773957261, + &mod56564691976601587, + &mod71267046102139967, + &mod89790851547914507, + &mod113129383953203213, + &mod142534092204280003, + &mod179581703095829107, + &mod226258767906406483, + &mod285068184408560057, + &mod359163406191658253, + &mod452517535812813007, + &mod570136368817120201, + &mod718326812383316683, + &mod905035071625626043, + &mod1140272737634240411, + &mod1436653624766633509, + &mod1810070143251252131, + &mod2280545475268481167, + &mod2873307249533267101, + &mod3620140286502504283, + &mod4561090950536962147, + &mod5746614499066534157, + &mod7240280573005008577, + &mod9122181901073924329, + &mod11493228998133068689, + &mod14480561146010017169, + &mod18446744073709551557}; + const uint64_t* found = std::lower_bound( + std::begin(prime_list), std::end(prime_list) - 1, size); + size = *found; + return mod_functions[1 + found - prime_list]; + } + void commit(mod_function new_mod_function) { + current_mod_function = new_mod_function; + } + void reset() { + current_mod_function = &mod0; + } + + uint64_t index_for_hash(uint64_t hash, uint64_t /*num_slots_minus_one*/) + const { + return current_mod_function(hash); + } + uint64_t keep_in_range(uint64_t index, uint64_t num_slots_minus_one) const { + return index > num_slots_minus_one ? 
current_mod_function(index) : index; + } + + private: + mod_function current_mod_function = &mod0; +}; + +struct power_of_two_hash_policy { + uint64_t index_for_hash(uint64_t hash, uint64_t num_slots_minus_one) const { + return hash & num_slots_minus_one; + } + uint64_t keep_in_range(uint64_t index, uint64_t num_slots_minus_one) const { + return index_for_hash(index, num_slots_minus_one); + } + int8_t next_size_over(uint64_t& size) const { + size = detailv3::next_power_of_two(size); + return 0; + } + void commit(int8_t /*unused*/) {} + void reset() {} +}; + +struct fibonacci_hash_policy { + uint64_t index_for_hash(uint64_t hash, uint64_t /*num_slots_minus_one*/) + const { + return (11400714819323198485ull * hash) >> shift; + } + uint64_t keep_in_range(uint64_t index, uint64_t num_slots_minus_one) const { + return index & num_slots_minus_one; + } + + int8_t next_size_over(uint64_t& size) const { + size = std::max(uint64_t(2), detailv3::next_power_of_two(size)); + return static_cast(64 - detailv3::log2(size)); + } + void commit(int8_t shift_) { + shift = shift_; + } + void reset() { + shift = 63; + } + + private: + int8_t shift = 63; +}; + +template < + typename K, + typename V, + typename H = std::hash, + typename E = std::equal_to, + typename A = std::allocator>> +class order_preserving_flat_hash_map + : public detailv3::sherwood_v3_table< + std::pair, + K, + H, + detailv3::KeyOrValueHasher, H>, + E, + detailv3::KeyOrValueEquality, E>, + A, + typename std::allocator_traits::template rebind_alloc< + detailv3::sherwood_v3_entry>>> { + using Table = detailv3::sherwood_v3_table< + std::pair, + K, + H, + detailv3::KeyOrValueHasher, H>, + E, + detailv3::KeyOrValueEquality, E>, + A, + typename std::allocator_traits::template rebind_alloc< + detailv3::sherwood_v3_entry>>>; + + public: + using key_type = K; + using mapped_type = V; + + using Table::Table; + order_preserving_flat_hash_map() = default; + + inline V& operator[](const K& key) { + return emplace(key, 
convertible_to_value()).first->second; + } + inline V& operator[](K&& key) { + return emplace(std::move(key), convertible_to_value()).first->second; + } + V& at(const K& key) { + auto found = this->find(key); + if (found == this->end()) + throw std::out_of_range("Argument passed to at() was not in the map."); + return found->second; + } + const V& at(const K& key) const { + auto found = this->find(key); + if (found == this->end()) + throw std::out_of_range("Argument passed to at() was not in the map."); + return found->second; + } + + using Table::emplace; + std::pair emplace() { + return emplace(key_type(), convertible_to_value()); + } + template + std::pair insert_or_assign( + const key_type& key, + M&& m) { + auto emplace_result = emplace(key, std::forward(m)); + if (!emplace_result.second) + emplace_result.first->second = std::forward(m); + return emplace_result; + } + template + std::pair insert_or_assign( + key_type&& key, + M&& m) { + auto emplace_result = emplace(std::move(key), std::forward(m)); + if (!emplace_result.second) + emplace_result.first->second = std::forward(m); + return emplace_result; + } + template + typename Table::iterator insert_or_assign( + typename Table::const_iterator /*unused*/, + const key_type& key, + M&& m) { + return insert_or_assign(key, std::forward(m)).first; + } + template + typename Table::iterator insert_or_assign( + typename Table::const_iterator /*unused*/, + key_type&& key, + M&& m) { + return insert_or_assign(std::move(key), std::forward(m)).first; + } + + friend bool operator==( + const order_preserving_flat_hash_map& lhs, + const order_preserving_flat_hash_map& rhs) { + if (lhs.size() != rhs.size()) + return false; + for (const typename Table::value_type& value : lhs) { + auto found = rhs.find(value.first); + if (found == rhs.end() || value.second != found->second) + return false; + } + return true; + } + friend bool operator!=( + const order_preserving_flat_hash_map& lhs, + const order_preserving_flat_hash_map& rhs) 
{ + return !(lhs == rhs); + } + + private: + struct convertible_to_value { + operator V() const { + return V(); + } + }; +}; + +template < + typename T, + typename H = std::hash, + typename E = std::equal_to, + typename A = std::allocator> +class flat_hash_set + : public detailv3::sherwood_v3_table< + T, + T, + H, + detailv3::functor_storage, + E, + detailv3::functor_storage, + A, + typename std::allocator_traits::template rebind_alloc< + detailv3::sherwood_v3_entry>> { + using Table = detailv3::sherwood_v3_table< + T, + T, + H, + detailv3::functor_storage, + E, + detailv3::functor_storage, + A, + typename std::allocator_traits::template rebind_alloc< + detailv3::sherwood_v3_entry>>; + + public: + using key_type = T; + + using Table::Table; + flat_hash_set() = default; + + template + std::pair emplace(Args&&... args) { + return Table::emplace(T(std::forward(args)...)); + } + std::pair emplace(const key_type& arg) { + return Table::emplace(arg); + } + std::pair emplace(key_type& arg) { + return Table::emplace(arg); + } + std::pair emplace(const key_type&& arg) { + return Table::emplace(std::move(arg)); + } + std::pair emplace(key_type&& arg) { + return Table::emplace(std::move(arg)); + } + + friend bool operator==(const flat_hash_set& lhs, const flat_hash_set& rhs) { + if (lhs.size() != rhs.size()) + return false; + for (const T& value : lhs) { + if (rhs.find(value) == rhs.end()) + return false; + } + return true; + } + friend bool operator!=(const flat_hash_set& lhs, const flat_hash_set& rhs) { + return !(lhs == rhs); + } +}; + +template +struct power_of_two_std_hash : std::hash { + typedef ska_ordered::power_of_two_hash_policy hash_policy; +}; + +} // namespace ska_ordered + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overflows.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overflows.h new file mode 100644 index 0000000000000000000000000000000000000000..e414de5aaab43b00062139b718067b14be4422ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overflows.h @@ -0,0 +1,105 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace c10 { +// In some versions of MSVC, there will be a compiler error when building. +// C4146: unary minus operator applied to unsigned type, result still unsigned +// C4804: unsafe use of type 'bool' in operation +// It can be addressed by disabling the following warning. +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4146) +#pragma warning(disable : 4804) +#pragma warning(disable : 4018) +#endif + +// The overflow checks may involve float to int conversion which may +// trigger precision loss warning. Re-enable the warning once the code +// is fixed. See T58053069. +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +// bool can be converted to any type. 
+// Without specializing on bool, in pytorch_linux_trusty_py2_7_9_build: +// `error: comparison of constant '255' with boolean expression is always false` +// for `f > limit::max()` below +template +std::enable_if_t, bool> overflows( + From /*f*/, + bool strict_unsigned [[maybe_unused]] = false) { + return false; +} + +// skip isnan and isinf check for integral types +template +std::enable_if_t && !std::is_same_v, bool> +overflows(From f, bool strict_unsigned = false) { + using limit = std::numeric_limits::type>; + if constexpr (!limit::is_signed && std::numeric_limits::is_signed) { + // allow for negative numbers to wrap using two's complement arithmetic. + // For example, with uint8, this allows for `a - b` to be treated as + // `a + 255 * b`. + if (!strict_unsigned) { + return greater_than_max(f) || + (c10::is_negative(f) && + -static_cast(f) > static_cast(limit::max())); + } + } + return c10::less_than_lowest(f) || greater_than_max(f); +} + +template +std::enable_if_t, bool> overflows( + From f, + bool strict_unsigned [[maybe_unused]] = false) { + using limit = std::numeric_limits::type>; + if (limit::has_infinity && std::isinf(static_cast(f))) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +C10_CLANG_DIAGNOSTIC_POP() + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +template +std::enable_if_t::value, bool> overflows( + From f, + bool strict_unsigned = false) { + // casts from complex to real are considered to overflow if the + // imaginary component is non-zero + if (!is_complex::value && f.imag() != 0) { + return true; + } + // Check for overflow componentwise + // (Technically, the imag overflow check is guaranteed to be false + // when !is_complex, but any optimizer worth its salt will be + // able to figure it out.) 
+ return overflows< + typename scalar_value_type::type, + typename From::value_type>(f.real(), strict_unsigned) || + overflows< + typename scalar_value_type::type, + typename From::value_type>(f.imag(), strict_unsigned); +} +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overloaded.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overloaded.h new file mode 100644 index 0000000000000000000000000000000000000000..9c1571b57e808ab068dd5456e1ea83dfd9fd6342 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overloaded.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +namespace c10 { +namespace detail { + +template +struct overloaded_t {}; + +template +struct overloaded_t : T0 { + using T0::operator(); + overloaded_t(T0 t0) : T0(std::move(t0)) {} +}; +template +struct overloaded_t : T0, overloaded_t { + using T0::operator(); + using overloaded_t::operator(); + overloaded_t(T0 t0, Ts... ts) + : T0(std::move(t0)), overloaded_t(std::move(ts)...) {} +}; + +} // namespace detail + +// Construct an overloaded callable combining multiple callables, e.g. lambdas +template +detail::overloaded_t overloaded(Ts... ts) { + return {std::move(ts)...}; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/python_stub.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/python_stub.h new file mode 100644 index 0000000000000000000000000000000000000000..f457be5949a775e9ce3f4b8b39d8c4bbe95985b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/python_stub.h @@ -0,0 +1,9 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +struct _object; +using PyObject = _object; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint32.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint32.h new file mode 100644 index 0000000000000000000000000000000000000000..2b48a5a89c503e4a3ddae1aee65695044d1a3384 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint32.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint8.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint8.h new file mode 100644 index 0000000000000000000000000000000000000000..47f7a9e42540c917299479e9bda73da37083e082 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint8.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/quint4x2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/quint4x2.h new file mode 100644 index 0000000000000000000000000000000000000000..b4603a707c35a3a24eee27c4eea54c025f49454b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/quint4x2.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/signal_handler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/signal_handler.h new file mode 100644 index 0000000000000000000000000000000000000000..60b2c344e0639fb6490a1c300cb77469f111bd62 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/signal_handler.h @@ -0,0 +1,124 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +#if defined(__APPLE__) +#define C10_SUPPORTS_SIGNAL_HANDLER +#elif defined(__linux__) && !defined(C10_DISABLE_SIGNAL_HANDLERS) +#define C10_SUPPORTS_FATAL_SIGNAL_HANDLERS +#define C10_SUPPORTS_SIGNAL_HANDLER +#endif + +#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS) +#include +#endif + +namespace c10 { + +class C10_API SignalHandler { + public: + enum class Action { NONE, STOP }; + + // Constructor. Specify what action to take when a signal is received. + SignalHandler(Action SIGINT_action, Action SIGHUP_action); + + SignalHandler(const SignalHandler&) = delete; + SignalHandler(SignalHandler&&) = delete; + SignalHandler& operator=(const SignalHandler&) = delete; + SignalHandler& operator=(SignalHandler&&) = delete; + ~SignalHandler(); + + Action CheckForSignals(); + + bool GotSIGINT(); + bool GotSIGHUP(); + + Action SIGINT_action_; + Action SIGHUP_action_; + std::atomic my_sigint_count_; + std::atomic my_sighup_count_; +}; + +#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS) +class C10_API FatalSignalHandler { + // This works by setting up certain fatal signal handlers. Previous fatal + // signal handlers will still be called when the signal is raised. Defaults + // to being off. 
+ public: + C10_API void setPrintStackTracesOnFatalSignal(bool print); + C10_API bool printStackTracesOnFatalSignal(); + static FatalSignalHandler& getInstance(); + FatalSignalHandler(const FatalSignalHandler&) = delete; + FatalSignalHandler(FatalSignalHandler&&) = delete; + FatalSignalHandler& operator=(const FatalSignalHandler&) = delete; + FatalSignalHandler& operator=(FatalSignalHandler&&) = delete; + virtual ~FatalSignalHandler() = default; + + protected: + explicit FatalSignalHandler(); + + private: + void installFatalSignalHandlers(); + void uninstallFatalSignalHandlers(); + static void fatalSignalHandlerStatic(int signum); + void fatalSignalHandler(int signum); + virtual void fatalSignalHandlerPostProcess(); + struct sigaction* getPreviousSigaction(int signum); + const char* getSignalName(int signum); + void callPreviousSignalHandler( + struct sigaction* action, + int signum, + siginfo_t* info, + void* ctx); + void stacktraceSignalHandler(bool needsLock); + static void stacktraceSignalHandlerStatic( + int signum, + siginfo_t* info, + void* ctx); + void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx); + + // The mutex protects the bool. + std::mutex fatalSignalHandlersInstallationMutex; + bool fatalSignalHandlersInstalled; + // We need to hold a reference to call the previous SIGUSR2 handler in case + // we didn't signal it + struct sigaction previousSigusr2{}; + // Flag dictating whether the SIGUSR2 handler falls back to previous handlers + // or is intercepted in order to print a stack trace. + std::atomic fatalSignalReceived; + // Global state set when a fatal signal is received so that backtracing + // threads know why they're printing a stacktrace. + const char* fatalSignalName; + int fatalSignum = -1; + // This wait condition is used to wait for other threads to finish writing + // their stack trace when in fatal sig handler (we can't use pthread_join + // because there's no way to convert from a tid to a pthread_t). 
+ std::condition_variable writingCond; + std::mutex writingMutex; + // used to indicate if the other thread responded to the signal + bool signalReceived; + + struct signal_handler { + const char* name; + int signum; + struct sigaction previous; + }; + + // NOLINTNEXTLINE(*c-arrays*) + static signal_handler kSignalHandlers[]; +}; + +#endif // defined(C10_SUPPORTS_SIGNAL_HANDLER) + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint.h new file mode 100644 index 0000000000000000000000000000000000000000..4030828469d45cdbef603bbb8588071a41b9b398 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint.h @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) && \ + !(defined(TORCH_DISABLE_SDT) && TORCH_DISABLE_SDT) + +#define TORCH_HAVE_SDT 1 + +#include + +#define TORCH_SDT(name, ...) \ + TORCH_SDT_PROBE_N( \ + pytorch, name, 0, TORCH_SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__) +// Use TORCH_SDT_DEFINE_SEMAPHORE(name) to define the semaphore +// as global variable before using the TORCH_SDT_WITH_SEMAPHORE macro +#define TORCH_SDT_WITH_SEMAPHORE(name, ...) \ + TORCH_SDT_PROBE_N( \ + pytorch, name, 1, TORCH_SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__) +#define TORCH_SDT_IS_ENABLED(name) (TORCH_SDT_SEMAPHORE(pytorch, name) > 0) + +#else + +#define TORCH_HAVE_SDT 0 + +#define TORCH_SDT(name, ...) \ + do { \ + } while (0) +#define TORCH_SDT_WITH_SEMAPHORE(name, ...) 
\ + do { \ + } while (0) +#define TORCH_SDT_IS_ENABLED(name) (false) +#define TORCH_SDT_DEFINE_SEMAPHORE(name) +#define TORCH_SDT_DECLARE_SEMAPHORE(name) + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint_elfx86.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint_elfx86.h new file mode 100644 index 0000000000000000000000000000000000000000..a3afe767fee1e9cf92062b2ece5e2f0520dcb9e4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint_elfx86.h @@ -0,0 +1,149 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// clang-format off + +// Default constraint for the probe arguments as operands. +#ifndef TORCH_SDT_ARG_CONSTRAINT +#define TORCH_SDT_ARG_CONSTRAINT "nor" +#endif + +// Instruction to emit for the probe. +#define TORCH_SDT_NOP nop + +// Note section properties. +#define TORCH_SDT_NOTE_NAME "stapsdt" +#define TORCH_SDT_NOTE_TYPE 3 + +// Semaphore variables are put in this section +#define TORCH_SDT_SEMAPHORE_SECTION ".probes" + +// Size of address depending on platform. +#ifdef __LP64__ +#define TORCH_SDT_ASM_ADDR .8byte +#else +#define TORCH_SDT_ASM_ADDR .4byte +#endif + +// Assembler helper Macros. +#define TORCH_SDT_S(x) #x +#define TORCH_SDT_ASM_1(x) TORCH_SDT_S(x) "\n" +#define TORCH_SDT_ASM_2(a, b) TORCH_SDT_S(a) "," TORCH_SDT_S(b) "\n" +#define TORCH_SDT_ASM_3(a, b, c) TORCH_SDT_S(a) "," TORCH_SDT_S(b) "," \ + TORCH_SDT_S(c) "\n" +#define TORCH_SDT_ASM_STRING(x) TORCH_SDT_ASM_1(.asciz TORCH_SDT_S(x)) + +// Helper to determine the size of an argument. 
+#define TORCH_SDT_IS_ARRAY_POINTER(x) ((__builtin_classify_type(x) == 14) || \ + (__builtin_classify_type(x) == 5)) +#define TORCH_SDT_ARGSIZE(x) (TORCH_SDT_IS_ARRAY_POINTER(x) \ + ? sizeof(void*) \ + : sizeof(x)) + +// Format of each probe arguments as operand. +// Size of the argument tagged with TORCH_SDT_Sn, with "n" constraint. +// Value of the argument tagged with TORCH_SDT_An, with configured constraint. +#define TORCH_SDT_ARG(n, x) \ + [TORCH_SDT_S##n] "n" ((size_t)TORCH_SDT_ARGSIZE(x)), \ + [TORCH_SDT_A##n] TORCH_SDT_ARG_CONSTRAINT (x) + +// Templates to append arguments as operands. +#define TORCH_SDT_OPERANDS_0() [__sdt_dummy] "g" (0) +#define TORCH_SDT_OPERANDS_1(_1) TORCH_SDT_ARG(1, _1) +#define TORCH_SDT_OPERANDS_2(_1, _2) \ + TORCH_SDT_OPERANDS_1(_1), TORCH_SDT_ARG(2, _2) +#define TORCH_SDT_OPERANDS_3(_1, _2, _3) \ + TORCH_SDT_OPERANDS_2(_1, _2), TORCH_SDT_ARG(3, _3) +#define TORCH_SDT_OPERANDS_4(_1, _2, _3, _4) \ + TORCH_SDT_OPERANDS_3(_1, _2, _3), TORCH_SDT_ARG(4, _4) +#define TORCH_SDT_OPERANDS_5(_1, _2, _3, _4, _5) \ + TORCH_SDT_OPERANDS_4(_1, _2, _3, _4), TORCH_SDT_ARG(5, _5) +#define TORCH_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6) \ + TORCH_SDT_OPERANDS_5(_1, _2, _3, _4, _5), TORCH_SDT_ARG(6, _6) +#define TORCH_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7) \ + TORCH_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6), TORCH_SDT_ARG(7, _7) +#define TORCH_SDT_OPERANDS_8(_1, _2, _3, _4, _5, _6, _7, _8) \ + TORCH_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7), TORCH_SDT_ARG(8, _8) +#define TORCH_SDT_OPERANDS_9(_1, _2, _3, _4, _5, _6, _7, _8, _9) \ + TORCH_SDT_OPERANDS_8(_1, _2, _3, _4, _5, _6, _7, _8), TORCH_SDT_ARG(9, _9) + +// Templates to reference the arguments from operands in note section. 
+#define TORCH_SDT_ARGFMT(no) %n[TORCH_SDT_S##no]@%[TORCH_SDT_A##no] +#define TORCH_SDT_ARG_TEMPLATE_0 /*No arguments*/ +#define TORCH_SDT_ARG_TEMPLATE_1 TORCH_SDT_ARGFMT(1) +#define TORCH_SDT_ARG_TEMPLATE_2 TORCH_SDT_ARG_TEMPLATE_1 TORCH_SDT_ARGFMT(2) +#define TORCH_SDT_ARG_TEMPLATE_3 TORCH_SDT_ARG_TEMPLATE_2 TORCH_SDT_ARGFMT(3) +#define TORCH_SDT_ARG_TEMPLATE_4 TORCH_SDT_ARG_TEMPLATE_3 TORCH_SDT_ARGFMT(4) +#define TORCH_SDT_ARG_TEMPLATE_5 TORCH_SDT_ARG_TEMPLATE_4 TORCH_SDT_ARGFMT(5) +#define TORCH_SDT_ARG_TEMPLATE_6 TORCH_SDT_ARG_TEMPLATE_5 TORCH_SDT_ARGFMT(6) +#define TORCH_SDT_ARG_TEMPLATE_7 TORCH_SDT_ARG_TEMPLATE_6 TORCH_SDT_ARGFMT(7) +#define TORCH_SDT_ARG_TEMPLATE_8 TORCH_SDT_ARG_TEMPLATE_7 TORCH_SDT_ARGFMT(8) +#define TORCH_SDT_ARG_TEMPLATE_9 TORCH_SDT_ARG_TEMPLATE_8 TORCH_SDT_ARGFMT(9) + +// Resolvable by name macros +// An attribute that marks a function or variable as needing to be resolvable +// by name. This generally is needed if inline assembly refers to the variable +// by string name. +#ifdef __roar__ +#define TORCH_NAME_RESOLVABLE __attribute__((roar_resolvable_by_name)) +#else +#define TORCH_NAME_RESOLVABLE +#endif + +// Semaphore define, declare and probe note format + +#define TORCH_SDT_SEMAPHORE(provider, name) \ + torch_sdt_semaphore_##provider##_##name + +#define TORCH_SDT_DEFINE_SEMAPHORE(name) \ + extern "C" { \ + TORCH_NAME_RESOLVABLE \ + volatile unsigned short TORCH_SDT_SEMAPHORE(pytorch, name) \ + __attribute__((section(TORCH_SDT_SEMAPHORE_SECTION), used)) = 0; \ + } + +#define TORCH_SDT_DECLARE_SEMAPHORE(name) \ + extern "C" TORCH_NAME_RESOLVABLE volatile unsigned short \ + TORCH_SDT_SEMAPHORE(pytorch, name) + +#define TORCH_SDT_SEMAPHORE_NOTE_0(provider, name) \ + TORCH_SDT_ASM_1( TORCH_SDT_ASM_ADDR 0) /*No Semaphore*/ \ + +#define TORCH_SDT_SEMAPHORE_NOTE_1(provider, name) \ + TORCH_SDT_ASM_1(TORCH_SDT_ASM_ADDR TORCH_SDT_SEMAPHORE(provider, name)) + +// Structure of note section for the probe. 
+#define TORCH_SDT_NOTE_CONTENT(provider, name, has_semaphore, arg_template) \ + TORCH_SDT_ASM_1(990: TORCH_SDT_NOP) \ + TORCH_SDT_ASM_3( .pushsection .note.stapsdt,"","note") \ + TORCH_SDT_ASM_1( .balign 4) \ + TORCH_SDT_ASM_3( .4byte 992f-991f, 994f-993f, TORCH_SDT_NOTE_TYPE) \ + TORCH_SDT_ASM_1(991: .asciz TORCH_SDT_NOTE_NAME) \ + TORCH_SDT_ASM_1(992: .balign 4) \ + TORCH_SDT_ASM_1(993: TORCH_SDT_ASM_ADDR 990b) \ + TORCH_SDT_ASM_1( TORCH_SDT_ASM_ADDR 0) /*Reserved for Base Address*/ \ + TORCH_SDT_SEMAPHORE_NOTE_##has_semaphore(provider, name) \ + TORCH_SDT_ASM_STRING(provider) \ + TORCH_SDT_ASM_STRING(name) \ + TORCH_SDT_ASM_STRING(arg_template) \ + TORCH_SDT_ASM_1(994: .balign 4) \ + TORCH_SDT_ASM_1( .popsection) + +// Main probe Macro. +#define TORCH_SDT_PROBE(provider, name, has_semaphore, n, arglist) \ + __asm__ __volatile__ ( \ + TORCH_SDT_NOTE_CONTENT( \ + provider, name, has_semaphore, TORCH_SDT_ARG_TEMPLATE_##n) \ + :: TORCH_SDT_OPERANDS_##n arglist \ + ) \ + +// Helper Macros to handle variadic arguments. +#define TORCH_SDT_NARG_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N +#define TORCH_SDT_NARG(...) \ + TORCH_SDT_NARG_(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define TORCH_SDT_PROBE_N(provider, name, has_semaphore, N, ...) \ + TORCH_SDT_PROBE(provider, name, has_semaphore, N, (__VA_ARGS__)) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strides.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strides.h new file mode 100644 index 0000000000000000000000000000000000000000..1e74cffc5e6338d234846bac166d5fcac7db63b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strides.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace c10 { + +// Computes the contiguous strides of a tensor, given its sizes. +inline DimVector contiguous_strides(const IntArrayRef sizes) { + using Int = IntArrayRef::value_type; + const Int dims = static_cast(sizes.size()); + + // With this initialisation we get the case dim == 0 or 1 right + DimVector strides(dims, 1); + + for (auto i = dims - 2; i >= 0; --i) { + // Strides can't be 0 even if sizes are 0. + strides[i] = strides[i + 1] * std::max(sizes[i + 1], Int{1}); + } + + return strides; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..cbcf0b1f3c95d2e0e572ae58b6e066efc893f582 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_utils.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#if !defined(FBCODE_CAFFE2) && !defined(C10_NO_DEPRECATED) + +namespace c10 { + +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::stod; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::stoi; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::stoll; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::stoull; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::to_string; + +} // namespace c10 + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_view.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_view.h new file mode 100644 index 0000000000000000000000000000000000000000..559cde09f9c35071293f0ed62d481ea7f6940710 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_view.h @@ -0,0 +1,648 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace c10 { + +/** + * Port of std::string_view with methods from C++20. 
+ * Implemented following the interface definition in + * https://en.cppreference.com/w/cpp/string/basic_string_view + * See there for the API documentation. + * + * Difference: We don't have a Traits template parameter because + * std::char_traits isn't constexpr and we'd have to reimplement + * std::char_traits if we wanted to use it with our constexpr basic_string_view. + */ +template +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +class basic_string_view final { + public: + using value_type = CharT; + using pointer = CharT*; + using const_pointer = const CharT*; + using reference = CharT&; + using const_reference = const CharT&; + using const_iterator = const CharT*; + using iterator = const_iterator; + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = const_reverse_iterator; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + static constexpr size_type npos = size_type(-1); + + constexpr basic_string_view() noexcept : begin_(nullptr) {} + + explicit constexpr basic_string_view(const_pointer str, size_type count) + : begin_(str), size_(count) {} + + /* implicit */ constexpr basic_string_view(const_pointer str) + : basic_string_view(str, strlen_(str)) {} + + /* implicit */ basic_string_view(const ::std::basic_string& str) + : basic_string_view(str.data(), str.size()) {} + + /* implicit */ constexpr basic_string_view( + const ::std::basic_string_view& str) + : basic_string_view(str.data(), str.size()) {} + + constexpr basic_string_view(const basic_string_view&) noexcept = default; + + constexpr basic_string_view& operator=( + const basic_string_view& rhs) noexcept = default; + + constexpr operator ::std::basic_string_view() const { + return ::std::basic_string_view(data(), size()); + } + + explicit operator ::std::basic_string() const { + return ::std::basic_string(data(), size()); + } + + constexpr const_iterator begin() const noexcept { + return cbegin(); + } + + constexpr const_iterator 
cbegin() const noexcept { + return begin_; + } + + constexpr const_iterator end() const noexcept { + return cend(); + } + + constexpr const_iterator cend() const noexcept { + return begin_ + size_; + } + + constexpr const_reverse_iterator rbegin() const noexcept { + return crbegin(); + } + + constexpr const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(this->end()); + } + + constexpr const_reverse_iterator rend() const noexcept { + return crend(); + } + + constexpr const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(this->begin()); + } + + friend constexpr const_iterator begin(basic_string_view sv) noexcept { + return sv.begin(); + } + + friend constexpr const_iterator end(basic_string_view sv) noexcept { + return sv.end(); + } + + constexpr const_reference operator[](size_type pos) const { + // TODO: split out + return at_(pos); + } + + constexpr const_reference at(size_type pos) const { +#if !defined( \ + __CUDA_ARCH__) // CUDA doesn't like std::out_of_range in device code + return C10_UNLIKELY(pos >= size_) + ? (throw std::out_of_range( + "string_view::operator[] or string_view::at() out of range. Index: " + + std::to_string(pos) + ", size: " + std::to_string(size())), + at_(0)) + : at_(pos); +#else + return at_(pos); +#endif + } + + constexpr const_reference front() const { + return *begin_; + } + + constexpr const_reference back() const { + return *(begin_ + size_ - 1); + } + + constexpr const_pointer data() const noexcept { + return begin_; + } + + constexpr size_type size() const noexcept { + return size_; + } + + constexpr size_type length() const noexcept { + return size(); + } + + constexpr size_type max_size() const noexcept { + return std::numeric_limits::max(); + } + + [[nodiscard]] constexpr bool empty() const noexcept { + return size() == 0; + } + + constexpr void remove_prefix(size_type n) { + if (n > size()) { + throw std::out_of_range( + "basic_string_view::remove_prefix: out of range. 
PrefixLength: " + + std::to_string(n) + ", size: " + std::to_string(size())); + } + begin_ += n; + size_ -= n; + } + + constexpr void remove_suffix(size_type n) { + if (n > size()) { + throw std::out_of_range( + "basic_string_view::remove_suffix: out of range. SuffixLength: " + + std::to_string(n) + ", size: " + std::to_string(size())); + } + size_ -= n; + } + + constexpr void swap(basic_string_view& sv) noexcept { + auto tmp = *this; + *this = sv; + sv = tmp; + } + + size_type copy(pointer dest, size_type count, size_type pos = 0) const { + if (pos > size_) { + throw std::out_of_range( + "basic_string_view::copy: out of range. Index: " + + std::to_string(pos) + ", size: " + std::to_string(size())); + } + size_type copy_length = std::min(count, size_ - pos); + for (auto iter = begin() + pos, end = iter + copy_length; iter != end;) { + *(dest++) = *(iter++); + } + return copy_length; + } + + constexpr basic_string_view substr(size_type pos = 0, size_type count = npos) + const { +#if !defined( \ + __CUDA_ARCH__) // CUDA doesn't like std::out_of_range in device code + return (pos > size_) + ? (throw std::out_of_range( + "basic_string_view::substr parameter out of bounds. Index: " + + std::to_string(pos) + ", size: " + std::to_string(size())), + substr_()) + : substr_(pos, count); +#else + return substr_(pos, count); +#endif + } + + constexpr int compare(basic_string_view rhs) const noexcept { + // Write it iteratively. This is faster. 
+ for (size_t i = 0, end = std::min(size(), rhs.size()); i < end; ++i) { + if (at_(i) < rhs.at_(i)) { + return -1; + } else if (at_(i) > rhs.at_(i)) { + return 1; + } + } + if (size() < rhs.size()) { + return -1; + } else if (size() > rhs.size()) { + return 1; + } + return 0; + } + + constexpr int compare(size_type pos1, size_type count1, basic_string_view v) + const { + return substr(pos1, count1).compare(v); + } + + constexpr int compare( + size_type pos1, + size_type count1, + basic_string_view v, + size_type pos2, + size_type count2) const { + return substr(pos1, count1).compare(v.substr(pos2, count2)); + } + + constexpr int compare(const_pointer s) const { + return compare(basic_string_view(s)); + } + + constexpr int compare(size_type pos1, size_type count1, const_pointer s) + const { + return substr(pos1, count1).compare(basic_string_view(s)); + } + + constexpr int compare( + size_type pos1, + size_type count1, + const_pointer s, + size_type count2) const { + return substr(pos1, count1).compare(basic_string_view(s, count2)); + } + + friend constexpr bool operator==( + basic_string_view lhs, + basic_string_view rhs) noexcept { + return lhs.equals_(rhs); + } + + friend constexpr bool operator!=( + basic_string_view lhs, + basic_string_view rhs) noexcept { + return !(lhs == rhs); + } + + friend constexpr bool operator<( + basic_string_view lhs, + basic_string_view rhs) noexcept { + return lhs.compare(rhs) < 0; + } + + friend constexpr bool operator>=( + basic_string_view lhs, + basic_string_view rhs) noexcept { + return !(lhs < rhs); + } + + friend constexpr bool operator>( + basic_string_view lhs, + basic_string_view rhs) noexcept { + return rhs < lhs; + } + + friend constexpr bool operator<=( + basic_string_view lhs, + basic_string_view rhs) noexcept { + return !(lhs > rhs); + } + + constexpr bool starts_with(basic_string_view prefix) const noexcept { + return (prefix.size() > size()) ? 
false + : prefix.equals_(substr_(0, prefix.size())); + } + + constexpr bool starts_with(CharT prefix) const noexcept { + return !empty() && prefix == front(); + } + + constexpr bool starts_with(const_pointer prefix) const { + return starts_with(basic_string_view(prefix)); + } + + constexpr bool ends_with(basic_string_view suffix) const noexcept { + return (suffix.size() > size()) + ? false + : suffix.equals_(substr_(size() - suffix.size(), suffix.size())); + } + + constexpr bool ends_with(CharT suffix) const noexcept { + return !empty() && suffix == back(); + } + + constexpr bool ends_with(const_pointer suffix) const { + return ends_with(basic_string_view(suffix)); + } + + constexpr size_type find(basic_string_view v, size_type pos = 0) + const noexcept { + if (v.empty()) { + return pos <= size() ? pos : npos; + } + + if (pos + v.size() <= size()) { + for (size_type cur = pos, end = size() - v.size(); cur <= end; ++cur) { + if (v.at_(0) == at_(cur) && + v.substr_(1).equals_(substr_(cur + 1, v.size() - 1))) { + return cur; + } + } + } + return npos; + } + + constexpr size_type find(CharT ch, size_type pos = 0) const noexcept { + return find_first_if_(pos, charIsEqual_{ch}); + } + + constexpr size_type find(const_pointer s, size_type pos, size_type count) + const { + return find(basic_string_view(s, count), pos); + } + + constexpr size_type find(const_pointer s, size_type pos = 0) const { + return find(basic_string_view(s), pos); + } + + constexpr size_type rfind(basic_string_view v, size_type pos = npos) + const noexcept { + // Write it iteratively. This is faster. + if (v.empty()) { + return pos <= size() ? 
pos : size(); + } + + if (v.size() <= size()) { + pos = std::min(size() - v.size(), pos); + do { + if (v.at_(0) == at_(pos) && + v.substr_(1).equals_(substr_(pos + 1, v.size() - 1))) { + return pos; + } + } while (pos-- > 0); + } + return npos; + } + + constexpr size_type rfind(CharT ch, size_type pos = npos) const noexcept { + return find_last_if_(pos, charIsEqual_{ch}); + } + + constexpr size_type rfind(const_pointer s, size_type pos, size_type count) + const { + return rfind(basic_string_view(s, count), pos); + } + + constexpr size_type rfind(const_pointer s, size_type pos = npos) const { + return rfind(basic_string_view(s), pos); + } + + constexpr size_type find_first_of(basic_string_view v, size_type pos = 0) + const noexcept { + return find_first_if_(pos, stringViewContainsChar_{v}); + } + + constexpr size_type find_first_of(CharT ch, size_type pos = 0) + const noexcept { + return find_first_if_(pos, charIsEqual_{ch}); + } + + constexpr size_type find_first_of( + const_pointer s, + size_type pos, + size_type count) const { + return find_first_of(basic_string_view(s, count), pos); + } + + constexpr size_type find_first_of(const_pointer s, size_type pos = 0) const { + return find_first_of(basic_string_view(s), pos); + } + + constexpr size_type find_last_of(basic_string_view v, size_type pos = npos) + const noexcept { + return find_last_if_(pos, stringViewContainsChar_{v}); + } + + constexpr size_type find_last_of(CharT ch, size_type pos = npos) + const noexcept { + return find_last_if_(pos, charIsEqual_{ch}); + } + + constexpr size_type find_last_of( + const_pointer s, + size_type pos, + size_type count) const { + return find_last_of(basic_string_view(s, count), pos); + } + + constexpr size_type find_last_of(const_pointer s, size_type pos = npos) + const { + return find_last_of(basic_string_view(s), pos); + } + + constexpr size_type find_first_not_of(basic_string_view v, size_type pos = 0) + const noexcept { + return find_first_if_(pos, 
stringViewDoesNotContainChar_{v}); + } + + constexpr size_type find_first_not_of(CharT ch, size_type pos = 0) + const noexcept { + return find_first_if_(pos, charIsNotEqual_{ch}); + } + + constexpr size_type find_first_not_of( + const_pointer s, + size_type pos, + size_type count) const { + return find_first_not_of(basic_string_view(s, count), pos); + } + + constexpr size_type find_first_not_of(const_pointer s, size_type pos = 0) + const { + return find_first_not_of(basic_string_view(s), pos); + } + + constexpr size_type find_last_not_of( + basic_string_view v, + size_type pos = npos) const noexcept { + return find_last_if_(pos, stringViewDoesNotContainChar_{v}); + } + + constexpr size_type find_last_not_of(CharT ch, size_type pos = npos) + const noexcept { + return find_last_if_(pos, charIsNotEqual_{ch}); + } + + constexpr size_type find_last_not_of( + const_pointer s, + size_type pos, + size_type count) const { + return find_last_not_of(basic_string_view(s, count), pos); + } + + constexpr size_type find_last_not_of(const_pointer s, size_type pos = npos) + const { + return find_last_not_of(basic_string_view(s), pos); + } + + private: + static constexpr size_type strlen_(const_pointer str) noexcept { + const_pointer current = str; + while (*current != '\0') { + ++current; + } + return current - str; + } + + constexpr const_reference at_(size_type pos) const noexcept { + return *(begin_ + pos); + } + + constexpr basic_string_view substr_(size_type pos = 0, size_type count = npos) + const { + return basic_string_view{begin_ + pos, std::min(count, size() - pos)}; + } + + template + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + constexpr size_type find_first_if_(size_type pos, Condition&& condition) + const noexcept { + if (pos + 1 <= size()) { + for (size_type cur = pos; cur < size(); ++cur) { + if (condition(at_(cur))) { + return cur; + } + } + } + return npos; + } + + template + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + constexpr 
size_type find_last_if_(size_type pos, Condition&& condition) + const noexcept { + // Write it iteratively. This is faster. + if (!empty()) { + pos = std::min(size() - 1, pos); + do { + if (condition(at_(pos))) { + return pos; + } + } while (pos-- > 0); + } + return npos; + } + + constexpr bool equals_(basic_string_view rhs) const { + // We don't use string_view::compare() here but implement it manually + // because only looking at equality allows for more optimized code. +#if defined(__GNUC__) && !defined(__CUDACC__) + return size() == rhs.size() && + 0 == __builtin_memcmp(data(), rhs.data(), size()); +#else + if (size() != rhs.size()) { + return false; + } + // Yes, memcmp would be laster than this loop, but memcmp isn't constexpr + // and I didn't feel like implementing a constexpr memcmp variant. + // TODO At some point this should probably be done, including tricks + // like comparing one machine word instead of a byte per iteration. + for (typename basic_string_view::size_type pos = 0; pos < size(); + ++pos) { + if (at_(pos) != rhs.at_(pos)) { + return false; + } + } + return true; +#endif + } + + struct charIsEqual_ final { + CharT expected; + constexpr bool operator()(CharT actual) const noexcept { + return expected == actual; + } + }; + + struct charIsNotEqual_ final { + CharT expected; + constexpr bool operator()(CharT actual) const noexcept { + return expected != actual; + } + }; + + struct stringViewContainsChar_ final { + basic_string_view expected; + constexpr bool operator()(CharT ch) const noexcept { + return npos != expected.find(ch); + } + }; + + struct stringViewDoesNotContainChar_ final { + basic_string_view expected; + constexpr bool operator()(CharT ch) const noexcept { + return npos == expected.find(ch); + } + }; + + const_pointer begin_; + size_type size_{}; +}; + +template +inline std::basic_ostream& operator<<( + std::basic_ostream& stream, + basic_string_view sv) { + // The rules for operator<< are quite complex, so lets defer to the + // 
STL implementation. + using std_string_type = ::std::basic_string_view; + return stream << std_string_type(sv.data(), sv.size()); +} + +template +constexpr inline void swap( + basic_string_view& lhs, + basic_string_view& rhs) noexcept { + lhs.swap(rhs); +} +using string_view = std::string_view; +using c10_string_view = basic_string_view; + +// NOTE: In C++20, this function should be replaced by string_view.starts_with +constexpr bool starts_with( + const std::string_view s, + const std::string_view prefix) noexcept { + return (prefix.size() > s.size()) ? false + : prefix == s.substr(0, prefix.size()); +} + +// NOTE: In C++20, this function should be replaced by string_view.starts_with +constexpr bool starts_with( + const std::string_view s, + const char prefix) noexcept { + return !s.empty() && prefix == s.front(); +} + +// NOTE: In C++20, this function should be replaced by string_view.ends_with +constexpr bool ends_with( + const std::string_view s, + const std::string_view suffix) noexcept { + return (suffix.size() > s.size()) + ? false + : suffix == s.substr(s.size() - suffix.size(), suffix.size()); +} + +// NOTE: In C++20, this function should be replaced by string_view.ends_with +constexpr bool ends_with(const std::string_view s, const char prefix) noexcept { + return !s.empty() && prefix == s.back(); +} + +} // namespace c10 + +namespace std { +template +struct hash<::c10::basic_string_view> { + size_t operator()(::c10::basic_string_view x) const { + // The standard says that std::string_view hashing must do the same as + // std::string hashing but leaves the details of std::string hashing + // up to the implementer. So, to be conformant, we need to reuse and + // existing STL type's hash function. The std::string fallback is probably + // slow but the only way to be conformant. 
+ + using std_string_type = ::std::basic_string_view; + return ::std::hash{}(std_string_type(x.data(), x.size())); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strong_type.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strong_type.h new file mode 100644 index 0000000000000000000000000000000000000000..4e3d1a431c19958786bba8245d56bb12854fd5e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strong_type.h @@ -0,0 +1,1669 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * strong_type C++14/17/20 strong typedef library + * + * Copyright (C) Björn Fahller + * + * Use, modification and distribution is subject to the + * Boost Software License, Version 1.0. 
(See accompanying + * file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + * + * Project home: https://github.com/rollbear/strong_type + */ + +#ifndef ROLLBEAR_STRONG_TYPE_HPP_INCLUDED +#define ROLLBEAR_STRONG_TYPE_HPP_INCLUDED + +#include +#include +#include +#include +#include + +#ifndef STRONG_HAS_STD_FORMAT +#define STRONG_HAS_STD_FORMAT 0 +#endif + +#ifndef STRONG_HAS_FMT_FORMAT +#define STRONG_HAS_FMT_FORMAT 0 +#endif + +#if STRONG_HAS_STD_FORMAT +#include +#if !defined(__cpp_lib_format) || __cpp_lib_format < 201907 +#undef STRONG_HAS_STD_FORMAT +#define STRONG_HAS_STD_FORMAT 0 +#endif +#endif + +#if STRONG_HAS_FMT_FORMAT +#include +#endif + +namespace strong +{ + +namespace impl +{ + template + using WhenConstructible = std::enable_if_t>; +} + +template +using modifier = typename M::template modifier; + +struct uninitialized_t {}; +static constexpr uninitialized_t uninitialized{}; + +struct default_constructible +{ + template + class modifier + { + }; +}; + +namespace impl { + template + constexpr bool supports_default_construction(const ::strong::default_constructible::modifier* /*unused*/) + { + return true; + } +} + +template +class type : public modifier>... +{ +public: + template {}>> + explicit type(uninitialized_t /*unused*/) + noexcept + { + } + template (nullptr))> + constexpr + type() + noexcept(noexcept(T{})) + : val{} + { + } + + template >> + constexpr + explicit + type( + std::initializer_list us + ) + noexcept(noexcept(T{us})) + : val{us} + { + } + template && (sizeof...(U) > 0)>> + constexpr + explicit + type( + U&& ... u) + noexcept(std::is_nothrow_constructible_v) + : val(std::forward(u)...) 
+ {} + + friend constexpr void swap(type& a, type& b) noexcept( + std::is_nothrow_move_constructible_v && + std::is_nothrow_move_assignable_v + ) + { + using std::swap; + swap(a.val, b.val); + } + + [[nodiscard]] + constexpr T& value_of() & noexcept { return val;} + [[nodiscard]] + constexpr const T& value_of() const & noexcept { return val;} + [[nodiscard]] + constexpr T&& value_of() && noexcept { return std::move(val);} + + [[nodiscard]] + friend constexpr T& value_of(type& t) noexcept { return t.val;} + [[nodiscard]] + friend constexpr const T& value_of(const type& t) noexcept { return t.val;} + [[nodiscard]] + friend constexpr T&& value_of(type&& t) noexcept { return std::move(t).val;} +private: + T val; +}; + +namespace impl { + template + constexpr bool is_strong_type_func(const strong::type* /*unused*/) { return true;} + constexpr bool is_strong_type_func(...) { return false;} + template + constexpr T underlying_type(strong::type*); + +} + +template +struct is_strong_type : std::integral_constant(nullptr))> {}; + +namespace impl { + template + using WhenStrongType = std::enable_if_t>::value>; + template + using WhenNotStrongType = std::enable_if_t>::value>; +} + +template ::value> +struct underlying_type +{ + using type = decltype(impl::underlying_type(static_cast(nullptr))); +}; + +template +struct underlying_type +{ + using type = T; +}; + +template +using underlying_type_t = typename underlying_type::type; + + +namespace impl { + template< + typename T, + typename = impl::WhenNotStrongType> + constexpr + T && + access(T &&t) + noexcept { + return std::forward(t); + } + template < + typename T, + typename = impl::WhenStrongType> + [[nodiscard]] + constexpr + auto + access(T&& t) + noexcept + -> decltype(value_of(std::forward(t))) + { + return value_of(std::forward(t)); + } + +} +struct equality +{ + template + class modifier; +}; + + +template +class equality::modifier<::strong::type> +{ + using type = ::strong::type; +public: + [[nodiscard]] + friend + 
constexpr + auto + operator==( + const type& lh, + const type& rh) + noexcept(noexcept(std::declval() == std::declval())) + -> decltype(std::declval() == std::declval()) + { + return value_of(lh) == value_of(rh); + } + + [[nodiscard]] + friend + constexpr + auto + operator!=( + const type& lh, + const type& rh) + noexcept(noexcept(std::declval() != std::declval())) + -> decltype(std::declval() != std::declval()) + { + return value_of(lh) != value_of(rh); + } +}; + +namespace impl +{ + template + class typed_equality + { + private: + using TT = underlying_type_t; + using OT = underlying_type_t; + public: + [[nodiscard]] + friend + constexpr + auto operator==(const T& lh, const Other& rh) + noexcept(noexcept(std::declval() == std::declval())) + -> decltype(std::declval() == std::declval()) + { + return value_of(lh) == impl::access(rh); + } + [[nodiscard]] + friend + constexpr + auto operator==(const Other& lh, const T& rh) + noexcept(noexcept(std::declval() == std::declval())) + -> decltype(std::declval() == std::declval()) + { + return impl::access(lh) == value_of(rh) ; + } + [[nodiscard]] + friend + constexpr + auto operator!=(const T& lh, const Other rh) + noexcept(noexcept(std::declval() != std::declval())) + -> decltype(std::declval() != std::declval()) + { + return value_of(lh) != impl::access(rh); + } + [[nodiscard]] + friend + constexpr + auto operator!=(const Other& lh, const T& rh) + noexcept(noexcept(std::declval() != std::declval())) + -> decltype(std::declval() != std::declval()) + { + return impl::access(lh) != value_of(rh) ; + } + }; +} +template +struct equality_with +{ + template + class modifier : public impl::typed_equality... 
+ { + }; +}; + +namespace impl +{ + template + class typed_ordering + { + private: + using TT = underlying_type_t; + using OT = underlying_type_t; + public: + [[nodiscard]] + friend + constexpr + auto operator<(const T& lh, const Other& rh) + noexcept(noexcept(std::declval() < std::declval())) + -> decltype(std::declval() < std::declval()) + { + return value_of(lh) < impl::access(rh); + } + [[nodiscard]] + friend + constexpr + auto operator<(const Other& lh, const T& rh) + noexcept(noexcept(std::declval() < std::declval())) + -> decltype(std::declval() < std::declval()) + { + return impl::access(lh) < value_of(rh) ; + } + + [[nodiscard]] + friend + constexpr + auto operator<=(const T& lh, const Other& rh) + noexcept(noexcept(std::declval() <= std::declval())) + -> decltype(std::declval() <= std::declval()) + { + return value_of(lh) <= impl::access(rh); + } + [[nodiscard]] + friend + constexpr + auto operator<=(const Other& lh, const T& rh) + noexcept(noexcept(std::declval() <= std::declval())) + -> decltype(std::declval() <= std::declval()) + { + return impl::access(lh) <= value_of(rh) ; + } + + [[nodiscard]] + friend + constexpr + auto operator>(const T& lh, const Other& rh) + noexcept(noexcept(std::declval() > std::declval())) + -> decltype(std::declval() > std::declval()) + { + return value_of(lh) > impl::access(rh); + } + [[nodiscard]] + friend + constexpr + auto operator>(const Other& lh, const T& rh) + noexcept(noexcept(std::declval() > std::declval())) + -> decltype(std::declval() > std::declval()) + { + return impl::access(lh) > value_of(rh) ; + } + + [[nodiscard]] + friend + constexpr + auto operator>=(const T& lh, const Other& rh) + noexcept(noexcept(std::declval() >= std::declval())) + -> decltype(std::declval() >= std::declval()) + { + return value_of(lh) >= impl::access(rh); + } + [[nodiscard]] + friend + constexpr + auto operator>=(const Other& lh, const T& rh) + noexcept(noexcept(std::declval() >= std::declval())) + -> decltype(std::declval() >= 
std::declval()) + { + return impl::access(lh) >= value_of(rh) ; + } + }; +} + +template +struct ordered_with +{ + template + class modifier : public impl::typed_ordering... + { + }; +}; + +namespace impl +{ + template + struct require_copy_constructible + { + static constexpr bool value = std::is_copy_constructible>::value; + static_assert(value, "underlying type must be copy constructible"); + }; + template + struct require_move_constructible + { + static constexpr bool value = std::is_move_constructible>::value; + static_assert(value, "underlying type must be move constructible"); + }; + template + struct require_copy_assignable + { + static constexpr bool value = std::is_copy_assignable>::value; + static_assert(value, "underlying type must be copy assignable"); + }; + template + struct require_move_assignable + { + static constexpr bool value = std::is_move_assignable>::value; + static_assert(value, "underlying type must be move assignable"); + }; + + template struct valid_type; + template <> + struct valid_type {}; + + template + struct require_semiregular + : valid_type::value && + require_move_constructible::value && + require_copy_assignable::value && + require_move_assignable::value> + { + }; + +} +struct semiregular +{ + template + class modifier; +}; + +template +class semiregular::modifier<::strong::type> + : public default_constructible::modifier + , private impl::require_semiregular +{ +}; + +struct regular +{ + template + class modifier + : public semiregular::modifier + , public equality::modifier + { + }; +}; + +struct unique +{ + template + class modifier + : private impl::valid_type< + impl::require_move_constructible::value && + impl::require_move_assignable::value + > + { + public: + constexpr modifier() = default; + modifier(const modifier&) = delete; + constexpr modifier(modifier&&) = default; + modifier& operator=(const modifier&) = delete; + constexpr modifier& operator=(modifier&&) = default; + }; +}; +struct ordered +{ + template + class 
modifier; +}; + + +template +class ordered::modifier<::strong::type> +{ + using type = ::strong::type; +public: + [[nodiscard]] + friend + constexpr + auto + operator<( + const type& lh, + const type& rh) + noexcept(noexcept(std::declval() < std::declval())) + -> decltype(std::declval() < std::declval()) + { + return value_of(lh) < value_of(rh); + } + + [[nodiscard]] + friend + constexpr + auto + operator<=( + const type& lh, + const type& rh) + noexcept(noexcept(std::declval() <= std::declval())) + -> decltype(std::declval() <= std::declval()) + { + return value_of(lh) <= value_of(rh); + } + + [[nodiscard]] + friend + constexpr + auto + operator>( + const type& lh, + const type& rh) + noexcept(noexcept(std::declval() > std::declval())) + -> decltype(std::declval() > std::declval()) + { + return value_of(lh) > value_of(rh); + } + + [[nodiscard]] + friend + constexpr + + auto + operator>=( + const type& lh, + const type& rh) + noexcept(noexcept(std::declval() >= std::declval())) + -> decltype(std::declval() >= std::declval()) + { + return value_of(lh) >= value_of(rh); + } +}; + +struct ostreamable +{ + template + class modifier + { + public: + friend + std::ostream& + operator<<( + std::ostream &os, + const T &t) + { + return os << value_of(t); + } + }; +}; + +struct istreamable +{ + template + class modifier + { + public: + friend + std::istream& + operator>>( + std::istream &is, + T &t) + { + return is >> value_of(t); + } + }; +}; + +struct iostreamable +{ + template + class modifier + : public ostreamable::modifier + , public istreamable::modifier + { + }; +}; + +struct incrementable +{ + template + class modifier + { + public: + friend + constexpr + T& + operator++(T& t) + noexcept(noexcept(++std::declval().value_of())) + { + ++value_of(t); + return t; + } + + friend + constexpr + T + operator++(T& t, int) + { + auto copy = t; + ++t; + return copy; + } + }; +}; + +struct decrementable +{ + template + class modifier + { + public: + friend + constexpr + T& + 
operator--(T& t) + noexcept(noexcept(--std::declval().value_of())) + { + --value_of(t); + return t; + } + + friend + constexpr + T + operator--(T& t, int) + { + auto copy = t; + --t; + return copy; + } + }; +}; + +struct bicrementable +{ + template + class modifier + : public incrementable::modifier + , public decrementable::modifier + { + }; +}; + +struct boolean +{ + template + class modifier + { + public: + explicit constexpr operator bool() const + noexcept(noexcept(static_cast(value_of(std::declval())))) + { + const auto& self = static_cast(*this); + return static_cast(value_of(self)); + } + }; +}; + +struct hashable +{ + template + class modifier{}; +}; + +struct difference +{ + template + class modifier; +}; + +template +class difference::modifier<::strong::type> +: public ordered::modifier<::strong::type> +, public equality::modifier<::strong::type> +{ + using type = ::strong::type; +public: + friend + constexpr + type& operator+=(type& lh, const type& rh) + noexcept(noexcept(value_of(lh) += value_of(rh))) + { + value_of(lh) += value_of(rh); + return lh; + } + + friend + constexpr + type& operator-=(type& lh, const type& rh) + noexcept(noexcept(value_of(lh) -= value_of(rh))) + { + value_of(lh) -= value_of(rh); + return lh; + } + + friend + constexpr + type& operator*=(type& lh, const T& rh) + noexcept(noexcept(value_of(lh) *= rh)) + { + value_of(lh) *= rh; + return lh; + } + + friend + constexpr + type& operator/=(type& lh, const T& rh) + noexcept(noexcept(value_of(lh) /= rh)) + { + value_of(lh) /= rh; + return lh; + } + + template ()%= std::declval())> + friend + constexpr + type& operator%=(type& lh, const T& rh) + noexcept(noexcept(value_of(lh) %= rh)) + { + value_of(lh)%= rh; + return lh; + } + + friend + constexpr + type operator+(type lh, const type& rh) + { + lh += rh; + return lh; + } + + friend + constexpr + type operator-(type lh, const type& rh) + { + lh -= rh; + return lh; + } + + friend + constexpr + type operator*(type lh, const T& rh) + { + 
lh *= rh; + return lh; + } + + friend + constexpr + type operator*(const T& lh, type rh) + { + rh *= lh; + return rh; + } + + friend + constexpr + type operator/(type lh, const T& rh) + { + lh /= rh; + return lh; + } + + friend + constexpr + T operator/(const type& lh, const type& rh) + { + return value_of(lh) / value_of(rh); + } + + template () %= std::declval())> + friend + constexpr + type operator%(type lh, const T& rh) + noexcept(noexcept(lh%= rh)) + { + lh %= rh; + return lh; + } + + template () % std::declval())> + friend + constexpr + T operator%(type lh, type rh) + noexcept(noexcept(value_of(lh) % value_of(rh))) + { + return value_of(lh) % value_of(rh); + } +}; + +template +struct affine_point +{ + template + class modifier; +}; + +namespace impl +{ + template + using void_t = void; + + template + struct subtractable : std::false_type {}; + + template + struct subtractable() - std::declval())>> + : std::true_type {}; +} + + +template +template +class affine_point::modifier<::strong::type> +{ + using type = ::strong::type; + static_assert(impl::subtractable::value, "it must be possible to subtract instances of your underlying type"); + using base_diff_type = decltype(std::declval() - std::declval()); +public: + using difference = std::conditional_t{}, strong::type, D>; + static_assert(std::is_constructible_v, ""); + [[nodiscard]] + friend + constexpr + difference + operator-( + const type& lh, + const type& rh) + { + return difference(value_of(lh) - value_of(rh)); + } + + friend + constexpr + type& + operator+=( + type& lh, + const difference& d) + noexcept(noexcept(value_of(lh) += impl::access(d))) + { + value_of(lh) += impl::access(d); + return lh; + } + + friend + constexpr + type& + operator-=( + type& lh, + const difference& d) + noexcept(noexcept(value_of(lh) -= impl::access(d))) + { + value_of(lh) -= impl::access(d); + return lh; + } + + [[nodiscard]] + friend + constexpr + type + operator+( + type lh, + const difference& d) + { + return lh += d; + } 
+ + [[nodiscard]] + friend + constexpr + type + operator+( + const difference& d, + type rh) + { + return rh+= d; + } + + [[nodiscard]] + friend + constexpr + type + operator-( + type lh, + const difference& d) + { + return lh -= d; + } +}; + + +struct pointer +{ + template + class modifier; +}; + +template +class pointer::modifier<::strong::type> +{ + using type = strong::type; +public: + template + [[nodiscard]] + friend + constexpr + auto + operator==( + const type& t, + std::nullptr_t) + noexcept(noexcept(std::declval() == nullptr)) + -> decltype(std::declval() == nullptr) + { + return value_of(t) == nullptr; + } + + template + [[nodiscard]] + friend + constexpr + auto + operator==( + std::nullptr_t, + const type& t) + noexcept(noexcept(nullptr == std::declval())) + -> decltype(nullptr == std::declval()) + { + return value_of(t) == nullptr; + } + + template + [[nodiscard]] + friend + constexpr + auto + operator!=( + const type& t, + std::nullptr_t) + noexcept(noexcept(std::declval() != nullptr)) + -> decltype(std::declval() != nullptr) + { + return value_of(t) != nullptr; + } + + template + [[nodiscard]] + friend + constexpr + auto + operator!=( + std::nullptr_t, + const type& t) + noexcept(noexcept(nullptr != std::declval())) + -> decltype(nullptr != std::declval()) + { + return value_of(t) != nullptr; + } + + [[nodiscard]] + constexpr + decltype(*std::declval()) + operator*() + const + { + auto& self = static_cast(*this); + return *value_of(self); + } + + [[nodiscard]] + constexpr + decltype(&(*std::declval())) operator->() const { return &operator*();} +}; + +struct arithmetic +{ + template + class modifier + { + public: + [[nodiscard]] + friend + constexpr + T + operator-( + const T &lh) + { + return T{-value_of(lh)}; + } + + friend + constexpr + T& + operator+=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) += value_of(rh))) + { + value_of(lh) += value_of(rh); + return lh; + } + + friend + constexpr + T& + operator-=( + T &lh, + const T &rh) + 
noexcept(noexcept(value_of(lh) -= value_of(rh))) + { + value_of(lh) -= value_of(rh); + return lh; + } + + friend + constexpr + T& + operator*=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) *= value_of(rh))) + { + value_of(lh) *= value_of(rh); + return lh; + } + + friend + constexpr + T& + operator/=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) /= value_of(rh))) + { + value_of(lh) /= value_of(rh); + return lh; + } + + template ()) % value_of(std::declval()))> + friend + constexpr + T& + operator%=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) %= value_of(rh))) + { + value_of(lh) %= value_of(rh); + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator+( + T lh, + const T &rh) + { + lh += rh; + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator-( + T lh, + const T &rh) + { + lh -= rh; + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator*( + T lh, + const T &rh) + { + lh *= rh; + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator/( + T lh, + const T &rh) + { + lh /= rh; + return lh; + } + + template ()) % value_of(std::declval()))> + [[nodiscard]] + friend + constexpr + T + operator%( + T lh, + const T &rh) + { + lh %= rh; + return lh; + } + + }; +}; + + +struct bitarithmetic +{ + template + class modifier + { + public: + friend + constexpr + T& + operator&=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) &= value_of(rh))) + { + value_of(lh) &= value_of(rh); + return lh; + } + + friend + constexpr + T& + operator|=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) |= value_of(rh))) + { + value_of(lh) |= value_of(rh); + return lh; + } + + friend + constexpr + T& + operator^=( + T &lh, + const T &rh) + noexcept(noexcept(value_of(lh) ^= value_of(rh))) + { + value_of(lh) ^= value_of(rh); + return lh; + } + + template + friend + constexpr + T& + operator<<=( + T &lh, + C c) + noexcept(noexcept(value_of(lh) <<= c)) + { + value_of(lh) <<= c; + return lh; 
+ } + + template + friend + constexpr + T& + operator>>=( + T &lh, + C c) + noexcept(noexcept(value_of(lh) >>= c)) + { + value_of(lh) >>= c; + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator~( + const T &lh) + { + auto v = value_of(lh); + v = ~v; + return T(v); + } + + [[nodiscard]] + friend + constexpr + T + operator&( + T lh, + const T &rh) + { + lh &= rh; + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator|( + T lh, + const T &rh) + { + lh |= rh; + return lh; + } + + [[nodiscard]] + friend + constexpr + T + operator^( + T lh, + const T &rh) + { + lh ^= rh; + return lh; + } + + template + [[nodiscard]] + friend + constexpr + T + operator<<( + T lh, + C c) + { + lh <<= c; + return lh; + } + + template + [[nodiscard]] + friend + constexpr + T + operator>>( + T lh, + C c) + { + lh >>= c; + return lh; + } + }; +}; +template +struct indexed +{ + template + class modifier; +}; + +template <> +struct indexed { + template + class modifier; + + template + class modifier> { + using ref = T&; + using cref = const T&; + using rref = T&&; + using type = strong::type; + public: + template + [[nodiscard]] + auto + operator[]( + const I &i) + const & + noexcept(noexcept(std::declval()[impl::access(i)])) + -> decltype(std::declval()[impl::access(i)]) { + auto& self = static_cast(*this); + return value_of(self)[impl::access(i)]; + } + + template + [[nodiscard]] + auto + operator[]( + const I &i) + & + noexcept(noexcept(std::declval()[impl::access(i)])) + -> decltype(std::declval()[impl::access(i)]) { + auto& self = static_cast(*this); + return value_of(self)[impl::access(i)]; + } + + template + [[nodiscard]] + auto + operator[]( + const I &i) + && + noexcept(noexcept(std::declval()[impl::access(i)])) + -> decltype(std::declval()[impl::access(i)]) { + auto& self = static_cast(*this); + return value_of(std::move(self))[impl::access(i)]; + } + + template + [[nodiscard]] + auto + at( + const I &i) + const & + -> 
decltype(std::declval().at(impl::access(i))) { + auto& self = static_cast(*this); + return value_of(self).at(impl::access(i)); + } + + template + [[nodiscard]] + auto + at( + const I &i) + & + -> decltype(std::declval().at(impl::access(i))) { + auto& self = static_cast(*this); + return value_of(self).at(impl::access(i)); + } + + template + [[nodiscard]] + auto + at( + const I &i) + && + -> decltype(std::declval().at(impl::access(i))) { + auto& self = static_cast(*this); + return value_of(std::move(self)).at(impl::access(i)); + } + }; +}; + +template +template +class indexed::modifier> +{ + using type = ::strong::type; +public: + [[nodiscard]] + auto + operator[]( + const I& i) + const & + noexcept(noexcept(std::declval()[impl::access(i)])) + -> decltype(std::declval()[impl::access(i)]) + { + auto& self = static_cast(*this); + return value_of(self)[impl::access(i)]; + } + + [[nodiscard]] + auto + operator[]( + const I& i) + & + noexcept(noexcept(std::declval()[impl::access(i)])) + -> decltype(std::declval()[impl::access(i)]) + { + auto& self = static_cast(*this); + return value_of(self)[impl::access(i)]; + } + + [[nodiscard]] + auto + operator[]( + const I& i) + && + noexcept(noexcept(std::declval()[impl::access(i)])) + -> decltype(std::declval()[impl::access(i)]) + { + auto& self = static_cast(*this); + return value_of(std::move(self))[impl::access(i)]; + } + + template + [[nodiscard]] + auto + at( + const I& i) + const & + -> decltype(std::declval().at(impl::access(i))) + { + auto& self = static_cast(*this); + return value_of(self).at(impl::access(i)); + } + + template + [[nodiscard]] + auto + at( + const I& i) + & + -> decltype(std::declval().at(impl::access(i))) + { + auto& self = static_cast(*this); + return value_of(self).at(impl::access(i)); + } + + template + [[nodiscard]] + auto + at( + const I& i) + && + -> decltype(std::declval().at(impl::access(i))) + { + auto& self = static_cast(*this); + return value_of(std::move(self)).at(impl::access(i)); + } +}; + 
+class iterator +{ +public: + template >::iterator_category> + class modifier + : public pointer::modifier + , public equality::modifier + , public incrementable::modifier + { + public: + using difference_type = typename std::iterator_traits>::difference_type; + using value_type = typename std::iterator_traits>::value_type; + using pointer = typename std::iterator_traits>::value_type; + using reference = typename std::iterator_traits>::reference; + using iterator_category = typename std::iterator_traits>::iterator_category; + }; + + template + class modifier + : public modifier + , public decrementable::modifier + { + }; + template + class modifier + : public modifier + , public affine_point>::difference_type>::template modifier + , public indexed<>::modifier + , public ordered::modifier + { + }; +}; + +class range +{ +public: + template + class modifier; +}; + +template +class range::modifier> +{ + using type = ::strong::type; + using r_iterator = decltype(std::declval().begin()); + using r_const_iterator = decltype(std::declval().begin()); +public: + using iterator = ::strong::type; + using const_iterator = ::strong::type; + + iterator + begin() + noexcept(noexcept(std::declval().begin())) + { + auto& self = static_cast(*this); + return iterator{value_of(self).begin()}; + } + + iterator + end() + noexcept(noexcept(std::declval().end())) + { + auto& self = static_cast(*this); + return iterator{value_of(self).end()}; + } + + const_iterator + cbegin() + const + noexcept(noexcept(std::declval().begin())) + { + auto& self = static_cast(*this); + return const_iterator{value_of(self).begin()}; + } + + const_iterator + cend() + const + noexcept(noexcept(std::declval().end())) + { + auto& self = static_cast(*this); + return const_iterator{value_of(self).end()}; + } + + const_iterator + begin() + const + noexcept(noexcept(std::declval().begin())) + { + auto& self = static_cast(*this); + return const_iterator{value_of(self).begin()}; + } + + const_iterator + end() + const + 
noexcept(noexcept(std::declval().end())) + { + auto& self = static_cast(*this); + return const_iterator{value_of(self).end()}; + } +}; + +namespace impl { + + template + struct converter + { + constexpr explicit operator D() const + noexcept(noexcept(static_cast(std::declval&>()))) + { + auto& self = static_cast(*this); + return static_cast(value_of(self)); + } + }; + template + struct implicit_converter + { + constexpr operator D() const + noexcept(noexcept(static_cast(std::declval&>()))) + { + auto& self = static_cast(*this); + return static_cast(value_of(self)); + } + }; +} +template +struct convertible_to +{ + template + struct modifier : impl::converter... + { + }; +}; + +template +struct implicitly_convertible_to +{ + template + struct modifier : impl::implicit_converter... + { + }; + +}; + +struct formattable +{ + template + class modifier{}; +}; + +} + +namespace std { +template +struct hash<::strong::type> + : std::conditional_t< + std::is_base_of< + ::strong::hashable::modifier< + ::strong::type + >, + ::strong::type + >::value, + hash, + std::false_type> +{ + using type = ::strong::type; + decltype(auto) + operator()( + const ::strong::hashable::modifier& t) + const + noexcept(noexcept(std::declval>()(value_of(std::declval())))) + { + auto& tt = static_cast(t); + return hash::operator()(value_of(tt)); + } +}; + +#if STRONG_HAS_STD_FORMAT +template +struct formatter<::strong::type, Char, + std::enable_if_t< + std::is_base_of< + ::strong::formattable::modifier< + ::strong::type + >, + ::strong::type + >::value + >> + : formatter +{ + using type = ::strong::type; + template + constexpr + decltype(auto) + format(const ::strong::formattable::modifier& t, FormatContext& fc) + noexcept(noexcept(std::declval>().format(value_of(std::declval()), fc))) + { + const auto& tt = static_cast(t); + return formatter::format(value_of(tt), fc); + } +}; +#endif + +} + +#if STRONG_HAS_FMT_FORMAT +namespace fmt +{ +template +struct formatter<::strong::type, Char, + 
std::enable_if_t< + std::is_base_of< + ::strong::formattable::modifier< + ::strong::type + >, + ::strong::type + >::value + >> + : formatter +{ + using type = ::strong::type; + template + constexpr + decltype(auto) + format(const ::strong::formattable::modifier& t, FormatContext& fc) + noexcept(noexcept(std::declval>().format(value_of(std::declval()), fc))) + { + const auto& tt = static_cast(t); + return formatter::format(value_of(tt), fc); + } +}; +} +#endif +#endif //ROLLBEAR_STRONG_TYPE_HPP_INCLUDED + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/thread_name.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/thread_name.h new file mode 100644 index 0000000000000000000000000000000000000000..5cda361bc8f17f673fb6735b76261b82d821f26d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/thread_name.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include + +namespace c10 { + +C10_API void setThreadName(std::string name); + +C10_API std::string getThreadName(); + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUCachingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUCachingAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..9fe6ecf7e59c18eaa8cd6afc37aa06a2045cf8aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUCachingAllocator.h @@ -0,0 +1,121 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10::xpu::XPUCachingAllocator { + +class XPUAllocator : public DeviceAllocator { + public: + virtual void init(c10::DeviceIndex device_count) = 0; + virtual void* raw_alloc(size_t nbytes) = 0; + virtual void raw_delete(void* ptr) = 0; +}; + +C10_XPU_API extern std::atomic allocator; + +inline XPUAllocator* get() { + return allocator.load(); +} + +inline void init(c10::DeviceIndex device_count) { + get()->init(device_count); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + get()->emptyCache(mempool_id); +} + +inline void resetPeakStats(DeviceIndex device) { + get()->resetPeakStats(device); +} + +inline void resetAccumulatedStats(DeviceIndex device) { + get()->resetAccumulatedStats(device); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void* raw_alloc(size_t size) { + return get()->raw_alloc(size); +} + +inline void raw_delete(void* ptr) { + get()->raw_delete(ptr); +} + +inline void recordStream(const DataPtr& dataPtr, XPUStream stream) { + get()->recordStream(dataPtr, stream); +} + +C10_XPU_API void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access); + +C10_XPU_API double getMemoryFraction(DeviceIndex device); + +C10_XPU_API void setMemoryFraction(double fraction, 
DeviceIndex device); + +C10_XPU_API void createOrIncrefPool( + c10::DeviceIndex device, + c10::MempoolId_t mempool_id, + XPUAllocator* allocator = nullptr); + +C10_XPU_API void beginAllocateToPool( + c10::DeviceIndex device, + c10::MempoolId_t mempool_id, + std::function filter); + +C10_XPU_API void endAllocateToPool( + c10::DeviceIndex device, + c10::MempoolId_t mempool_id); + +C10_XPU_API void releasePool( + c10::DeviceIndex device, + c10::MempoolId_t mempool_id); + +C10_XPU_API int getPoolUseCount( + c10::DeviceIndex device, + c10::MempoolId_t mempool_id); + +} // namespace c10::xpu::XPUCachingAllocator + +namespace c10::xpu { + +using c10::CaptureId_t; +using c10::MempoolId_t; +struct C10_XPU_API MemPool { + MemPool( + XPUCachingAllocator::XPUAllocator* allocator = nullptr, + bool is_user_created = true, + bool use_on_oom = false); + MemPool(const MemPool&) = delete; + MemPool(MemPool&&) = default; + MemPool& operator=(const MemPool&) = delete; + MemPool& operator=(MemPool&&) = default; + ~MemPool(); + + MempoolId_t id(); + XPUCachingAllocator::XPUAllocator* allocator(); + int use_count(); + c10::DeviceIndex device(); + static MempoolId_t graph_pool_handle(bool is_user_created = true); + + private: + static std::atomic uid_; + static std::atomic uuid_; + XPUCachingAllocator::XPUAllocator* allocator_; + bool is_user_created_; + MempoolId_t id_; + c10::DeviceIndex device_; +}; +} // namespace c10::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUDeviceProp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUDeviceProp.h new file mode 100644 index 0000000000000000000000000000000000000000..b85a34f0bc3d032fe403c5e758cfbba252b27871 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUDeviceProp.h @@ -0,0 +1,212 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::xpu { + +#define AT_FORALL_XPU_DEVICE_PROPERTIES(_) \ + /* the device name of this SYCL device. */ \ + _(name) \ + \ + /* the device type associated with the device. */ \ + _(device_type) \ + \ + /* the vendor of this SYCL device. */ \ + _(vendor) \ + \ + /* a backend-defined driver version as a std::string. */ \ + _(driver_version) \ + \ + /* the SYCL version as a std::string in the form . */ \ + _(version) \ + \ + /* true if the SYCL device is available. Otherwise, return false. */ \ + _(is_available) \ + \ + /* the maximum size in bytes of the arguments that can be passed to a \ + * kernel. */ \ + _(max_parameter_size) \ + \ + /* the number of parallel compute units available to the device. */ \ + _(max_compute_units) \ + \ + /* the maximum dimensions that specify the global and local work-item IDs \ + * used by the data parallel execution model. */ \ + _(max_work_item_dimensions) \ + \ + /* the maximum number of workitems that are permitted in a work-group \ + * executing a kernel on a single compute unit. */ \ + _(max_work_group_size) \ + \ + /* the maximum number of subgroups in a work-group for any kernel executed \ + * on the device. */ \ + _(max_num_sub_groups) \ + \ + /* a std::vector of size_t containing the set of sub-group sizes supported \ + * by the device. 
*/ \ + _(sub_group_sizes) \ + \ + /* the maximum configured clock frequency of this SYCL device in MHz. */ \ + _(max_clock_frequency) \ + \ + /* the default compute device address space size specified as an unsigned \ + * integer value in bits. Must return either 32 or 64. */ \ + _(address_bits) \ + \ + /* the maximum size of memory object allocation in bytes. */ \ + _(max_mem_alloc_size) \ + \ + /* the minimum value in bits of the largest supported SYCL built-in data \ + * type if this SYCL device is not of device type \ + * sycl::info::device_type::custom. */ \ + _(mem_base_addr_align) \ + \ + /* a std::vector of info::fp_config describing the half/single/double \ + * precision floating-point capability of this SYCL device. */ \ + _(half_fp_config) \ + _(single_fp_config) \ + _(double_fp_config) \ + \ + /* the size of global device memory in bytes. */ \ + _(global_mem_size) \ + \ + /* the type of global memory cache supported. */ \ + _(global_mem_cache_type) \ + \ + /* the size of global memory cache in bytes. */ \ + _(global_mem_cache_size) \ + \ + /* the size of global memory cache line in bytes. */ \ + _(global_mem_cache_line_size) \ + \ + /* the type of local memory supported. */ \ + _(local_mem_type) \ + \ + /* the size of local memory arena in bytes. */ \ + _(local_mem_size) \ + \ + /* the maximum number of sub-devices that can be created when this device is \ + * partitioned. */ \ + _(partition_max_sub_devices) \ + \ + /* the resolution of device timer in nanoseconds. */ \ + _(profiling_timer_resolution) \ + \ + /* the preferred native vector width size for built-in scalar types that can \ + * be put into vectors. */ \ + _(preferred_vector_width_char) \ + _(preferred_vector_width_short) \ + _(preferred_vector_width_int) \ + _(preferred_vector_width_long) \ + _(preferred_vector_width_float) \ + _(preferred_vector_width_double) \ + _(preferred_vector_width_half) \ + \ + /* the native ISA vector width. 
The vector width is defined as the number of \ + * scalar elements that can be stored in the vector. */ \ + _(native_vector_width_char) \ + _(native_vector_width_short) \ + _(native_vector_width_int) \ + _(native_vector_width_long) \ + _(native_vector_width_float) \ + _(native_vector_width_double) \ + _(native_vector_width_half) + +#define AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(_) \ + /* the number of EUs associated with the Intel GPU. */ \ + _(gpu_eu_count, gpu_eu_count, 512) \ + \ + /* the number of EUs in a subslice. */ \ + _(gpu_eu_count_per_subslice, gpu_eu_count_per_subslice, 8) \ + \ + /* the simd width of EU of GPU. */ \ + _(gpu_eu_simd_width, gpu_eu_simd_width, 8) \ + \ + /* the number of hardware threads per EU of GPU. */ \ + _(gpu_hw_threads_per_eu, gpu_hw_threads_per_eu, 8) \ + \ + /* the device identifier of the Intel GPU, also known as the product ID. */ \ + _(device_id, device_id, 0) \ + \ + /* the device descriptor for device Universal Unique ID, 16 bytes*/ \ + _(uuid, device_info_uuid, (std::array{})) + +#define AT_FORALL_XPU_DEVICE_ASPECT(_) \ + /* sycl::half is supported on device. */ \ + _(fp16) \ + \ + /* double is supported on device. */ \ + _(fp64) \ + \ + /* 64-bit atomic operation is supported on device. */ \ + _(atomic64) + +#define AT_FORALL_XPU_EXP_CL_ASPECT(_) \ + /* conversion between single-precision 32-bit floating-point values and \ + * 16-bit bfloat16 values is supported on device. */ \ + _(bfloat16_conversions) \ + \ + /* specialized hardware to compute MMA is supported on device. */ \ + _(subgroup_matrix_multiply_accumulate) \ + \ + /* specialized hardware to compute MMA for 32-bit floating-point is \ + * supported on device. */ \ + _(subgroup_matrix_multiply_accumulate_tensor_float32) \ + \ + /* block read operations for efficient matrix multiplication is supported on \ + * device. */ \ + _(subgroup_2d_block_io) + +#define AT_FORALL_XPU_EXP_DEVICE_PROPERTIES(_) \ + /* the device architecture of this SYCL device. 
*/ \ + _(architecture) + +#define _DEFINE_SYCL_PROP(ns, property, member) \ + ns::property::return_type member; + +#define DEFINE_DEVICE_PROP(property) \ + _DEFINE_SYCL_PROP(sycl::info::device, property, property) + +#define DEFINE_PLATFORM_PROP(property, member) \ + _DEFINE_SYCL_PROP(sycl::info::platform, property, member) + +#define DEFINE_EXT_DEVICE_PROP(property, ...) \ + _DEFINE_SYCL_PROP(sycl::ext::intel::info::device, property, property) + +#define DEFINE_DEVICE_ASPECT(member) bool has_##member; + +#define DEFINE_EXP_DEVICE_PROP(property) \ + _DEFINE_SYCL_PROP( \ + sycl::ext::oneapi::experimental::info::device, property, property) + +struct C10_XPU_API DeviceProp { + AT_FORALL_XPU_DEVICE_PROPERTIES(DEFINE_DEVICE_PROP); + + // the platform name. + DEFINE_PLATFORM_PROP(name, platform_name); + + AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP); + + AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT); + + AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT); + +#if SYCL_COMPILER_VERSION >= 20250000 + AT_FORALL_XPU_EXP_DEVICE_PROPERTIES(DEFINE_EXP_DEVICE_PROP); +#endif +}; + +#undef _DEFINE_SYCL_PROP +#undef DEFINE_DEVICE_PROP +#undef DEFINE_PLATFORM_PROP +#undef DEFINE_EXT_DEVICE_PROP +#undef DEFINE_DEVICE_ASPECT +#undef DEFINE_EXP_DEVICE_PROP + +} // namespace c10::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUEvent.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..596fdfcc0ff06ccdb4395c3989e987836804eddc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUEvent.h @@ -0,0 +1,183 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace c10::xpu { + +/* + * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are + * constructed lazily when first recorded. It has a device, and this device is + * acquired from the first recording stream. Later streams that record the event + * must match the same device. + * + * Currently, XPUEvent does NOT support to export an inter-process event from + * another process via inter-process communication(IPC). So it means that + * inter-process communication for event handles between different processes is + * not available. This could impact some applications that rely on cross-process + * synchronization and communication. 
+ */ +struct XPUEvent { + // Constructors + XPUEvent(bool enable_timing = false) noexcept + : enable_timing_{enable_timing} {} + + ~XPUEvent() { + if (isCreated()) { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion( + c10::kXPU, reinterpret_cast(event_.get())); + } + } + } + + C10_DISABLE_COPY_AND_ASSIGN(XPUEvent); + + XPUEvent(XPUEvent&& other) = default; + XPUEvent& operator=(XPUEvent&& other) = default; + + operator sycl::event&() const { + return event(); + } + + std::optional device() const { + if (isCreated()) { + return c10::Device(c10::kXPU, device_index_); + } else { + return std::nullopt; + } + } + + inline bool isCreated() const { + return (event_.get() != nullptr); + } + + DeviceIndex device_index() const { + return device_index_; + } + + sycl::event& event() const { + return *event_; + } + + bool query() const { + using namespace sycl::info; + if (!isCreated()) { + return true; + } + + return event().get_info() == + event_command_status::complete; + } + + void record() { + record(getCurrentXPUStream()); + } + + void recordOnce(const XPUStream& stream) { + if (!isCreated()) { + record(stream); + } + } + + void record(const XPUStream& stream) { + if (!isCreated()) { + device_index_ = stream.device_index(); + assignEvent(stream.queue()); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_creation( + c10::kXPU, reinterpret_cast(event_.get())); + } + } else { + TORCH_CHECK( + device_index_ == stream.device_index(), + "Event device ", + device_index_, + " does not match recording stream's device ", + stream.device_index(), + "."); + reassignEvent(stream.queue()); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record( + c10::kXPU, + reinterpret_cast(event_.get()), + 
reinterpret_cast(&stream.queue())); + } + } + + void block(const XPUStream& stream) { + if (isCreated()) { + std::vector event_list{event()}; + // Make this stream wait until event_ is completed. + stream.queue().ext_oneapi_submit_barrier(event_list); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait( + c10::kXPU, + reinterpret_cast(event_.get()), + reinterpret_cast(&stream.queue())); + } + } + } + + double elapsed_time(const XPUEvent& other) const { + TORCH_CHECK( + isCreated() && other.isCreated(), + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && other.query(), + "Both events must be completed before calculating elapsed time."); + TORCH_CHECK( + enable_timing_ && other.enable_timing_, + "Both events must be created with argument 'enable_timing=True'."); + + using namespace sycl::info::event_profiling; + // Block until both of the recorded events are completed. + uint64_t end_time_ns = other.event().get_profiling_info(); + uint64_t start_time_ns = event().get_profiling_info(); + // Return the eplased time in milliseconds. 
+ return 1e-6 * + (static_cast(end_time_ns) - static_cast(start_time_ns)); + } + + void synchronize() const { + if (isCreated()) { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization( + c10::kXPU, reinterpret_cast(event_.get())); + } + event().wait_and_throw(); + } + } + + private: + void assignEvent(sycl::queue& queue) { + if (enable_timing_) { + event_ = std::make_unique( + sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); + } else { + event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); + } + } + + void reassignEvent(sycl::queue& queue) { + event_.reset(); + assignEvent(queue); + } + + bool enable_timing_ = false; + c10::DeviceIndex device_index_ = -1; + // Only need to track the last event, as events in an in-order queue are + // executed sequentially. + std::unique_ptr event_; +}; + +} // namespace c10::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUException.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUException.h new file mode 100644 index 0000000000000000000000000000000000000000..d5d6d56a1560728c6604d04aaa2aa75c4c615aae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUException.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace c10::xpu { + +static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) { + if (el.size() == 0) { + return; + } + for (const auto& e : el) { + try { + std::rethrow_exception(e); + } catch (sycl::exception& e) { + TORCH_WARN("SYCL Exception: ", e.what()); + } + } + throw; +}; + +} // namespace c10::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..e5017a054d32448a372290fcab2adfdea3e7fb36 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUFunctions.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +// The naming convention used here matches the naming convention of torch.xpu + +namespace c10::xpu { + +// Log a warning only once if no devices are detected. +C10_XPU_API DeviceIndex device_count(); + +// Throws an error if no devices are detected. 
+C10_XPU_API DeviceIndex device_count_ensure_non_zero(); + +C10_XPU_API DeviceIndex current_device(); + +C10_XPU_API void set_device(DeviceIndex device); + +C10_XPU_API DeviceIndex exchange_device(DeviceIndex device); + +C10_XPU_API DeviceIndex maybe_exchange_device(DeviceIndex to_device); + +C10_XPU_API sycl::device& get_raw_device(DeviceIndex device); + +C10_XPU_API sycl::context& get_device_context(); + +C10_XPU_API void get_device_properties( + DeviceProp* device_prop, + DeviceIndex device); + +C10_XPU_API DeviceIndex get_device_idx_from_pointer(void* ptr); + +static inline void check_device_index(DeviceIndex device_index) { + TORCH_CHECK( + device_index >= 0 && device_index < c10::xpu::device_count(), + "The device index is out of range. It must be in [0, ", + static_cast(c10::xpu::device_count()), + "), but got ", + static_cast(device_index), + "."); +} + +} // namespace c10::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUGraphsC10Utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUGraphsC10Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..437dda44bfc4826d05389b530a7cd54083ec8f08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUGraphsC10Utils.h @@ -0,0 +1,47 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +// XPU Graphs utils used by c10 and aten. 
+using namespace sycl::ext::oneapi::experimental; +namespace c10::xpu { + +static_assert( + int8_t(queue_state::executing) == 0, + "unexpected int(queue_state::executing) value"); +static_assert( + int8_t(queue_state::recording) == 1, + "unexpected int(queue_state::recording) value"); + +enum class CaptureStatus : int8_t { + Executing = int8_t(queue_state::executing), + Recording = int8_t(queue_state::recording) +}; + +inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) { + switch (status) { + case CaptureStatus::Executing: + os << "Executing"; + break; + case CaptureStatus::Recording: + os << "Recording"; + break; + default: + TORCH_INTERNAL_ASSERT( + false, "Unknown XPU graph CaptureStatus", int(status)); + } + return os; +} + +inline CaptureStatus currentStreamCaptureStatusMayInitCtx() { + auto state = c10::xpu::getCurrentXPUStream().queue().ext_oneapi_get_state(); + return CaptureStatus(state); +} + +} // namespace c10::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUMacros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUMacros.h new file mode 100644 index 0000000000000000000000000000000000000000..43a42c2a6f8a47a2276268e58edc176ec5f6a781 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUMacros.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif + +// See c10/macros/Export.h for a detailed explanation of what the function +// of these macros are. We need one set of macros for every separate library +// we build. 
+ +#ifdef _WIN32 +#if defined(C10_XPU_BUILD_SHARED_LIBS) +#define C10_XPU_EXPORT __declspec(dllexport) +#define C10_XPU_IMPORT __declspec(dllimport) +#else +#define C10_XPU_EXPORT +#define C10_XPU_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_XPU_EXPORT __attribute__((__visibility__("default"))) +#else // defined(__GNUC__) +#define C10_XPU_EXPORT +#endif // defined(__GNUC__) +#define C10_XPU_IMPORT C10_XPU_EXPORT +#endif // _WIN32 + +// This one is being used by libc10_xpu.so +#ifdef C10_XPU_BUILD_MAIN_LIB +#define C10_XPU_API C10_XPU_EXPORT +#else +#define C10_XPU_API C10_XPU_IMPORT +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUStream.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUStream.h new file mode 100644 index 0000000000000000000000000000000000000000..df79df4945aa93da62b5faf0bf931a44cd09bf2d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUStream.h @@ -0,0 +1,217 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10::xpu { + +/* + * Note [Stream Management] + * + * An XPUStream is an abstraction of an actual SYCL queue in which SYCL kernel + * can execute. Currently, there are several pools per device to manage SYCL + * queue, and a device's pool is lazily created. + * + * There are two pools per device. The first pool contains "normal priority" + * queues. The second pool is the "high priority" queues. There are 32 queues in + * per pool per device, and when a queue is requested one of these queues is + * returned round-robin. That is, the first queue requested is at index 0, the + * second at index 1... 
to index 31, then index 0 again. + * + * This means that if 33 queues are requested, the first and last queues + * requested are actually the same queue (under the covers) and kernels enqueued + * on them cannot run concurrently. + * + * It is safe to enqueue a kernel on the same queue from two different + * threads as the SYCL specification described. + */ + +static constexpr int max_compile_time_stream_priorities = 3; + +/* + * This serves as a wrapper around c10::Stream and acts as a representation for + * a SYCL queue, which allows asynchronous execution of XPU tasks. + */ +class C10_XPU_API XPUStream { + public: + enum Unchecked { UNCHECKED }; + + /// Construct a XPUStream from a Stream. This construction is checked, and + /// will raise an error if the Stream is not, in fact, a XPU stream. + explicit XPUStream(Stream stream) : stream_(stream) { + TORCH_CHECK(stream_.device_type() == DeviceType::XPU); + } + + /// Construct a XPUStream from a Stream with no error checking. + explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {} + + bool operator==(const XPUStream& other) const noexcept { + return unwrap() == other.unwrap(); + } + + bool operator!=(const XPUStream& other) const noexcept { + return unwrap() != other.unwrap(); + } + + /// Implicit conversion to sycl::queue&. + operator sycl::queue&() const { + return queue(); + } + + /// Implicit conversion to sycl::queue*. + operator sycl::queue*() const { + return &queue(); + } + + /// Implicit conversion to Stream (a.k.a., forget that the stream is a + /// XPU stream). + operator Stream() const { + return unwrap(); + } + + /// Get the XPU device type that this stream is associated with. + DeviceType device_type() const { + return DeviceType::XPU; + } + + /// Get the XPU device index that this stream is associated with. + DeviceIndex device_index() const { + return stream_.device_index(); + } + + /// Get the full Device that this stream is associated with. 
The Device is + /// guaranteed to be a XPU device. + Device device() const { + return Device(DeviceType::XPU, device_index()); + } + + /// Return the stream ID corresponding to this particular stream. StreamId is + /// a int64_t representation generated by its type and index. + StreamId id() const { + return stream_.id(); + } + + /// Return true if all enqueued tasks in this stream have been completed, + /// otherwise return false. + bool query() const { + return queue().ext_oneapi_empty(); + } + + /// Performs a blocking wait for the completion of all enqueued tasks in this + /// stream. + void synchronize() const { + queue().wait_and_throw(); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + c10::kXPU, reinterpret_cast(&queue())); + } + } + + /// Return the priority that this stream is associated with. Lower numbers + /// represent higher priority. + int priority() const; + + /// Explicit conversion to sycl::queue&. + sycl::queue& queue() const; + + /// Explicit conversion to Stream. + Stream unwrap() const { + return stream_; + } + + /// Reversibly pack a XPUStream into a struct representation. The XPUStream + /// can be unpacked using unpack3(). + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + /// Unpack a XPUStream from the 3 fields generated by pack3(). + static XPUStream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return XPUStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + /// Return the range of priority **supported by PyTorch**. + static std::tuple priority_range() { + // See Note [XPU Stream priorities] + return std::make_tuple(1, -max_compile_time_stream_priorities + 2); + } + + private: + Stream stream_; +}; + +/** + * Get a stream from the pool in a round-robin fashion. 
+ * + * You can request a stream from the highest priority pool by setting + * isHighPriority to true for a specific device. + */ +C10_XPU_API XPUStream +getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); + +/** + * Get a stream from the pool in a round-robin fashion. + * + * You can request a stream by setting a priority value for a specific device. + * The priority number lower, the priority higher. + */ +C10_XPU_API XPUStream +getStreamFromPool(const int priority, DeviceIndex device = -1); + +/** + * Get an XPUStream from an external SYCL queue. + * + * This function allows interoperability with other libraries by enabling + * the use of an external SYCL queue that was not created by PyTorch. This + * can be useful for data exchange or other operations where integration + * with non-PyTorch queues is required. + * + * NOTE: It is the user's responsibility to ensure that the referenced SYCL + * queue remains alive while the corresponding XPUStream, or any c10::Stream + * derived from it, is in use. The different SYCL queue pointers will result in + * distinct XPUStream instances, even if the SYCL queues they dereference are + * equivalent. + */ +C10_XPU_API XPUStream +getStreamFromExternal(sycl::queue* ext_queue, DeviceIndex device_index); + +/** + * Get the current XPU stream, for the passed XPU device, or for the current + * device if no device index is passed. + */ +C10_XPU_API XPUStream getCurrentXPUStream(DeviceIndex device = -1); + +/** + * Set the current stream on the device of the passed in stream to be the passed + * in stream. + */ +C10_XPU_API void setCurrentXPUStream(XPUStream stream); + +C10_XPU_API std::ostream& operator<<(std::ostream& stream, const XPUStream& s); + +/** + * Block all reserved SYCL queues in the stream pools on the device, and wait + * for their synchronizations. 
+ */ +C10_XPU_API void syncStreamsOnDevice(DeviceIndex device = -1); + +} // namespace c10::xpu + +namespace std { +template <> +struct hash { + size_t operator()(c10::xpu::XPUStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/impl/XPUGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/impl/XPUGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..0d700f946ebe76abf99c2641448f5b4e2c3241eb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/impl/XPUGuardImpl.h @@ -0,0 +1,223 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace c10::xpu::impl { + +struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = kXPU; + + XPUGuardImpl() = default; + + explicit XPUGuardImpl(DeviceType t) { + TORCH_CHECK( + t == kXPU, "XPUGuardImpl initialized with non-XPU DeviceType: ", t); + } + + DeviceType type() const override { + return kXPU; + } + + Device exchangeDevice(Device d) const override { + TORCH_CHECK(d.is_xpu(), "Expected a XPU device, but got ", d); + const auto old_device_index = c10::xpu::exchange_device(d.index()); + return Device(kXPU, old_device_index); + } + + Device getDevice() const override { + const auto device = c10::xpu::current_device(); + return Device(kXPU, device); + } + + void setDevice(Device d) const override { + TORCH_CHECK(d.is_xpu(), "Expected a XPU device, but got ", d); + c10::xpu::set_device(d.index()); + } + + void uncheckedSetDevice(Device d) 
const noexcept override { + c10::xpu::set_device(d.index()); + } + + Stream getStream(Device d) const override { + return getCurrentXPUStream(d.index()).unwrap(); + } + + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPool(priority, d.index()); + } + + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) + const override { + return getStreamFromPool(isHighPriority, d.index()); + } + + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const override { + const XPUStream stream(s); + const auto old_stream = getCurrentXPUStream(s.device().index()); + setCurrentXPUStream(stream); + return old_stream.unwrap(); + } + + DeviceIndex deviceCount() const noexcept override { + return c10::xpu::device_count(); + } + + // Event-related functions + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override { + if (!event) + return; + + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion( + c10::kXPU, reinterpret_cast(event)); + } + + delete reinterpret_cast(event); + } + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + TORCH_CHECK( + device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + auto* xpu_event = reinterpret_cast(*event); + const XPUStream xpu_stream{stream}; + + // Delete the event previously recorded. + if (xpu_event) + delete xpu_event; +#if SYCL_COMPILER_VERSION >= 20250000 + if (flag == EventFlag::BACKEND_DEFAULT) { + // Use the profiling tag to record the event to enable timing feature. 
+ xpu_event = + new sycl::event(sycl::ext::oneapi::experimental::submit_profiling_tag( + xpu_stream.queue())); + } else { + xpu_event = + new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier()); + } +#else + xpu_event = new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier()); +#endif + *event = reinterpret_cast(xpu_event); + + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record( + c10::kXPU, + reinterpret_cast(xpu_event), + reinterpret_cast(&xpu_stream.queue())); + } + } + + void block(void* event, const Stream& stream) const override { + if (!event) + return; + auto* xpu_event = reinterpret_cast(event); + std::vector event_list{*xpu_event}; + const XPUStream xpu_stream(stream); + xpu_stream.queue().ext_oneapi_submit_barrier(event_list); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait( + c10::kXPU, + reinterpret_cast(xpu_event), + reinterpret_cast(&xpu_stream.queue())); + } + } + + bool queryEvent(void* event) const override { + using namespace sycl::info; + if (!event) + return true; + auto* xpu_event = reinterpret_cast(event); + return xpu_event->get_info() == + event_command_status::complete; + } + + double elapsedTime( + void* start_event, + void* end_event, + const DeviceIndex device_index) const override { +#if SYCL_COMPILER_VERSION < 20250000 + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "elapsedTime requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); +#endif + TORCH_CHECK( + start_event && end_event, + "Both events must be recorded before calculating elapsed time."); + auto* xpu_start_event = reinterpret_cast(start_event); + auto* xpu_end_event = reinterpret_cast(end_event); + + using namespace sycl::info::event_profiling; + // Block until both of the recorded events are completed. 
+ uint64_t end_time_ns = xpu_end_event->get_profiling_info(); + uint64_t start_time_ns = xpu_start_event->get_profiling_info(); + // Return the eplased time in milliseconds. + return 1e-6 * + (static_cast(end_time_ns) - static_cast(start_time_ns)); + } + + // Stream-related functions + bool queryStream(const Stream& stream) const override { + const XPUStream xpu_stream{stream}; + return xpu_stream.query(); + } + + void synchronizeStream(const Stream& stream) const override { + const XPUStream xpu_stream{stream}; + xpu_stream.synchronize(); + } + + void synchronizeEvent(void* event) const override { + if (!event) + return; + auto* xpu_event = reinterpret_cast(event); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization( + c10::kXPU, reinterpret_cast(xpu_event)); + } + xpu_event->wait_and_throw(); + } + + void synchronizeDevice(const c10::DeviceIndex device_index) const override { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_device_synchronization(c10::kXPU); + } + c10::xpu::syncStreamsOnDevice(device_index); + } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) + const override { + const XPUStream xpu_stream{stream}; + XPUCachingAllocator::recordStream(data_ptr, xpu_stream); + } +}; + +} // namespace c10::xpu::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/test/impl/XPUTest.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/test/impl/XPUTest.h new file mode 100644 index 0000000000000000000000000000000000000000..336c8349121389fd6dc64732ef50977e1cb2e0d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/test/impl/XPUTest.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#include + +static inline void initHostData(int* hostData, int numel) { + for (const auto i : c10::irange(numel)) { + hostData[i] = i; + } +} + +static inline void clearHostData(int* hostData, int numel) { + for (const auto i : c10::irange(numel)) { + hostData[i] = 0; + } +} + +static inline void validateHostData(int* hostData, int numel) { + for (const auto i : c10::irange(numel)) { + EXPECT_EQ(hostData[i], i); + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/common.h new file mode 100644 index 0000000000000000000000000000000000000000..f8de86b9ed8e3fc25a3e6efe20bc36f5b29336c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/common.h @@ -0,0 +1,66 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef CAFFE2_CORE_COMMON_H_ +#define CAFFE2_CORE_COMMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +#include +#endif + +#if defined(_MSC_VER) +#include +#else +#include +#endif + +// Macros used during the build of this caffe2 instance. This header file +// is automatically generated by the cmake script during build. +#include "caffe2/core/macros.h" + +#include + +namespace caffe2 { + +// Using statements for common classes that we refer to in caffe2 very often. +// Note that we only place it inside caffe2 so the global namespace is not +// polluted. +/* using override */ +using std::set; +using std::string; +using std::unique_ptr; +using std::vector; + +// Define alignment macro that is cross platform +#if (defined _MSC_VER && !defined NOMINMAX) +#define NOMINMAX +#endif + +using std::make_unique; + +#if defined(__ANDROID__) && !defined(__NDK_MAJOR__) +using ::round; +#else +using std::round; +#endif // defined(__ANDROID__) && !defined(__NDK_MAJOR__) + +// Returns which setting Caffe2 was configured and built with (exported from +// CMake) +TORCH_API const std::map& GetBuildOptions(); + +} // namespace caffe2 + +#endif // CAFFE2_CORE_COMMON_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..ae86a3366590c8538b92cc7e92191365ef3545c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/macros.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Automatically generated header file for caffe2 macros. These +// macros are used to build the Caffe2 binary, and if you are +// building a dependent library, they will need to be set as well +// for your program to link correctly. + +#pragma once + +#define CAFFE2_BUILD_SHARED_LIBS +/* #undef CAFFE2_FORCE_FALLBACK_CUDA_MPI */ +/* #undef CAFFE2_HAS_MKL_DNN */ +/* #undef CAFFE2_HAS_MKL_SGEMM_PACK */ +#define CAFFE2_PERF_WITH_AVX +#define CAFFE2_PERF_WITH_AVX2 +/* #undef CAFFE2_THREADPOOL_MAIN_IMBALANCE */ +/* #undef CAFFE2_THREADPOOL_STATS */ +/* #undef CAFFE2_USE_ACCELERATE */ +#define CAFFE2_USE_CUDNN +/* #undef CAFFE2_USE_EIGEN_FOR_BLAS */ +/* #undef CAFFE2_USE_FBCODE */ +/* #undef CAFFE2_USE_GOOGLE_GLOG */ +/* #undef CAFFE2_USE_LITE_PROTO */ +#define CAFFE2_USE_MKL +#define USE_MKLDNN +/* #undef CAFFE2_USE_NVTX */ +/* #undef CAFFE2_USE_ITT */ + +#ifndef EIGEN_MPL2_ONLY +#define EIGEN_MPL2_ONLY +#endif + +// Useful build settings that are recorded in the compiled binary +// torch.__config__.show() +#define CAFFE2_BUILD_STRINGS { \ + {"TORCH_VERSION", "2.10.0"}, \ + {"CXX_COMPILER", "/opt/rh/gcc-toolset-13/root/usr/bin/c++"}, \ + {"CXX_FLAGS", " -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_FBGEMM_GENAI -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED 
-Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow"}, \ + {"BUILD_TYPE", "Release"}, \ + {"BLAS_INFO", "mkl"}, \ + {"LAPACK_INFO", "mkl"}, \ + {"USE_CUDA", "ON"}, \ + {"USE_ROCM", "OFF"}, \ + {"CUDA_VERSION", "12.8"}, \ + {"ROCM_VERSION", ""}, \ + {"USE_CUDNN", "ON"}, \ + {"COMMIT_SHA", "449b1768410104d3ed79d3bcfe4ba1d65c7f22c0"}, \ + {"CUDNN_VERSION", "9.10.2"}, \ + {"USE_NCCL", "1"}, \ + {"USE_MPI", "OFF"}, \ + {"USE_GFLAGS", "OFF"}, \ + {"USE_GLOG", "OFF"}, \ + {"USE_GLOO", "ON"}, \ + {"USE_NNPACK", "ON"}, \ + {"USE_OPENMP", "ON"}, \ + {"FORCE_FALLBACK_CUDA_MPI", ""}, \ + {"HAS_MKL_DNN", ""}, \ + {"HAS_MKL_SGEMM_PACK", ""}, \ + {"PERF_WITH_AVX", "1"}, \ + {"PERF_WITH_AVX2", "1"}, \ + {"USE_ACCELERATE", ""}, \ + {"USE_EIGEN_FOR_BLAS", ""}, \ + {"USE_LITE_PROTO", ""}, \ + {"USE_MKL", "ON"}, \ + {"USE_MKLDNN", "ON"}, \ + {"USE_NVTX", ""}, \ + {"USE_ITT", ""}, \ + {"USE_ROCM_KERNEL_ASSERT", "OFF"}, \ + {"USE_CUSPARSELT", "1"}, \ + {"USE_XPU", "OFF"}, \ + {"USE_XCCL", "OFF"}, \ +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/timer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/timer.h new file mode 100644 index 0000000000000000000000000000000000000000..54ff81fc25e27eb38cc23e497b692f321b71c6b4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/timer.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef CAFFE2_CORE_TIMER_H_ +#define CAFFE2_CORE_TIMER_H_ + +#include + +#include "caffe2/core/common.h" + +namespace caffe2 { + +/** + * @brief A simple timer object for measuring time. + * + * This is a minimal class around a std::chrono::high_resolution_clock that + * serves as a utility class for testing code. + */ +class Timer { + public: + typedef std::chrono::high_resolution_clock clock; + typedef std::chrono::nanoseconds ns; + Timer() { Start(); } + /** + * @brief Starts a timer. + */ + inline void Start() { start_time_ = clock::now(); } + inline float NanoSeconds() { + return static_cast( + std::chrono::duration_cast(clock::now() - start_time_).count()); + } + /** + * @brief Returns the elapsed time in milliseconds. + */ + inline float MilliSeconds() { return NanoSeconds() / 1000000.f; } + /** + * @brief Returns the elapsed time in microseconds. + */ + inline float MicroSeconds() { return NanoSeconds() / 1000.f; } + /** + * @brief Returns the elapsed time in seconds. + */ + inline float Seconds() { return NanoSeconds() / 1000000000.f; } + + protected: + std::chrono::time_point start_time_; + C10_DISABLE_COPY_AND_ASSIGN(Timer); +}; +} + +#endif // CAFFE2_CORE_TIMER_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/batch_box_cox_vec.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/batch_box_cox_vec.h new file mode 100644 index 0000000000000000000000000000000000000000..7c7c0b7ec332ff3e66c897806c6e26dbbb7dee9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/batch_box_cox_vec.h @@ -0,0 +1,326 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include "vectorizer.h" +#include + +namespace caffe2::details { + +namespace { +void TileIndicesInPlace(std::vector& v, const std::size_t D, const std::size_t K) { + auto n = v.size(); + v.resize(K * n); + for (const auto k : c10::irange(1, K)) { + for (const auto j : c10::irange(n)) { + v[k * n + j] = v[j] + k * D; + } + } +} + +// MKL VML function templates. 
+template +void PackV(const int N, const T* a, const int* ia, T* y); +template +void UnpackV(const int N, const T* a, T* y, const int* iy); + +#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc) \ + template <> \ + void PackV(const int N, const T* a, const int* ia, T* y) { \ + OriginalFunc(N, a, ia, y); \ + } +DELEGATE_PACKV_FUNCTION(float, vsPackV) +DELEGATE_PACKV_FUNCTION(double, vdPackV) +#undef DELEGATE_PACKV_FUNCTION + +#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc) \ + template <> \ + void UnpackV(const int N, const T* a, T* y, const int* iy) { \ + OriginalFunc(N, a, y, iy); \ + } +DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV) +DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV) +#undef DELEGATE_UNPACKV_FUNCTION + +#ifndef FAST_VECTORIZED_KERNEL +template +void box_cox_zero_lambda( + size_t D, + const T* const self_data, + const T* const lambda2_data, + T k_eps, + T* const output_data) { + int j = 0; + using Vec = at::vec::Vectorized; + constexpr int64_t VLEN = Vec::size(); + auto k_eps_vec = Vec(k_eps); + for(; j + VLEN < D; j += VLEN) { + auto data = Vec::loadu(self_data + j); + auto lambda2 = Vec::loadu(lambda2_data + j); + auto sum = data + lambda2; + auto max = at::vec::max(sum, k_eps_vec); + auto res = max.log(); + res.store(output_data + j); + } + for ( ;j < D; ++j) { + auto sum = self_data[j] + lambda2_data[j]; + auto max = std::max(sum, k_eps); + output_data[j] = std::log(max); + } +} + +template +at::vec::Vectorized box_cox_nonzero_lambda_impl( + at::vec::Vectorized data, + at::vec::Vectorized lambda1, + at::vec::Vectorized lambda2, + at::vec::Vectorized k_eps) { + auto sum = data + lambda2; + auto max = at::vec::max(sum, k_eps); + auto lambda_over_1 = at::vec::fast_recieprocal(lambda1); + auto pow = max.pow(lambda1); + return at::vec::fmsub(pow, lambda_over_1, lambda_over_1); +} + +template +void box_cox_nonzero_lambda( + int64_t D, + const T* data_ptr, + const T* lambda1_ptr, + const T* lambda2_ptr, + T k_eps, + T* out) { + + int j = 0; + using Vec = 
at::vec::Vectorized; + constexpr int64_t VLEN = Vec::size(); + auto k_eps_vec = Vec(k_eps); + for(; j + VLEN < D; j += VLEN) { + auto data = Vec::loadu(data_ptr + j); + auto lambda1 = Vec::loadu(lambda1_ptr + j); + auto lambda2 = Vec::loadu(lambda2_ptr + j); + auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec); + res.store(out + j); + } + if (j < D) { + auto remaining = D - j; + auto data = Vec::loadu(data_ptr + j, remaining); + auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining); + auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining); + auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec); + res.store(out + j, remaining); + } +} +#else +template +void box_cox_zero_lambda( + size_t D, + const T* const self_data, + const T* const lambda2_data, + T k_eps, + T* const output_data) { + VECTOR_LOOP for (auto j=0 ;j < D; ++j) { + auto sum = self_data[j] + lambda2_data[j]; + auto max = std::max(sum, k_eps); + output_data[j] = std::log(max); + } +} + +template +void box_cox_nonzero_lambda( + int64_t D, + const T* data_ptr, + const T* lambda1_ptr, + const T* lambda2_ptr, + T k_eps, + T* out) { + + VECTOR_LOOP for (auto j=0 ;j < D; ++j) { + FAST_MATH + auto sum = data_ptr[j] + lambda2_ptr[j]; + auto max = std::max(sum, k_eps); + auto lamda1 = lambda1_ptr[j]; + auto lambda_over_1 = 1 / lamda1; + if constexpr (std::is_same::value) { + lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lamda1); + lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lamda1); + } + auto pow = std::pow(max, lamda1); + out[j] = pow * lambda_over_1 - lambda_over_1; + } +} +#endif // FAST_VECTORIZED_KERNEL + +template +void box_cox_mixed_lambda( + const T* const self_data, + const std::vector& nonzeros, + const std::vector& zeros, + const T* const lambda1, + const T* const lambda2, + const T* const lambda2_z_, + T k_eps, + T* const buffer, + T* const output_data) { + PackV(nonzeros.size(), self_data, nonzeros.data(), buffer); + 
box_cox_nonzero_lambda( + nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer); + UnpackV(nonzeros.size(), buffer, output_data, nonzeros.data()); + + PackV(zeros.size(), self_data, zeros.data(), buffer); + box_cox_zero_lambda( + zeros.size(), buffer, lambda2_z_, k_eps, buffer); + UnpackV(zeros.size(), buffer, output_data, zeros.data()); +} + +template +void TileArrayIntoVector( + const T* const a, + const size_t D, + const int K, + std::vector& b) { + b.resize(K * D); + for (const auto k : c10::irange(K)) { + std::copy(a, a + D, b.begin() + k * D); + } +} + +template +void compute_batch_box_cox_vec_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const T* self_data, + const T* __restrict lambda1_data, + const T* __restrict lambda2_data, + T* output_data) { + constexpr T k_eps = static_cast(1e-6); + + FOLLY_DECLARE_REUSED(zeros, std::vector); + FOLLY_DECLARE_REUSED(nonzeros, std::vector); + // Don't bother calling reserve; calls after the first will get a + // correctly-sized allocation anyway. + for (const auto j : c10::irange(D)) { + if (lambda1_data[j] == 0) { + zeros.push_back(j); + } else { + nonzeros.push_back(j); + } + } + + // Process K rows at a time for effective vectorization with small rows. 
+ const auto K = std::min(N, (block_size + D - 1) / D); + + FOLLY_DECLARE_REUSED(lambda1_, std::vector); + FOLLY_DECLARE_REUSED(lambda2_, std::vector); + FOLLY_DECLARE_REUSED(lambda2_z_, std::vector); + + if (nonzeros.size() == D) { + // ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0 + size_t i = 0; + if (K > 1) { + TileArrayIntoVector(lambda1_data, D, K, lambda1_); + TileArrayIntoVector(lambda2_data, D, K, lambda2_); + DCHECK_EQ(K * D, lambda1_.size()); + DCHECK_EQ(K * D, lambda2_.size()); + for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { + box_cox_nonzero_lambda( + K * D, + self_data, + lambda1_.data(), + lambda2_.data(), + k_eps, + output_data); + } + } + for (; i < N; i++, self_data += D, output_data += D) { + box_cox_nonzero_lambda( + D, self_data, lambda1_data, lambda2_data, k_eps, output_data); + } + } else if (zeros.size() == D) { + // ln(x + lambda2), if lambda1 == 0 + size_t i = 0; + if (K > 1) { + TileArrayIntoVector(lambda2_data, D, K, lambda2_z_); + DCHECK_EQ(K * D, lambda2_z_.size()); + for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { + box_cox_zero_lambda( + K * D, self_data, lambda2_z_.data(), k_eps, output_data); + } + } + for (; i < N; i++, self_data += D, output_data += D) { + box_cox_zero_lambda( + D, self_data, lambda2_data, k_eps, output_data); + } + } else { + // mix zeros and nonzeros + const size_t n = nonzeros.size(); + if (K > 1) { + TileIndicesInPlace(nonzeros, 0, K); + TileIndicesInPlace(zeros, 0, K); + } + + FOLLY_DECLARE_REUSED(buffer, std::vector); + + buffer.resize(std::max(nonzeros.size(), zeros.size())); + lambda1_.resize(nonzeros.size()); + lambda2_.resize(nonzeros.size()); + lambda2_z_.resize(zeros.size()); + PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data()); + PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data()); + PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data()); + + size_t i = 0; + if (K > 1) { + // Truncate to 
original size, and re-tile with offsets this time. + nonzeros.resize(n); + DCHECK_GT(D, n); + zeros.resize(D - n); + TileIndicesInPlace(nonzeros, D, K); + TileIndicesInPlace(zeros, D, K); + DCHECK_EQ(nonzeros.size(), lambda1_.size()); + DCHECK_EQ(nonzeros.size(), lambda2_.size()); + DCHECK_EQ(zeros.size(), lambda2_z_.size()); + + for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { + box_cox_mixed_lambda( + self_data, + nonzeros, + zeros, + lambda1_.data(), + lambda2_.data(), + lambda2_z_.data(), + k_eps, + buffer.data(), + output_data); + } + // Truncate to original size. + nonzeros.resize(n); + zeros.resize(D - n); + } + for (; i < N; i++, self_data += D, output_data += D) { + box_cox_mixed_lambda( + self_data, + nonzeros, + zeros, + lambda1_.data(), + lambda2_.data(), + lambda2_z_.data(), + k_eps, + buffer.data(), + output_data); + } + } +} +} // namespace + +} // namespace caffe2::details + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/common.h new file mode 100644 index 0000000000000000000000000000000000000000..f927b1ac74631203bfb9ac4bf869d0e2fa7b0a7c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/common.h @@ -0,0 +1,145 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// !!!! PLEASE READ !!!! +// Minimize (transitively) included headers from _avx*.cc because some of the +// functions defined in the headers compiled with platform dependent compiler +// options can be reused by other translation units generating illegal +// instruction run-time error. 
+ +// Common utilities for writing performance kernels and easy dispatching of +// different backends. +/* +The general workflow shall be as follows, say we want to +implement a functionality called void foo(int a, float b). + +In foo.h, do: + void foo(int a, float b); + +In foo_avx512.cc, do: + void foo__avx512(int a, float b) { + [actual avx512 implementation] + } + +In foo_avx2.cc, do: + void foo__avx2(int a, float b) { + [actual avx2 implementation] + } + +In foo_avx.cc, do: + void foo__avx(int a, float b) { + [actual avx implementation] + } + +In foo.cc, do: + // The base implementation should *always* be provided. + void foo__base(int a, float b) { + [base, possibly slow implementation] + } + decltype(foo__base) foo__avx512; + decltype(foo__base) foo__avx2; + decltype(foo__base) foo__avx; + void foo(int a, float b) { + // You should always order things by their preference, faster + // implementations earlier in the function. + AVX512_DO(foo, a, b); + AVX2_DO(foo, a, b); + AVX_DO(foo, a, b); + BASE_DO(foo, a, b); + } + +*/ +// Details: this functionality basically covers the cases for both build time +// and run time architecture support. +// +// During build time: +// The build system should provide flags CAFFE2_PERF_WITH_AVX512, +// CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX that corresponds to the +// __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the +// compiler provides. Note that we do not use the compiler flags but rely on +// the build system flags, because the common files (like foo.cc above) will +// always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__ +// and __AVX__. +// During run time: +// we use cpuinfo to identify cpu support and run the proper functions. 
+ +#pragma once +#if defined(CAFFE2_PERF_WITH_SVE) || defined(CAFFE2_PERF_WITH_AVX512) || \ + defined(CAFFE2_PERF_WITH_AVX2) || defined(CAFFE2_PERF_WITH_AVX) +#include +#endif + +// DO macros: these should be used in your entry function, similar to foo() +// above, that routes implementations based on CPU capability. + +#define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__); + +#ifdef CAFFE2_PERF_WITH_SVE +#define SVE_DO(funcname, ...) \ + { \ + static const bool isDo = cpuinfo_initialize() && cpuinfo_has_arm_sve(); \ + if (isDo) { \ + return funcname##__sve(__VA_ARGS__); \ + } \ + } +#else // CAFFE2_PERF_WITH_SVE +#define SVE_DO(funcname, ...) +#endif // CAFFE2_PERF_WITH_SVE + +#ifdef CAFFE2_PERF_WITH_AVX512 +#define AVX512_DO(funcname, ...) \ + { \ + static const bool isDo = cpuinfo_initialize() && \ + cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512dq() && \ + cpuinfo_has_x86_avx512vl(); \ + if (isDo) { \ + return funcname##__avx512(__VA_ARGS__); \ + } \ + } +#else // CAFFE2_PERF_WITH_AVX512 +#define AVX512_DO(funcname, ...) +#endif // CAFFE2_PERF_WITH_AVX512 + +#ifdef CAFFE2_PERF_WITH_AVX2 +#define AVX2_DO(funcname, ...) \ + { \ + static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2(); \ + if (isDo) { \ + return funcname##__avx2(__VA_ARGS__); \ + } \ + } +#define AVX2_FMA_DO(funcname, ...) \ + { \ + static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2() && \ + cpuinfo_has_x86_fma3(); \ + if (isDo) { \ + return funcname##__avx2_fma(__VA_ARGS__); \ + } \ + } +#else // CAFFE2_PERF_WITH_AVX2 +#define AVX2_DO(funcname, ...) +#define AVX2_FMA_DO(funcname, ...) +#endif // CAFFE2_PERF_WITH_AVX2 + +#ifdef CAFFE2_PERF_WITH_AVX +#define AVX_DO(funcname, ...) \ + { \ + static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx(); \ + if (isDo) { \ + return funcname##__avx(__VA_ARGS__); \ + } \ + } +#define AVX_F16C_DO(funcname, ...) 
\ + { \ + static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx() && \ + cpuinfo_has_x86_f16c(); \ + if (isDo) { \ + return funcname##__avx_f16c(__VA_ARGS__); \ + } \ + } +#else // CAFFE2_PERF_WITH_AVX +#define AVX_DO(funcname, ...) +#define AVX_F16C_DO(funcname, ...) +#endif // CAFFE2_PERF_WITH_AVX + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/embedding_lookup_idx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/embedding_lookup_idx.h new file mode 100644 index 0000000000000000000000000000000000000000..45eb7106de95e6ae73e4a99b020339aadb7fc527 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/embedding_lookup_idx.h @@ -0,0 +1,62 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace caffe2 { + +// clang-format off +/** + * Embedding lookup with reduction. + * + * `input` of size data_size * block_size + * `indices` of size index_size + * `offsets` of size output_size + * `weights` nullptr or array of size index_size + * `out` of size output_size * block_size + * + * Behavior is roughly equivalent to pseudocode: + * + * pos = 0 + * for (i = 0..output_size-1) + * for (k = 0..block_size-1) + * out[i*block_size + k] = 0 + * start_offset = offsets[i] + * end_offset = offsets[i+1] + * length = end_offset - start_offset + * for (j = start_offset..end_offset-1) + * for (k = 0..block_size-1) + * out[i*block_size + k] += input[indices[pos]*block_size + k] * + * (weights ? weights[IS_WEIGHT_POSITIONAL ? 
j - start_offset : pos] : 1.0) + * pos += 1 + * if (normalize_weights && length > 0) + * for (k = 0..block_size-1) + * out[i*block_size + k] /= length + * + * TODO: make this API also take "offsets" rather than "lengths" to match the + * API for PyTorch's EmbeddingBag + */ +// clang-format on +template < + typename IndexType, + typename InType, + typename OutType, + bool IS_WEIGHT_POSITIONAL = false> +void EmbeddingLookupIdx( + const std::int64_t block_size, + const std::int64_t output_size, + const std::int64_t index_size, + const std::int64_t data_size, + const InType* input, + const IndexType* indices, + const IndexType* offsets, + const float* weights, // optional, can be null for non-weighted sum + const float* scale_bias, // optional scale & bias params for uint8 input + bool normalize_by_lengths, + OutType* out); + +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/crc_alt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/crc_alt.h new file mode 100644 index 0000000000000000000000000000000000000000..5586b37e59707104b1138b4f806200ded8466e87 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/crc_alt.h @@ -0,0 +1,1348 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// ////////////////////////////////////////////////////////// +// Crc32.h +// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved. 
+// Slicing-by-16 contributed by Bulat Ziganshin +// Tableless bytewise CRC contributed by Hagai Gold +// see http://create.stephan-brumme.com/disclaimer.html +// + +// if running on an embedded system, you might consider shrinking the +// big Crc32Lookup table by undefining these lines: +#define CRC32_USE_LOOKUP_TABLE_BYTE +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +// - crc32_bitwise doesn't need it at all +// - crc32_halfbyte has its own small lookup table +// - crc32_1byte_tableless and crc32_1byte_tableless2 don't need it at all +// - crc32_1byte needs only Crc32Lookup[0] +// - crc32_4bytes needs only Crc32Lookup[0..3] +// - crc32_8bytes needs only Crc32Lookup[0..7] +// - crc32_4x8bytes needs only Crc32Lookup[0..7] +// - crc32_16bytes needs all of Crc32Lookup +// using the aforementioned #defines the table is automatically fitted to your needs + +// uint8_t, uint32_t, int32_t +#include +// size_t +#include + +// crc32_fast selects the fastest algorithm depending on flags (CRC32_USE_LOOKUP_...) 
+/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs +uint32_t crc32_fast (const void* data, size_t length, uint32_t previousCrc32 = 0); + +/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA)) +uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB); + +/// compute CRC32 (bitwise algorithm) +uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (half-byte algorithm) +uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0); + +#ifdef CRC32_USE_LOOKUP_TABLE_BYTE +/// compute CRC32 (standard algorithm) +uint32_t crc32_1byte (const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32 = 0); + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +/// compute CRC32 (Slicing-by-4 algorithm) +uint32_t crc32_4bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +/// compute CRC32 (Slicing-by-8 algorithm) +uint32_t crc32_8bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times +uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +/// compute CRC32 (Slicing-by-16 algorithm) +uint32_t crc32_16bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks) +uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32 = 0, size_t prefetchAhead = 256); +#endif + +// 
////////////////////////////////////////////////////////// +// Crc32.cpp +// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved. +// Slicing-by-16 contributed by Bulat Ziganshin +// Tableless bytewise CRC contributed by Hagai Gold +// see http://create.stephan-brumme.com/disclaimer.html +// + +// if running on an embedded system, you might consider shrinking the +// big Crc32Lookup table: +// - crc32_bitwise doesn't need it at all +// - crc32_halfbyte has its own small lookup table +// - crc32_1byte needs only Crc32Lookup[0] +// - crc32_4bytes needs only Crc32Lookup[0..3] +// - crc32_8bytes needs only Crc32Lookup[0..7] +// - crc32_4x8bytes needs only Crc32Lookup[0..7] +// - crc32_16bytes needs all of Crc32Lookup + + +#ifndef __LITTLE_ENDIAN + #define __LITTLE_ENDIAN 1234 +#endif +#ifndef __BIG_ENDIAN + #define __BIG_ENDIAN 4321 +#endif + +// define endianness and some integer data types +#if defined(_MSC_VER) || defined(__MINGW32__) + // Windows always little endian + #define __BYTE_ORDER __LITTLE_ENDIAN + + // intrinsics / prefetching + #if defined(_M_ARM64) + #include + #else + #include + #endif + + #ifdef __MINGW32__ + #define PREFETCH(location) __builtin_prefetch(location) + #else + #if defined(_M_ARM64) + #define PREFETCH(location) __prefetch(location) + #else + #define PREFETCH(location) _mm_prefetch(location, _MM_HINT_T0) + #endif + #endif +#elif defined(__APPLE__) + #include + #if TARGET_IPHONE_SIMULATOR + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif TARGET_OS_IPHONE + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif TARGET_OS_MAC + #include + #if defined(__BIG_ENDIAN__) + #define __BYTE_ORDER __BIG_ENDIAN + #endif + #if defined(__LITTLE_ENDIAN__) + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif + #else + # error "Unknown Apple platform" + #endif +#elif defined(__ARMEB__) + #define __BYTE_ORDER __BIG_ENDIAN +#elif (defined(__BYTE_ORDER__) and !defined(__BYTE_ORDER)) + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define __BYTE_ORDER __BIG_ENDIAN + #else 
+ #define __BYTE_ORDER __LITTLE_ENDIAN + #endif +#else + // defines __BYTE_ORDER as __LITTLE_ENDIAN or __BIG_ENDIAN + #include +#endif + +// intrinsics / prefetching +#ifdef __GNUC__ + #define PREFETCH(location) __builtin_prefetch(location) +#else +#ifndef PREFETCH + // no prefetching + #define PREFETCH(location) ; +#endif +#endif + +// abort if byte order is undefined +#ifndef __BYTE_ORDER +#error undefined byte order, compile with -D__BYTE_ORDER=1234 (if little endian) or -D__BYTE_ORDER=4321 (big endian) +#endif + + +namespace +{ + /// zlib's CRC32 polynomial + const uint32_t Polynomial = 0xEDB88320; + + /// swap endianness + static inline uint32_t swap(uint32_t x) + { + #if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(x); + #else + return (x >> 24) | + ((x >> 8) & 0x0000FF00) | + ((x << 8) & 0x00FF0000) | + (x << 24); + #endif + } + + /// Slicing-By-16 + #ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + const size_t MaxSlice = 16; + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + const size_t MaxSlice = 8; + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + const size_t MaxSlice = 4; + #elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + const size_t MaxSlice = 1; + #else + #define NO_LUT // don't need Crc32Lookup at all + #endif + +} // anonymous namespace + +#ifndef NO_LUT +/// forward declaration, table is at the end of this file +extern const uint32_t Crc32Lookup[MaxSlice][256]; // extern is needed to keep compiler happy +#endif + + +/// compute CRC32 (bitwise algorithm) +uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + crc ^= *current++; + + for (int j = 0; j < 8; j++) + { + // branch-free + crc = (crc >> 1) ^ (-int32_t(crc & 1) & Polynomial); + + // branching, much slower: + //if (crc & 1) + // crc = (crc >> 1) ^ Polynomial; + //else + // crc = crc >> 1; 
+ } + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (half-byte algorithm) +uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + /// look-up table for half-byte, same as crc32Lookup[0][16*i] + static const uint32_t Crc32Lookup16[16] = + { + 0x00000000,0x1DB71064,0x3B6E20C8,0x26D930AC,0x76DC4190,0x6B6B51F4,0x4DB26158,0x5005713C, + 0xEDB88320,0xF00F9344,0xD6D6A3E8,0xCB61B38C,0x9B64C2B0,0x86D3D2D4,0xA00AE278,0xBDBDF21C + }; + + while (length-- != 0) + { + crc = Crc32Lookup16[(crc ^ *current ) & 0x0F] ^ (crc >> 4); + crc = Crc32Lookup16[(crc ^ (*current >> 4)) & 0x0F] ^ (crc >> 4); + current++; + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +#ifdef CRC32_USE_LOOKUP_TABLE_BYTE +/// compute CRC32 (standard algorithm) +uint32_t crc32_1byte(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *current++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + uint8_t s = uint8_t(crc) ^ *current++; + + // Hagai Gold made me aware of this table-less algorithm and send me code + + // polynomial 0xEDB88320 can be written in binary as 11101101101110001000001100100000b + // reverse the bits (or just assume bit 0 is the first one) + // and we have bits set at position 0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 16, 22, 23, 26 + // => those are the shift offsets: + //crc = (crc >> 8) ^ + // t ^ + // 
(t >> 1) ^ (t >> 2) ^ (t >> 4) ^ (t >> 5) ^ // == y + // (t >> 7) ^ (t >> 8) ^ (t >> 10) ^ (t >> 11) ^ // == y >> 6 + // (t >> 12) ^ (t >> 16) ^ // == z + // (t >> 22) ^ (t >> 26) ^ // == z >> 10 + // (t >> 23); + + // the fastest I can come up with: + uint32_t low = (s ^ (s << 6)) & 0xFF; + uint32_t a = (low * ((1 << 23) + (1 << 14) + (1 << 2))); + crc = (crc >> 8) ^ + (low * ((1 << 24) + (1 << 16) + (1 << 8))) ^ + a ^ + (a >> 1) ^ + (low * ((1 << 20) + (1 << 12) )) ^ + (low << 19) ^ + (low << 17) ^ + (low >> 2); + + // Hagai's code: + /*uint32_t t = (s ^ (s << 6)) << 24; + // some temporaries to optimize XOR + uint32_t x = (t >> 1) ^ (t >> 2); + uint32_t y = x ^ (x >> 3); + uint32_t z = (t >> 12) ^ (t >> 16); + crc = (crc >> 8) ^ + t ^ (t >> 23) ^ + y ^ (y >> 6) ^ + z ^ (z >> 10);*/ + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32) +{ + int32_t crc = ~previousCrc32; // note: signed integer, right shift distributes sign bit into lower bits + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + crc = crc ^ *current++; + + uint32_t c = (((crc << 31) >> 31) & ((Polynomial >> 7) ^ (Polynomial >> 1))) ^ + (((crc << 30) >> 31) & ((Polynomial >> 6) ^ Polynomial)) ^ + (((crc << 29) >> 31) & (Polynomial >> 5)) ^ + (((crc << 28) >> 31) & (Polynomial >> 4)) ^ + (((crc << 27) >> 31) & (Polynomial >> 3)) ^ + (((crc << 26) >> 31) & (Polynomial >> 2)) ^ + (((crc << 25) >> 31) & (Polynomial >> 1)) ^ + (((crc << 24) >> 31) & Polynomial); + + crc = ((uint32_t)crc >> 8) ^ c; // convert to unsigned integer before right shift + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +/// compute CRC32 (Slicing-by-4 algorithm) +uint32_t crc32_4bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 
0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // process four bytes at once (Slicing-by-4) + while (length >= 4) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + crc = Crc32Lookup[0][ one & 0xFF] ^ + Crc32Lookup[1][(one>> 8) & 0xFF] ^ + Crc32Lookup[2][(one>>16) & 0xFF] ^ + Crc32Lookup[3][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + crc = Crc32Lookup[0][(one>>24) & 0xFF] ^ + Crc32Lookup[1][(one>>16) & 0xFF] ^ + Crc32Lookup[2][(one>> 8) & 0xFF] ^ + Crc32Lookup[3][ one & 0xFF]; +#endif + + length -= 4; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 3 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +/// compute CRC32 (Slicing-by-8 algorithm) +uint32_t crc32_8bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // process eight bytes at once (Slicing-by-8) + while (length >= 8) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + crc = Crc32Lookup[0][ two & 0xFF] ^ + Crc32Lookup[1][(two>> 8) & 0xFF] ^ + Crc32Lookup[2][(two>>16) & 0xFF] ^ + Crc32Lookup[3][(two>>24) & 0xFF] ^ + Crc32Lookup[4][ one & 0xFF] ^ + Crc32Lookup[5][(one>> 8) & 0xFF] ^ + Crc32Lookup[6][(one>>16) & 0xFF] ^ + Crc32Lookup[7][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + crc = Crc32Lookup[0][(two>>24) & 0xFF] ^ + Crc32Lookup[1][(two>>16) & 0xFF] ^ + Crc32Lookup[2][(two>> 8) & 0xFF] ^ + Crc32Lookup[3][ two & 0xFF] ^ + Crc32Lookup[4][(one>>24) & 0xFF] ^ + Crc32Lookup[5][(one>>16) & 0xFF] ^ + Crc32Lookup[6][(one>> 8) & 0xFF] ^ + Crc32Lookup[7][ one & 0xFF]; +#endif + + length -= 8; + } + + 
const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 7 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times +uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the inner for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 8 * Unroll; + + // process 4x eight bytes at once (Slicing-by-8) + while (length >= BytesAtOnce) + { + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + crc = Crc32Lookup[0][ two & 0xFF] ^ + Crc32Lookup[1][(two>> 8) & 0xFF] ^ + Crc32Lookup[2][(two>>16) & 0xFF] ^ + Crc32Lookup[3][(two>>24) & 0xFF] ^ + Crc32Lookup[4][ one & 0xFF] ^ + Crc32Lookup[5][(one>> 8) & 0xFF] ^ + Crc32Lookup[6][(one>>16) & 0xFF] ^ + Crc32Lookup[7][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + crc = Crc32Lookup[0][(two>>24) & 0xFF] ^ + Crc32Lookup[1][(two>>16) & 0xFF] ^ + Crc32Lookup[2][(two>> 8) & 0xFF] ^ + Crc32Lookup[3][ two & 0xFF] ^ + Crc32Lookup[4][(one>>24) & 0xFF] ^ + Crc32Lookup[5][(one>>16) & 0xFF] ^ + Crc32Lookup[6][(one>> 8) & 0xFF] ^ + Crc32Lookup[7][ one & 0xFF]; +#endif + + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 31 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +/// compute 
CRC32 (Slicing-by-16 algorithm) +uint32_t crc32_16bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the inner for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 16 * Unroll; + + while (length >= BytesAtOnce) + { + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][ four & 0xFF] ^ + Crc32Lookup[ 1][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 3][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 4][ three & 0xFF] ^ + Crc32Lookup[ 5][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 7][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 8][ two & 0xFF] ^ + Crc32Lookup[ 9][(two >> 8) & 0xFF] ^ + Crc32Lookup[10][(two >> 16) & 0xFF] ^ + Crc32Lookup[11][(two >> 24) & 0xFF] ^ + Crc32Lookup[12][ one & 0xFF] ^ + Crc32Lookup[13][(one >> 8) & 0xFF] ^ + Crc32Lookup[14][(one >> 16) & 0xFF] ^ + Crc32Lookup[15][(one >> 24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 1][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 3][ four & 0xFF] ^ + Crc32Lookup[ 4][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 5][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 7][ three & 0xFF] ^ + Crc32Lookup[ 8][(two >> 24) & 0xFF] ^ + Crc32Lookup[ 9][(two >> 16) & 0xFF] ^ + Crc32Lookup[10][(two >> 8) & 0xFF] ^ + Crc32Lookup[11][ two & 0xFF] ^ + Crc32Lookup[12][(one >> 24) & 0xFF] ^ + Crc32Lookup[13][(one >> 16) & 0xFF] ^ + Crc32Lookup[14][(one >> 
8) & 0xFF] ^ + Crc32Lookup[15][ one & 0xFF]; +#endif + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 63 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks) +uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32, size_t prefetchAhead) +{ + // CRC code is identical to crc32_16bytes (including unrolling), only added prefetching + // 256 bytes look-ahead seems to be the sweet spot on Core i7 CPUs + + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 16 * Unroll; + + while (length >= BytesAtOnce + prefetchAhead) + { + PREFETCH(((const char*) current) + prefetchAhead); + + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][ four & 0xFF] ^ + Crc32Lookup[ 1][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 3][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 4][ three & 0xFF] ^ + Crc32Lookup[ 5][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 7][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 8][ two & 0xFF] ^ + Crc32Lookup[ 9][(two >> 8) & 0xFF] ^ + Crc32Lookup[10][(two >> 16) & 0xFF] ^ + Crc32Lookup[11][(two >> 24) & 0xFF] ^ + Crc32Lookup[12][ one & 0xFF] ^ + Crc32Lookup[13][(one >> 8) & 0xFF] ^ + Crc32Lookup[14][(one >> 16) & 0xFF] ^ + Crc32Lookup[15][(one >> 24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; 
+ uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 1][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 3][ four & 0xFF] ^ + Crc32Lookup[ 4][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 5][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 7][ three & 0xFF] ^ + Crc32Lookup[ 8][(two >> 24) & 0xFF] ^ + Crc32Lookup[ 9][(two >> 16) & 0xFF] ^ + Crc32Lookup[10][(two >> 8) & 0xFF] ^ + Crc32Lookup[11][ two & 0xFF] ^ + Crc32Lookup[12][(one >> 24) & 0xFF] ^ + Crc32Lookup[13][(one >> 16) & 0xFF] ^ + Crc32Lookup[14][(one >> 8) & 0xFF] ^ + Crc32Lookup[15][ one & 0xFF]; +#endif + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 63 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs +uint32_t crc32_fast(const void* data, size_t length, uint32_t previousCrc32) +{ +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + return crc32_16bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + return crc32_8bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + return crc32_4bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + return crc32_1byte (data, length, previousCrc32); +#else + return crc32_halfbyte(data, length, previousCrc32); +#endif +} + + +/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA)) +uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) +{ + // based on Mark Adler's crc_combine from + // https://github.com/madler/pigz/blob/master/pigz.c + + // main idea: + // - if you have two equally-sized blocks A and B, + // then you can create a block C = A 
^ B + // which has the property crc(C) = crc(A) ^ crc(B) + // - if you append length(B) zeros to A and call it A' (think of it as AAAA000) + // and prepend length(A) zeros to B and call it B' (think of it as 0000BBB) + // then exists a C' = A' ^ B' + // - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X + // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B') + // - the trick is to compute crc(A') based on crc(A) + // and crc(B') based on crc(B) + // - since B' starts with many zeros, the crc of those initial zeros is still zero + // - that means crc(B') = crc(B) + // - unfortunately the trailing zeros of A' change the crc, so usually crc(A') != crc(A) + // - the following code is a fast algorithm to compute crc(A') + // - starting with crc(A) and appending length(B) zeros, needing just log2(length(B)) iterations + // - the details are explained by the original author at + // https://stackoverflow.com/questions/23122312/crc-calculation-of-a-mostly-static-data-stream/23126768 + // + // notes: + // - I squeezed everything into one function to keep global namespace clean (original code two helper functions) + // - most original comments are still in place, I added comments where these helper functions where made inline code + // - performance-wise there isn't any differenze to the original zlib/pigz code + + // degenerated case + if (lengthB == 0) + return crcA; + + /// CRC32 => 32 bits + const uint32_t CrcBits = 32; + + uint32_t odd [CrcBits]; // odd-power-of-two zeros operator + uint32_t even[CrcBits]; // even-power-of-two zeros operator + + // put operator for one zero bit in odd + odd[0] = Polynomial; // CRC-32 polynomial + for (uint32_t i = 1; i < CrcBits; i++) + odd[i] = 1 << (i - 1); + + // put operator for two zero bits in even + // same as gf2_matrix_square(even, odd); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = odd[i]; + even[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if 
(vec & 1) + even[i] ^= odd[j]; + } + // put operator for four zero bits in odd + // same as gf2_matrix_square(odd, even); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = even[i]; + odd[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + odd[i] ^= even[j]; + } + + // the following loop becomes much shorter if I keep swapping even and odd + uint32_t* a = even; + uint32_t* b = odd; + // apply secondLength zeros to firstCrc32 + for (; lengthB > 0; lengthB >>= 1) + { + // same as gf2_matrix_square(a, b); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = b[i]; + a[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + a[i] ^= b[j]; + } + + // apply zeros operator for this bit + if (lengthB & 1) + { + // same as firstCrc32 = gf2_matrix_times(a, firstCrc32); + uint32_t sum = 0; + for (int i = 0; crcA != 0; i++, crcA >>= 1) + if (crcA & 1) + sum ^= a[i]; + crcA = sum; + } + + // switch even and odd + uint32_t* t = a; a = b; b = t; + } + + // return combined crc + return crcA ^ crcB; +} + + +// ////////////////////////////////////////////////////////// +// constants + + +#ifndef NO_LUT +/// look-up table, already declared above +const uint32_t Crc32Lookup[MaxSlice][256] = +{ + //// same algorithm as crc32_bitwise + //for (int i = 0; i <= 0xFF; i++) + //{ + // uint32_t crc = i; + // for (int j = 0; j < 8; j++) + // crc = (crc >> 1) ^ ((crc & 1) * Polynomial); + // Crc32Lookup[0][i] = crc; + //} + //// ... and the following slicing-by-8 algorithm (from Intel): + //// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf + //// http://sourceforge.net/projects/slicing-by-8/ + //for (int slice = 1; slice < MaxSlice; slice++) + // Crc32Lookup[slice][i] = (Crc32Lookup[slice - 1][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[slice - 1][i] & 0xFF]; + { + // note: the first number of every second row corresponds to the half-byte look-up table ! 
+ 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 
0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + } + +#if defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) + // beyond this point only relevant for Slicing-by-4, Slicing-by-8 and Slicing-by-16 + ,{ + 0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7, + 0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF, + 0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496, + 0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E, + 0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265, + 0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D, + 0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034, + 0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C, + 
0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2, + 0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA, + 0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93, + 0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B, + 0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60, + 0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768, + 0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31, + 0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539, + 0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C, + 0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484, + 0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD, + 0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5, + 0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E, + 0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026, + 0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F, + 0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277, + 0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189, + 0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81, + 0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8, + 0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0, + 0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B, + 0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23, 
+ 0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A, + 0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72, + }, + + { + 0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685, + 0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D, + 0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5, + 0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D, + 0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065, + 0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD, + 0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315, + 0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD, + 0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45, + 0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD, + 0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835, + 0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D, + 0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5, + 0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D, + 0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5, + 0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D, + 0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05, + 0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD, + 0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75, + 
0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD, + 0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5, + 0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D, + 0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895, + 0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D, + 0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5, + 0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D, + 0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5, + 0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D, + 0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625, + 0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D, + 0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555, + 0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED, + }, + + { + 0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9, + 0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056, + 0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26, + 0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9, + 0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787, + 0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68, + 0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018, + 0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7, + 
0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084, + 0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B, + 0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B, + 0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4, + 0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA, + 0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755, + 0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825, + 0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA, + 0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82, + 0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D, + 0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D, + 0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2, + 0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC, + 0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953, + 0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623, + 0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC, + 0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF, + 0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50, + 0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120, + 0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF, + 0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981, + 0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E, 
+ 0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E, + 0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1, + } +#endif // defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) +#if defined (CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) + // beyond this point only relevant for Slicing-by-8 and Slicing-by-16 + ,{ + 0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10, + 0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1, + 0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92, + 0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053, + 0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314, + 0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5, + 0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496, + 0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57, + 0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459, + 0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98, + 0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB, + 0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A, + 0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D, + 0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C, + 0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF, + 0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E, + 
0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82, + 0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743, + 0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00, + 0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1, + 0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386, + 0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847, + 0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404, + 0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5, + 0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB, + 0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A, + 0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349, + 0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888, + 0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF, + 0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E, + 0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D, + 0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C, + }, + + { + 0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8, + 0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5, + 0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223, + 0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E, + 0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E, + 
0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3, + 0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715, + 0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578, + 0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4, + 0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9, + 0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F, + 0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22, + 0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2, + 0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F, + 0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79, + 0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14, + 0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460, + 0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D, + 0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB, + 0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496, + 0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156, + 0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B, + 0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD, + 0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0, + 0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C, + 0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61, + 0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97, 
+ 0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA, + 0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A, + 0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957, + 0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1, + 0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC, + }, + + { + 0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E, + 0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9, + 0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240, + 0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27, + 0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712, + 0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975, + 0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC, + 0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB, + 0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7, + 0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590, + 0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739, + 0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E, + 0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B, + 0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C, + 0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5, + 0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2, + 
0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C, + 0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B, + 0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2, + 0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5, + 0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0, + 0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387, + 0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E, + 0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49, + 0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105, + 0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62, + 0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB, + 0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC, + 0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899, + 0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE, + 0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457, + 0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30, + }, + + { + 0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919, + 0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC, + 0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832, + 0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387, + 0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F, + 
0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA, + 0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64, + 0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1, + 0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4, + 0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041, + 0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF, + 0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A, + 0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2, + 0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217, + 0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889, + 0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C, + 0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3, + 0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776, + 0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8, + 0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D, + 0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95, + 0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520, + 0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE, + 0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B, + 0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E, + 0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B, + 0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05, 
+ 0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0, + 0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78, + 0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD, + 0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53, + 0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6, + } +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 || CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + // beyond this point only relevant for Slicing-by-16 + ,{ + 0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9, + 0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1, + 0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8, + 0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0, + 0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A, + 0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72, + 0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B, + 0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03, + 0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE, + 0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6, + 0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF, + 0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7, + 0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D, + 0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75, + 
0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C, + 0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04, + 0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86, + 0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E, + 0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7, + 0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF, + 0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25, + 0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D, + 0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54, + 0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C, + 0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81, + 0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99, + 0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0, + 0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8, + 0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22, + 0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A, + 0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53, + 0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B, + }, + + { + 0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79, + 0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D, + 0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91, + 
0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65, + 0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9, + 0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D, + 0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941, + 0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5, + 0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9, + 0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D, + 0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31, + 0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5, + 0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09, + 0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD, + 0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1, + 0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15, + 0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278, + 0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C, + 0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390, + 0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364, + 0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8, + 0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C, + 0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040, + 0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4, + 0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8, 
+ 0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C, + 0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430, + 0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4, + 0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608, + 0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC, + 0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0, + 0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714, + }, + + { + 0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583, + 0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6, + 0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148, + 0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D, + 0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54, + 0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11, + 0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F, + 0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA, + 0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C, + 0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29, + 0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7, + 0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2, + 0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB, + 0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE, + 
0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770, + 0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635, + 0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C, + 0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159, + 0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7, + 0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592, + 0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB, + 0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E, + 0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00, + 0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45, + 0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3, + 0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6, + 0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38, + 0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D, + 0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624, + 0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761, + 0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF, + 0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA, + }, + + { + 0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F, + 0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71, + 0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473, + 
0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D, + 0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277, + 0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489, + 0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B, + 0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975, + 0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F, + 0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881, + 0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383, + 0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D, + 0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587, + 0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379, + 0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B, + 0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85, + 0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F, + 0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091, + 0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93, + 0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D, + 0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97, + 0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69, + 0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B, + 0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695, + 0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F, 
+ 0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761, + 0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63, + 0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D, + 0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67, + 0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99, + 0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B, + 0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165, + }, + + { + 0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658, + 0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535, + 0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082, + 0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF, + 0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD, + 0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0, + 0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77, + 0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A, + 0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3, + 0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E, + 0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129, + 0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244, + 0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06, + 0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B, + 
0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC, + 0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1, + 0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F, + 0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022, + 0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595, + 0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8, + 0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA, + 0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7, + 0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60, + 0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D, + 0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4, + 0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189, + 0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E, + 0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753, + 0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911, + 0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C, + 0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB, + 0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6, + }, + + { + 0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0, + 0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6, + 0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC, + 
0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A, + 0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218, + 0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E, + 0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74, + 0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042, + 0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31, + 0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307, + 0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D, + 0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B, + 0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9, + 0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF, + 0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985, + 0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3, + 0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522, + 0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14, + 0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E, + 0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778, + 0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA, + 0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC, + 0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196, + 0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0, + 0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3, 
+ 0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5, + 0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF, + 0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089, + 0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B, + 0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D, + 0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667, + 0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851, + }, + + { + 0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A, + 0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D, + 0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055, + 0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2, + 0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184, + 0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03, + 0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB, + 0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C, + 0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467, + 0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0, + 0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28, + 0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF, + 0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9, + 0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E, + 
0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6, + 0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931, + 0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1, + 0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326, + 0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE, + 0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69, + 0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F, + 0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8, + 0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70, + 0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7, + 0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC, + 0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B, + 0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93, + 0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714, + 0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42, + 0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5, + 0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D, + 0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A, + }, + + { + 0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875, + 0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A, + 0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB, + 
0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4, + 0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308, + 0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47, + 0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96, + 0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9, + 0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F, + 0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0, + 0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011, + 0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E, + 0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2, + 0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD, + 0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C, + 0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423, + 0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581, + 0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE, + 0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F, + 0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450, + 0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC, + 0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3, + 0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062, + 0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D, + 0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B, 
+ 0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34, + 0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5, + 0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA, + 0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806, + 0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749, + 0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698, + 0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7, + } +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +}; +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/file_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/file_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..398b9e97c0bb68bb2fe2e3e223c641b7dd114acb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/file_adapter.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include "caffe2/serialize/istream_adapter.h" +#include "caffe2/serialize/read_adapter_interface.h" + +namespace caffe2 { +namespace serialize { + +class TORCH_API FileAdapter final : public ReadAdapterInterface { + public: + C10_DISABLE_COPY_AND_ASSIGN(FileAdapter); + explicit FileAdapter(const std::string& file_name); + size_t size() const override; + size_t read(uint64_t pos, void* buf, size_t n, const char* what = "") + const override; + ~FileAdapter() override; + + private: + // An RAII Wrapper for a 
FILE pointer. Closes on destruction. + struct RAIIFile { + FILE* fp_; + explicit RAIIFile(const std::string& file_name); + ~RAIIFile(); + }; + + RAIIFile file_; + // The size of the opened file in bytes + uint64_t size_; +}; + +} // namespace serialize +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/in_memory_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/in_memory_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..394898e5ed08ec4c62c8868ae12cf846ad7bf22f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/in_memory_adapter.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace caffe2 { +namespace serialize { + +class MemoryReadAdapter final : public caffe2::serialize::ReadAdapterInterface { + public: + explicit MemoryReadAdapter(const void* data, off_t size) + : data_(data), size_(size) {} + + size_t size() const override { + return size_; + } + + size_t read(uint64_t pos, void* buf, size_t n, const char* what = "") + const override { + (void)what; + memcpy(buf, (int8_t*)(data_) + pos, n); + return n; + } + + private: + const void* data_; + off_t size_; +}; + +} // namespace serialize +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/inline_container.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/inline_container.h new file mode 100644 index 0000000000000000000000000000000000000000..ef3436b6fece5e661fa4977cafb8d8534f2235fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/inline_container.h @@ -0,0 +1,315 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "caffe2/serialize/istream_adapter.h" +#include "caffe2/serialize/read_adapter_interface.h" +#include "caffe2/serialize/versions.h" + +extern "C" { +typedef struct mz_zip_archive mz_zip_archive; +} + +// PyTorch containers are a special zip archive with the following layout +// archive_name.zip contains: +// archive_name/ +// version # a file with a single decimal number written in ascii, +// # used to establish the version of the archive format +// model.json # overall model description, this is a json output of +// # ModelDef from torch.proto +// # the following names are by convention only, model.json will +// # refer to these files by full names +// tensors/ +// 0 # flat storage for tensor data, meta-data about shapes, etc. is +// # in model.json +// 1 +// ... +// # code entries will only exist for modules that have methods attached +// code/ +// archive_name.py # serialized torch script code (python syntax, using +// PythonPrint) archive_name_my_submodule.py # submodules have separate +// files +// +// The PyTorchStreamWriter also ensures additional useful properties for these +// files +// 1. All files are stored uncompressed. +// 2. 
All files in the archive are aligned to 64 byte boundaries such that +// it is possible to mmap the entire file and get an aligned pointer to +// tensor data. +// 3. We universally write in ZIP64 format for consistency. + +// The PyTorchStreamReader also provides additional properties: +// 1. It can read zip files that are created with common +// zip tools. This means that even though our writer doesn't compress files, +// the reader can still read files that were compressed. +// 2. It provides a getRecordOffset function which returns the offset into the +// raw file where file data lives. If the file was written with +// PyTorchStreamWriter it is guaranteed to be 64 byte aligned. + +// PyTorchReader/Writer handle checking the version number on the archive format +// and ensure that all files are written to a archive_name directory so they +// unzip cleanly. + +// When developing this format we want to pay particular attention to the +// following use cases: +// +// -- Reading -- +// 1) Reading with full random access +// a) Reading with file api's such as fread() +// b) mmaping the file and jumping around the mapped region +// 2) Reading with 1-pass sequential access +// -> A reader will need to build up a data structure of parsed structures +// as it reads +// +// -- Writing -- +// 1) Writing with full random access +// 2) Writing with 1-pass sequential access +// -> We must take care not to require updating values that have already +// been written. We place the variable-length index at the end and do +// not put any index into the header to fulfill this constraint. + +// The model.json, which contains all the metadata information, +// should be written as the last file. One reason is that the size of tensor +// data is usually stable. As long as the shape and type of the tensor do not +// change, the size of the data won't change. 
On the other sied, the size of the +// serialized model is likely to change, so we store it as the last record, and +// we don't need to move previous records when updating the model data. + +// The zip format is sufficiently flexible to handle the above use-case. +// it puts its central directory at the end of the archive and we write +// model.json as the last file when writing after we have accumulated all +// other information. + +namespace caffe2 { +namespace serialize { + +static constexpr const char* kSerializationIdRecordName = + ".data/serialization_id"; + +struct MzZipReaderIterWrapper; + +class TORCH_API ChunkRecordIterator { + public: + ~ChunkRecordIterator(); + + // Read at most `chunkSize` into `buf`. Return the number of actual bytes + // read. + size_t next(void* buf); + size_t recordSize() const { + return recordSize_; + } + + private: + ChunkRecordIterator( + size_t recordSize, + size_t chunkSize, + std::unique_ptr iter); + + const size_t recordSize_; + const size_t chunkSize_; + size_t offset_; + std::unique_ptr iter_; + + friend class PyTorchStreamReader; +}; + +class TORCH_API PyTorchStreamReader final { + public: + explicit PyTorchStreamReader(const std::string& file_name); + explicit PyTorchStreamReader(std::istream* in); + explicit PyTorchStreamReader(std::shared_ptr in); + + // return dataptr, size + // set allocator to override default cpu allocator + std::tuple getRecord( + const std::string& name, + std::optional allocator = std::nullopt); + // multi-thread getRecord + std::tuple getRecord( + const std::string& name, + std::vector>& additionalReaders, + std::optional allocator = std::nullopt); + // inplace memory writing + size_t getRecord(const std::string& name, void* dst, size_t n); + // inplace memory writing, multi-threads. + // When additionalReaders is empty, the default behavior is call + // getRecord(name, dst, n) with default reader This approach can be used for + // reading large tensors. 
+ size_t getRecord( + const std::string& name, + void* dst, + size_t n, + std::vector>& additionalReaders); + size_t getRecord( + const std::string& name, + void* dst, + size_t n, + size_t chunk_size, + void* buf, + const std::function& memcpy_func = + nullptr); + + // Concurrent reading records with multiple readers. + // additionalReaders are additional clients to access the underlying record at + // different offsets and write to different trunks of buffers. If the overall + // size of the tensor is 10, and size of additionalReader is 2. The default + // thread will read [0,4), the additional reader will read [4,8). The default + // reader will read [8,10). The default reader will write to buffer[0,4), the + // additional reader will write to buffer[4,8), the additional reader will + // write to buffer[8,10). When additionalReaders is empty, the default + // behavior is call getRecord(name) with default reader This approach can be + // used for reading large tensors. + size_t getRecordMultiReaders( + const std::string& name, + std::vector>& additionalReaders, + void* dst, + size_t n); + + size_t getRecordSize(const std::string& name); + size_t getRecordHeaderOffset(const std::string& name); + size_t getRecordOffset(const std::string& name); + size_t getRecordOffsetNoRead( + size_t cursor, + std::string filename, + size_t size, + uint64_t alignment); + bool hasRecord(const std::string& name); + std::vector getAllRecords(); + + ChunkRecordIterator createChunkReaderIter( + const std::string& name, + const size_t recordSize, + const size_t chunkSize); + + ~PyTorchStreamReader(); + uint64_t version() const { + return version_; + } + const std::string& serializationId() { + return serialization_id_; + } + + void setShouldLoadDebugSymbol(bool should_load_debug_symbol) { + load_debug_symbol_ = should_load_debug_symbol; + } + void setAdditionalReaderSizeThreshold(const size_t& size) { + additional_reader_size_threshold_ = size; + } + + private: + void init(); + size_t 
read(uint64_t pos, char* buf, size_t n); + void valid(const char* what, const char* info = ""); + size_t getRecordID(const std::string& name); + + friend size_t + istream_read_func(void* pOpaque, uint64_t file_ofs, void* pBuf, size_t n); + std::unique_ptr ar_; + std::string archive_name_; + std::string archive_name_plus_slash_; + std::shared_ptr in_; + int64_t version_; + std::mutex reader_lock_; + bool load_debug_symbol_ = true; + std::string serialization_id_; + size_t additional_reader_size_threshold_; +}; + +class TORCH_API PyTorchStreamWriter final { + public: + explicit PyTorchStreamWriter( + const std::string& archive_name, + bool compute_crc32 = true, + uint64_t alignment = 64); + explicit PyTorchStreamWriter( + const std::function writer_func, + bool compute_crc32 = true, + uint64_t alignment = 64); + + void setMinVersion(const uint64_t version); + + void writeRecord( + const std::string& name, + const void* data, + size_t size, + bool compress = false); + void writeEndOfFile(); + + const std::unordered_set& getAllWrittenRecords(); + + bool finalized() const { + return finalized_; + } + + const std::string& archiveName() { + return archive_name_; + } + + const std::string& serializationId() { + return serialization_id_; + } + + ~PyTorchStreamWriter(); + + private: + void setup(const std::string& file_name); + void valid(const char* what, const char* info = ""); + void writeSerializationId(); + size_t current_pos_ = 0; + std::unordered_set files_written_; + std::unique_ptr ar_; + std::string archive_name_; + std::string archive_name_plus_slash_; + std::string padding_; + std::ofstream file_stream_; + std::function writer_func_; + uint64_t combined_uncomp_crc32_ = 0; + std::string serialization_id_; + bool compute_crc32_; + uint64_t alignment_; + + // This number will be updated when the model has operators + // that have valid upgraders. 
+ uint64_t version_ = kMinProducedFileFormatVersion; + bool finalized_ = false; + bool err_seen_ = false; + friend size_t ostream_write_func( + void* pOpaque, + uint64_t file_ofs, + const void* pBuf, + size_t n); +}; + +namespace detail { + +// Returns a record to be appended to the local user extra data entry in order +// to make data beginning aligned at kFieldAlignment bytes boundary. +size_t getPadding( + size_t cursor, + size_t filename_size, + size_t size, + std::string& padding_buf, + uint64_t alignment); + +std::tuple +getOffset(size_t cursor, size_t filename_size, size_t size, uint64_t alignment); + +} // namespace detail + +} // namespace serialize +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/istream_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/istream_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..0e205be7f1ceef1ffc92f686d1cd464f60899ae3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/istream_adapter.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include "c10/macros/Macros.h" +#include "caffe2/serialize/read_adapter_interface.h" + +namespace caffe2 { +namespace serialize { + +// this is a reader implemented by std::istream +class TORCH_API IStreamAdapter final : public ReadAdapterInterface { + public: + C10_DISABLE_COPY_AND_ASSIGN(IStreamAdapter); + explicit IStreamAdapter(std::istream* istream); + size_t size() const override; + size_t read(uint64_t pos, void* buf, size_t n, const char* what = "") + const override; + ~IStreamAdapter() override; + + private: + 
std::istream* istream_; + void validate(const char* what) const; +}; + +} // namespace serialize +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/read_adapter_interface.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/read_adapter_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..bc4b4505f4b786a0c8088e7ecc2253b877a20298 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/read_adapter_interface.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include "c10/macros/Macros.h" + +namespace caffe2 { +namespace serialize { + +// this is the interface for the (file/stream/memory) reader in +// PyTorchStreamReader. with this interface, we can extend the support +// besides standard istream +class TORCH_API ReadAdapterInterface { + public: + virtual size_t size() const = 0; + virtual size_t read(uint64_t pos, void* buf, size_t n, const char* what = "") + const = 0; + virtual ~ReadAdapterInterface(); +}; + +} // namespace serialize +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/versions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/versions.h new file mode 100644 index 0000000000000000000000000000000000000000..f21f4db27caa05bf69b3f05fdcf93ccf241d0944 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/versions.h @@ -0,0 +1,138 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace caffe2 { +namespace serialize { + +constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; + +constexpr uint64_t kMaxSupportedFileFormatVersion = 0xAL; + +// Versions (i.e. why was the version number bumped?) + +// Note [Dynamic Versions and torch.jit.save vs. torch.save] +// +// Our versioning scheme has a "produced file format version" which +// describes how an archive is to be read. The version written in an archive +// is at least this current produced file format version, but may be greater +// if it includes certain symbols. We refer to these conditional versions +// as "dynamic," since they are identified at runtime. +// +// Dynamic versioning is useful when an operator's semantics are updated. +// When using torch.jit.save we want those semantics to be preserved. If +// we bumped the produced file format version on every change, however, +// then older versions of PyTorch couldn't read even simple archives, like +// a single tensor, from newer versions of PyTorch. Instead, we +// assign dynamic versions to these changes that override the +// produced file format version as needed. That is, when the semantics +// of torch.div changed it was assigned dynamic version 4, and when +// torch.jit.saving modules that use torch.div those archives also have +// (at least) version 4. 
This prevents earlier versions of PyTorch +// from accidentally performing the wrong kind of division. Modules +// that don't use torch.div or other operators with dynamic versions +// can write the produced file format version, and these programs will +// run as expected on earlier versions of PyTorch. +// +// While torch.jit.save attempts to preserve operator semantics, +// torch.save does not. torch.save is analogous to pickling Python, so +// a function that uses torch.div will have different behavior if torch.saved +// and torch.loaded across PyTorch versions. From a technical perspective, +// torch.save ignores dynamic versioning. + +// 1. Initial version +// 2. Removed op_version_set version numbers +// 3. Added type tags to pickle serialization of container types +// 4. (Dynamic) Stopped integer division using torch.div +// (a versioned symbol preserves the historic behavior of versions 1--3) +// 5. (Dynamic) Stops torch.full inferring a floating point dtype +// when given bool or integer fill values. +// 6. Write version string to `./data/version` instead of `version`. + +// [12/15/2021] +// kProducedFileFormatVersion is set to 7 from 3 due to a different +// interpretation of what file format version is. +// Whenever there is new upgrader introduced, +// this number should be bumped. +// The reasons that version is bumped in the past: +// 1. aten::div is changed at version 4 +// 2. aten::full is changed at version 5 +// 3. torch.package uses version 6 +// 4. Introduce new upgrader design and set the version number to 7 +// mark this change +// -------------------------------------------------- +// We describe new operator version bump reasons here: +// 1) [01/24/2022] +// We bump the version number to 8 to update aten::linspace +// and aten::linspace.out to error out when steps is not +// provided. 
(see: https://github.com/pytorch/pytorch/issues/55951) +// 2) [01/30/2022] +// Bump the version number to 9 to update aten::logspace and +// and aten::logspace.out to error out when steps is not +// provided. (see: https://github.com/pytorch/pytorch/issues/55951) +// 3) [02/11/2022] +// Bump the version number to 10 to update aten::gelu and +// and aten::gelu.out to support the new approximate kwarg. +// (see: https://github.com/pytorch/pytorch/pull/61439) +constexpr uint64_t kProducedFileFormatVersion = 0xAL; + +// Absolute minimum version we will write packages. This +// means that every package from now on will always be +// greater than this number. +constexpr uint64_t kMinProducedFileFormatVersion = 0x3L; + +// The version we write when the archive contains bytecode. +// It must be higher or eq to kProducedFileFormatVersion. +// Because torchscript changes is likely introduce bytecode change. +// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion +// should be increased too. The relationship is: +// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion +// >= kProducedFileFormatVersion +// If a format change is forward compatible (still readable by older +// executables), we will not increment the version number, to minimize the +// risk of breaking existing clients. TODO: A better way would be to allow +// the caller that creates a model to specify a maximum version that its +// clients can accept. +// Versions: +// 0x1L: Initial version +// 0x2L: (Comment missing) +// 0x3L: (Comment missing) +// 0x4L: (update) Added schema to function tuple. Forward-compatible change. +// 0x5L: (update) Update bytecode is sharing constant tensor files from +// torchscript, and only serialize extra tensors that are not in the +// torchscript constant table. 
Also update tensor storage schema adapting to +// the unify format, the root key of tensor storage is updated from {index} to +// {the_pointer_value_the_tensor.storage}, for example: +// `140245072983168.storage` Forward-compatibility change. +// 0x6L: Implicit opereator versioning using number of specified argument. +// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for +// details. +// 0x7L: Enable support for operators with default arguments plus out +// arguments. Refer. See https://github.com/pytorch/pytorch/pull/63651 for +// details. +// 0x8L: Emit promoted operators as instructions. See +// https://github.com/pytorch/pytorch/pull/71662 for details. +// 0x9L: Change serialization format from pickle to format This version is to +// serve migration. v8 pickle and v9 flatbuffer are the same. Refer to the +// summary of https://github.com/pytorch/pytorch/pull/75201 for more details. +constexpr uint64_t kProducedBytecodeVersion = 0x8L; + +// static_assert( +// kProducedBytecodeVersion >= kProducedFileFormatVersion, +// "kProducedBytecodeVersion must be higher or equal to +// kProducedFileFormatVersion."); + +// Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion +// for limited backward/forward compatibility support of bytecode. If +// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion +// (in loader), we should support this model_version. For example, we provide a +// wrapper to handle an updated operator. +constexpr uint64_t kMinSupportedBytecodeVersion = 0x4L; +constexpr uint64_t kMaxSupportedBytecodeVersion = 0x9L; + +} // namespace serialize +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/fixed_divisor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/fixed_divisor.h new file mode 100644 index 0000000000000000000000000000000000000000..8041a2723c8603b05b26956126e37eff436ac905 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/fixed_divisor.h @@ -0,0 +1,137 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef CAFFE2_UTILS_FIXED_DIVISOR_H_ +#define CAFFE2_UTILS_FIXED_DIVISOR_H_ + +#include +#include +#include + +// See Note [hip-clang differences to hcc] + +#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__) || \ + (defined(__clang__) && defined(__CUDA__)) +#define FIXED_DIVISOR_DECL inline __host__ __device__ +#else +#define FIXED_DIVISOR_DECL inline +#endif + +namespace caffe2 { + +// Utility class for quickly calculating quotients and remainders for +// a known integer divisor +template +class FixedDivisor {}; + +// Works for any positive divisor, 1 to INT_MAX. One 64-bit +// multiplication and one 64-bit shift is used to calculate the +// result. +template <> +class FixedDivisor { + public: + FixedDivisor() = default; + + explicit FixedDivisor(const std::int32_t d) : d_(d) { +#if !defined(USE_ROCM) + CalcSignedMagic(); +#endif // USE_ROCM + } + + FIXED_DIVISOR_DECL std::int32_t d() const { + return d_; + } + +#if !defined(USE_ROCM) + FIXED_DIVISOR_DECL std::uint64_t magic() const { + return magic_; + } + + FIXED_DIVISOR_DECL int shift() const { + return shift_; + } +#endif // USE_ROCM + + /// Calculates `q = n / d`. 
+ FIXED_DIVISOR_DECL std::int32_t Div(const std::int32_t n) const { +#if defined(USE_ROCM) + return n / d_; +#else // USE_ROCM + // In lieu of a mulhi instruction being available, perform the + // work in uint64 + return (int32_t)((magic_ * (uint64_t)n) >> shift_); +#endif // USE_ROCM + } + + /// Calculates `r = n % d`. + FIXED_DIVISOR_DECL std::int32_t Mod(const std::int32_t n) const { + return n - d_ * Div(n); + } + + /// Calculates `q = n / d` and `r = n % d` together. + FIXED_DIVISOR_DECL void + DivMod(const std::int32_t n, std::int32_t* q, int32_t* r) const { + *q = Div(n); + *r = n - d_ * *q; + } + + private: +#if !defined(USE_ROCM) + // Calculates magic multiplicative value and shift amount for calculating `q = + // n / d` for signed 32-bit integers. + // Implementation taken from Hacker's Delight section 10. + void CalcSignedMagic() { + if (d_ == 1) { + magic_ = UINT64_C(0x1) << 32; + shift_ = 32; + return; + } + + const std::uint32_t two31 = UINT32_C(0x80000000); + const std::uint32_t ad = std::abs(d_); + const std::uint32_t t = two31 + ((uint32_t)d_ >> 31); + const std::uint32_t anc = t - 1 - t % ad; // Absolute value of nc. + std::uint32_t p = 31; // Init. p. + std::uint32_t q1 = two31 / anc; // Init. q1 = 2**p/|nc|. + std::uint32_t r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|). + std::uint32_t q2 = two31 / ad; // Init. q2 = 2**p/|d|. + std::uint32_t r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|). + std::uint32_t delta = 0; + do { + ++p; + q1 <<= 1; // Update q1 = 2**p/|nc|. + r1 <<= 1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + ++q1; // comparison here). + r1 -= anc; + } + q2 <<= 1; // Update q2 = 2**p/|d|. + r2 <<= 1; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + ++q2; // comparison here). 
+ r2 -= ad; + } + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + std::int32_t magic = q2 + 1; + if (d_ < 0) { + magic = -magic; + } + shift_ = p; + magic_ = (std::uint64_t)(std::uint32_t)magic; + } +#endif // USE_ROCM + + std::int32_t d_ = 1; + +#if !defined(USE_ROCM) + std::uint64_t magic_; + int shift_; +#endif // USE_ROCM +}; + +} // namespace caffe2 + +#endif // CAFFE2_UTILS_FIXED_DIVISOR_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/proto_wrap.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/proto_wrap.h new file mode 100644 index 0000000000000000000000000000000000000000..29b58072e159b1ca826fc4b6d8631e7590943969 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/proto_wrap.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef CAFFE2_UTILS_PROTO_WRAP_H_ +#define CAFFE2_UTILS_PROTO_WRAP_H_ + +#include + +namespace caffe2 { + +// A wrapper function to shut down protobuf library (this is needed in ASAN +// testing and valgrind cases to avoid protobuf appearing to "leak" memory). +TORCH_API void ShutdownProtobufLibrary(); + +// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() +// function used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. +TORCH_API const ::std::string& GetEmptyStringAlreadyInited(); +} // namespace caffe2 + +namespace ONNX_NAMESPACE { + +// ONNX wrapper functions for protobuf's GetEmptyStringAlreadyInited() function +// used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. 
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited(); + +} // namespace ONNX_NAMESPACE + +namespace torch { + +// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() +// function used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. +TORCH_API const ::std::string& GetEmptyStringAlreadyInited(); + +void ShutdownProtobufLibrary(); + +} // namespace torch +#endif // CAFFE2_UTILS_PROTO_WRAP_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/string_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/string_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f8d2d49efdb0ca402a6e6b60c0c6de7db9249684 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/string_utils.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include + +namespace caffe2 { + +TORCH_API std::vector +split(char separator, const std::string& string, bool ignore_empty = false); + +TORCH_API std::string trim(const std::string& str); + +TORCH_API size_t editDistance( + const std::string& s1, + const std::string& s2, + size_t max_distance = 0); + +TORCH_API inline bool StartsWith( + const std::string& str, + const std::string& prefix) { + return str.length() >= prefix.length() && + std::mismatch(prefix.begin(), prefix.end(), str.begin()).first == + prefix.end(); +} + +TORCH_API inline bool EndsWith( + const std::string& full, + const std::string& ending) { + if (full.length() >= ending.length()) { + return ( + 0 == + full.compare(full.length() - ending.length(), ending.length(), 
ending)); + } else { + return false; + } +} + +TORCH_API int32_t editDistanceHelper( + const char* s1, + size_t s1_len, + const char* s2, + size_t s2_len, + std::vector& current, + std::vector& previous, + std::vector& previous1, + size_t max_distance); +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPool.h new file mode 100644 index 0000000000000000000000000000000000000000..a3769ec59ebdc60be60a685779aa4c3903e0f721 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPool.h @@ -0,0 +1,84 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef CAFFE2_UTILS_THREADPOOL_H_ +#define CAFFE2_UTILS_THREADPOOL_H_ + +#include "ThreadPoolCommon.h" + +#include +#include +#include +#include +#include + +#include "c10/util/Flags.h" +#include "caffe2/core/common.h" + +// +// A work-stealing threadpool loosely based off of pthreadpool +// + +namespace caffe2 { + +struct Task; +class WorkersPool; + +constexpr size_t kCacheLineSize = 64; + +// A threadpool with the given number of threads. +// NOTE: the kCacheLineSize alignment is present only for cache +// performance, and is not strictly enforced (for example, when +// the object is created on the heap). Thus, in order to avoid +// misaligned intrinsics, no SSE instructions shall be involved in +// the ThreadPool implementation. +// Note: alignas is disabled because some compilers do not deal with +// TORCH_API and alignas annotations at the same time. 
+class TORCH_API /*alignas(kCacheLineSize)*/ ThreadPool { + public: + static ThreadPool* createThreadPool(int numThreads); + static std::unique_ptr defaultThreadPool(); + virtual ~ThreadPool() = default; + // Returns the number of threads currently in use + virtual int getNumThreads() const = 0; + virtual void setNumThreads(size_t numThreads) = 0; + + // Sets the minimum work size (range) for which to invoke the + // threadpool; work sizes smaller than this will just be run on the + // main (calling) thread + void setMinWorkSize(size_t size) { + std::lock_guard guard(executionMutex_); + minWorkSize_ = size; + } + + size_t getMinWorkSize() const { + return minWorkSize_; + } + virtual void run(const std::function& fn, size_t range) = 0; + + // Run an arbitrary function in a thread-safe manner accessing the Workers + // Pool + virtual void withPool(const std::function& fn) = 0; + + protected: + static size_t defaultNumThreads_; + mutable std::mutex executionMutex_; + size_t minWorkSize_; +}; + +size_t getDefaultNumThreads(); +} // namespace caffe2 + +C10_DECLARE_bool(caffe2_threadpool_force_inline); + +// Whether or not threadpool caps apply to Android +C10_DECLARE_int(caffe2_threadpool_android_cap); + +// Whether or not threadpool caps apply to iOS and MacOS +C10_DECLARE_int(caffe2_threadpool_ios_cap); +C10_DECLARE_int(caffe2_threadpool_macos_cap); + +C10_DECLARE_int(pthreadpool_size); +#endif // CAFFE2_UTILS_THREADPOOL_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPoolCommon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPoolCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..0bd04aa595c383ea8c1e0cb833e81e5478bc879b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPoolCommon.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef CAFFE2_UTILS_THREADPOOL_COMMON_H_ +#define CAFFE2_UTILS_THREADPOOL_COMMON_H_ + +#ifdef __APPLE__ +#include +#endif + +// caffe2 depends upon NNPACK, which depends upon this threadpool, so +// unfortunately we can't reference core/common.h here + +// This is copied from core/common.h's definition of C10_MOBILE +// Define enabled when building for iOS or Android devices +#if defined(__ANDROID__) +#define C10_ANDROID 1 +#elif (defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define C10_IOS 1 +#endif // ANDROID / IOS + +#endif // CAFFE2_UTILS_THREADPOOL_COMMON_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/WorkersPool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/WorkersPool.h new file mode 100644 index 0000000000000000000000000000000000000000..a4adbac9b3c1b3a9672b511cb24dda1c48a4622e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/WorkersPool.h @@ -0,0 +1,383 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include "c10/util/thread_name.h" +#include +#include + +#if defined(_MSC_VER) +#include +#endif + +namespace caffe2 { + +// Uses code derived from gemmlowp, +// https://github.com/google/gemmlowp/blob/6c91e1ed0c2eff1182d804310b92911fe9c18019/internal/multi_thread_gemm.h +// Changes: +// - allocation-free execute() +// - Use RAII where possible. +// - Run the first task on the main thread (since that is the largest task). +// - removed custom allocator. +// - Removed some ifdef's +// - cache-line align Worker. +// - use std::atomic instead of volatile and custom barriers. +// - use std::mutex/std::condition_variable instead of raw pthreads. + +constexpr size_t kGEMMLOWPCacheLineSize = 64; + +template +struct AllocAligned { + // Allocate a T aligned at an `align` byte address + template + static T* alloc(Args&&... 
args) { + void* p = nullptr; + +#if defined(__ANDROID__) + p = memalign(kGEMMLOWPCacheLineSize, sizeof(T)); +#elif defined(_MSC_VER) + p = _aligned_malloc(sizeof(T), kGEMMLOWPCacheLineSize); +#else + auto res = posix_memalign(&p, kGEMMLOWPCacheLineSize, sizeof(T)); + (void)res; +#endif + + if (p) { + return new (p) T(std::forward(args)...); + } + + return nullptr; + } + + // Free a T previously allocated via AllocAligned::alloc() + static void release(T* p) { + if (p) { + p->~T(); +#if defined(_MSC_VER) + _aligned_free((void*)p); +#else + free((void*)p); +#endif + } + } +}; + +// Deleter object for unique_ptr for an aligned object +template +struct AlignedDeleter { + void operator()(T* p) const { AllocAligned::release(p); } +}; + +// make_unique that guarantees alignment +template +struct MakeAligned { + template + static std::unique_ptr> make(Args&&... args) { + return std::unique_ptr>( + AllocAligned::alloc(std::forward(args)...)); + } +}; + +const int kMaxBusyWaitNOPs = 32 * 1000 * 1000; + +#if defined(_MSC_VER) +#define GEMMLOWP_NOP __nop(); +#else +#define GEMMLOWP_NOP "nop\n" +#endif + +#define GEMMLOWP_STRING_CONCAT_4(X) X X X X +#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP) +#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4) +#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16) + +inline int Do256NOPs() { +#if defined(_MSC_VER) + GEMMLOWP_NOP64; +#else + asm volatile(GEMMLOWP_NOP64); +#endif + return 64; +} + +#undef GEMMLOWP_STRING_CONCAT_4 +#undef GEMMLOWP_NOP256 +#undef GEMMLOWP_NOP64 +#undef GEMMLOWP_NOP16 +#undef GEMMLOWP_NOP4 +#undef GEMMLOWP_NOP + +// Waits until *var != initial_value. +// +// Returns the new value of *var. The guarantee here is that +// the return value is different from initial_value, and that that +// new value has been taken by *var at some point during the +// execution of this function. 
There is no guarantee that this is +// still the value of *var when this function returns, since *var is +// not assumed to be guarded by any lock. +// +// First does some busy-waiting for a fixed number of no-op cycles, +// then falls back to passive waiting for the given condvar, guarded +// by the given mutex. +// +// The idea of doing some initial busy-waiting is to help get +// better and more consistent multithreading benefits for small GEMM sizes. +// Busy-waiting help ensuring that if we need to wake up soon after having +// started waiting, then we can wake up quickly (as opposed to, say, +// having to wait to be scheduled again by the OS). On the other hand, +// we must still eventually revert to passive waiting for longer waits +// (e.g. worker threads having finished a GEMM and waiting until the next GEMM) +// so as to avoid permanently spinning. +// +template +T WaitForVariableChange(std::atomic* var, + T initial_value, + std::condition_variable* cond, + std::mutex* mutex) { + // If we are on a platform that supports it, spin for some time. + { + int nops = 0; + // First, trivial case where the variable already changed value. + T new_value = var->load(std::memory_order_relaxed); + if (new_value != initial_value) { + std::atomic_thread_fence(std::memory_order_acquire); + return new_value; + } + // Then try busy-waiting. + while (nops < kMaxBusyWaitNOPs) { + nops += Do256NOPs(); + new_value = var->load(std::memory_order_relaxed); + if (new_value != initial_value) { + std::atomic_thread_fence(std::memory_order_acquire); + return new_value; + } + } + } + + // Finally, do real passive waiting. + { + std::unique_lock g(*mutex); + T new_value = var->load(std::memory_order_relaxed); + // Handle spurious wakeups. 
+ cond->wait(g, [&]() { + new_value = var->load(std::memory_order_relaxed); + return new_value != initial_value; + }); + TORCH_DCHECK_NE(static_cast(new_value), static_cast(initial_value)); + return new_value; + } +} + +// A BlockingCounter lets one thread to wait for N events to occur. +// This is how the master thread waits for all the worker threads +// to have finished working. +class BlockingCounter { + public: + // Sets/resets the counter; initial_count is the number of + // decrementing events that the Wait() call will be waiting for. + void Reset(std::size_t initial_count) { + std::lock_guard g(mutex_); + TORCH_DCHECK_EQ(count_, 0); + count_ = initial_count; + } + + // Decrements the counter; if the counter hits zero, signals + // the thread that was waiting for that, and returns true. + // Otherwise (if the decremented count is still nonzero), + // returns false. + bool DecrementCount() { + const auto count_value = count_.fetch_sub(1, std::memory_order_relaxed) - 1; + if (count_value == 0) { + std::lock_guard g(mutex_); + cond_.notify_one(); + } + bool retval = count_value == 0; + return retval; + } + + // Waits for the N other threads (N having been set by Reset()) + // to hit the BlockingCounter. + void Wait() { + while (size_t count_value = count_.load(std::memory_order_relaxed)) { + WaitForVariableChange(&count_, count_value, &cond_, &mutex_); + } + } + + private: + std::condition_variable cond_; + std::mutex mutex_; + std::atomic count_{0}; +}; + +// A workload for a worker. +struct Task { + Task() = default; + virtual ~Task() = default; + virtual void Run() = 0; +}; + +// A worker thread. +class alignas(kGEMMLOWPCacheLineSize) Worker { + public: + enum class State : uint8_t { + ThreadStartup, // The initial state before the thread main loop runs. + Ready, // Is not working, has not yet received new work to do. + HasWork, // Has work to do. + ExitAsSoonAsPossible // Should exit at earliest convenience. 
+ }; + + explicit Worker(BlockingCounter* counter_to_decrement_when_ready) + : task_(nullptr), + state_(State::ThreadStartup), + counter_to_decrement_when_ready_(counter_to_decrement_when_ready) { + thread_ = std::make_unique([this]() { + c10::setThreadName("pt_thread_pool"); + this->ThreadFunc(); + }); + } + + ~Worker() { + ChangeState(State::ExitAsSoonAsPossible); + thread_->join(); + } + + // Changes State; may be called from either the worker thread + // or the master thread; however, not all state transitions are legal, + // which is guarded by assertions. + void ChangeState(State new_state) { + std::lock_guard g(state_mutex_); + DCHECK(new_state != state_.load(std::memory_order_relaxed)); + switch (state_.load(std::memory_order_relaxed)) { + case State::ThreadStartup: + DCHECK(new_state == State::Ready); + break; + case State::Ready: + DCHECK(new_state == State::HasWork || new_state == State::ExitAsSoonAsPossible); + break; + case State::HasWork: + DCHECK(new_state == State::Ready || new_state == State::ExitAsSoonAsPossible); + break; + case State::ExitAsSoonAsPossible: + default: + abort(); + } + state_.store(new_state, std::memory_order_relaxed); + state_cond_.notify_one(); + if (new_state == State::Ready) { + counter_to_decrement_when_ready_->DecrementCount(); + } + } + + // Thread entry point. + void ThreadFunc() { + c10::setThreadName("CaffeWorkersPool"); + ChangeState(State::Ready); + + // Thread main loop + while (true) { + // Get a state to act on + // In the 'Ready' state, we have nothing to do but to wait until + // we switch to another state. + State state_to_act_upon = + WaitForVariableChange(&state_, State::Ready, &state_cond_, &state_mutex_); + + // We now have a state to act on, so act. + switch (state_to_act_upon) { + case State::HasWork: + // Got work to do! So do it, and then revert to 'Ready' state. 
+ DCHECK(task_.load()); + (*task_).Run(); + task_ = nullptr; + ChangeState(State::Ready); + break; + case State::ExitAsSoonAsPossible: + return; + case State::Ready: + case State::ThreadStartup: + default: + abort(); + } + } + } + + static void* ThreadFunc(void* arg) { + static_cast(arg)->ThreadFunc(); + return nullptr; + } + + // Called by the master thread to give this worker work to do. + // It is only legal to call this if the worker + void StartWork(Task* task) { + DCHECK(!task_.load()); + task_ = task; + DCHECK(state_.load(std::memory_order_acquire) == State::Ready); + ChangeState(State::HasWork); + } + + private: + // The underlying thread. + std::unique_ptr thread_; + + // The task to be worked on. + std::atomic task_; + + // The condition variable and mutex guarding state changes. + std::condition_variable state_cond_; + std::mutex state_mutex_; + + // The state enum tells if we're currently working, waiting for work, etc. + std::atomic state_; + + // pointer to the master's thread BlockingCounter object, to notify the + // master thread of when this worker switches to the 'Ready' state. + BlockingCounter* const counter_to_decrement_when_ready_; +}; + +class WorkersPool { + public: + WorkersPool() = default; + + void Execute(const std::vector>& tasks) { + CAFFE_ENFORCE_GE(tasks.size(), 1); + // One of the tasks will be run on the current thread. + int workers_count = tasks.size() - 1; + CreateWorkers(workers_count); + TORCH_DCHECK_LE(workers_count, (int)workers_.size()); + counter_to_decrement_when_ready_.Reset(workers_count); + for (const auto task : c10::irange(1, tasks.size())) { + workers_[task - 1]->StartWork(tasks[task].get()); + } + // Execute the remaining workload immediately on the current thread. + auto& task = tasks.front(); + task->Run(); + // Wait for the workers submitted above to finish. + counter_to_decrement_when_ready_.Wait(); + } + + private: + // Ensures that the pool has at least the given count of workers. 
+ // If any new worker has to be created, this function waits for it to + // be ready. + void CreateWorkers(std::size_t workers_count) { + if (workers_.size() >= workers_count) { + return; + } + counter_to_decrement_when_ready_.Reset(workers_count - workers_.size()); + while (workers_.size() < workers_count) { + workers_.push_back(MakeAligned::make(&counter_to_decrement_when_ready_)); + } + counter_to_decrement_when_ready_.Wait(); + } + + C10_DISABLE_COPY_AND_ASSIGN(WorkersPool); + std::vector>> workers_; + // The BlockingCounter used to wait for the workers. + BlockingCounter counter_to_decrement_when_ready_; +}; +} // namespace caffe2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool-cpp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool-cpp.h new file mode 100644 index 0000000000000000000000000000000000000000..cb9a01d3bd2ec1bc12d5290b965f18d9bb0cbfb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool-cpp.h @@ -0,0 +1,60 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_PTHREADPOOL + +#ifdef USE_INTERNAL_PTHREADPOOL_IMPL +#include +#else +#include +#endif + +#include +#include +#include + +namespace caffe2 { + +class PThreadPool final { + public: + explicit PThreadPool(size_t thread_count); + ~PThreadPool() = default; + + PThreadPool(const PThreadPool&) = delete; + PThreadPool& operator=(const PThreadPool&) = delete; + + PThreadPool(PThreadPool&&) = delete; + PThreadPool& operator=(PThreadPool&&) = delete; + + size_t get_thread_count() const; + void set_thread_count(size_t thread_count); + + // Run, in parallel, function 
fn(task_id) over task_id in range [0, range). + // This function is blocking. All input is processed by the time it returns. + void run(const std::function& fn, size_t range); + + private: + friend pthreadpool_t pthreadpool_(); + + private: + mutable std::mutex mutex_; + std::unique_ptr threadpool_; +}; + +// Return a singleton instance of PThreadPool for ATen/TH multithreading. +PThreadPool* pthreadpool(); +PThreadPool* pthreadpool(size_t thread_count); + +// Exposes the underlying implementation of PThreadPool. +// Only for use in external libraries so as to unify threading across +// internal (i.e. ATen, etc.) and external (e.g. NNPACK, QNNPACK, XNNPACK) +// use cases. +pthreadpool_t pthreadpool_(); + +} // namespace caffe2 + +#endif /* USE_PTHREADPOOL */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..ff7ff896b589dff1f51abd155e685fe2ee231750 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool.h @@ -0,0 +1,198 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// pthreadpool header from https://github.com/Maratyszcza/pthreadpool +// for NNPACK +#ifndef CAFFE2_UTILS_PTHREADPOOL_H_ +#define CAFFE2_UTILS_PTHREADPOOL_H_ + +#include "ThreadPoolCommon.h" + +#include // for size_t +#include // for uint32_t + +#if defined(USE_PTHREADPOOL) +// This is a hack. +// Mainly introduced here because +// 1. NNPACK can be compiled to use internal legacy threadpool implementation because much of C2 depends on that. +// 2. 
Then if we want to use NNPACK in PyTorch, which uses new pthreadpool, then we will supply new pthreadpool pointer +// to NNPACK. This will not work if NNPACK is compiled with internal legacy threadpool. Thus this guard +// along with changes in pthreadpool_impl.cc allows us to override that behavior. +// It enables us to use NNPACK from pytorch using `caffe2::pthreadpool_()` +namespace caffe2 { +class WithCastToNewThreadPool { + public: + explicit WithCastToNewThreadPool(bool use_new_threadpool); + ~WithCastToNewThreadPool(); + private: + bool use_new_threadpool_; +}; +} +#endif + +typedef struct pthreadpool* legacy_pthreadpool_t; + +typedef void (*legacy_pthreadpool_function_1d_t)(void*, size_t); +typedef void (*legacy_pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); +typedef void (*legacy_pthreadpool_function_2d_t)(void*, size_t, size_t); +typedef void (*legacy_pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); +typedef void (*legacy_pthreadpool_function_3d_tiled_t)( + void*, + size_t, + size_t, + size_t, + size_t, + size_t, + size_t); +typedef void (*legacy_pthreadpool_function_4d_tiled_t)( + void*, + size_t, + size_t, + size_t, + size_t, + size_t, + size_t, + size_t, + size_t); + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Creates a thread pool with the specified number of threads. + * + * @param[in] threads_count The number of threads in the thread pool. + * A value of 0 has special interpretation: it creates a thread for each + * processor core available in the system. + * + * @returns A pointer to an opaque thread pool object. + * On error the function returns NULL and sets errno accordingly. + */ + +// Returns internal threadpool impl. +legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count); + +/** + * Queries the number of threads in a thread pool. + * + * @param[in] threadpool The thread pool to query. + * + * @returns The number of threads in the thread pool. 
+ */ +size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool); + +/** + * Processes items in parallel using threads from a thread pool. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, the + * calls are serialized. + * + * @param[in] threadpool The thread pool to use for parallelisation. + * @param[in] function The function to call for each item. + * @param[in] argument The first argument passed to the @a function. + * @param[in] items The number of items to process. The @a function + * will be called once for each item. + */ +void legacy_pthreadpool_compute_1d( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_1d_t function, + void* argument, + size_t range); + +void legacy_pthreadpool_parallelize_1d( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_1d_t function, + void* argument, + size_t range, + uint32_t flags); + +void legacy_pthreadpool_compute_1d_tiled( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_1d_tiled_t function, + void* argument, + size_t range, + size_t tile); + +void legacy_pthreadpool_compute_2d( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_2d_t function, + void* argument, + size_t range_i, + size_t range_j); + +void legacy_pthreadpool_compute_2d_tiled( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_2d_tiled_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_i, + size_t tile_j); + +void legacy_pthreadpool_compute_3d_tiled( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_3d_tiled_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t tile_i, + size_t tile_j, + size_t tile_k); + +void legacy_pthreadpool_compute_4d_tiled( + legacy_pthreadpool_t threadpool, + legacy_pthreadpool_function_4d_tiled_t function, + void* 
argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t tile_i, + size_t tile_j, + size_t tile_k, + size_t tile_l); + +/** + * Terminates threads in the thread pool and releases associated resources. + * + * @warning Accessing the thread pool after a call to this function constitutes + * undefined behaviour and may cause data corruption. + * + * @param[in,out] threadpool The thread pool to destroy. + */ +void legacy_pthreadpool_destroy(legacy_pthreadpool_t threadpool); + +#ifdef USE_INTERNAL_PTHREADPOOL_IMPL + +#define pthreadpool_t legacy_pthreadpool_t +#define pthreadpool_function_1d_t legacy_pthreadpool_function_1d_t +#define pthreadpool_function_1d_tiled_t legacy_pthreadpool_function_1d_tiled_t +#define pthreadpool_function_2d_t legacy_pthreadpool_function_2d_t +#define pthreadpool_function_2d_tiled_t legacy_pthreadpool_function_2d_tiled_t +#define pthreadpool_function_3d_tiled_t legacy_pthreadpool_function_3d_tiled_t +#define pthreadpool_function_4d_tiled_t legacy_pthreadpool_function_4d_tiled_t +#define pthreadpool_create legacy_pthreadpool_create +#define pthreadpool_destroy legacy_pthreadpool_destroy +#define pthreadpool_get_threads_count legacy_pthreadpool_get_threads_count +#define pthreadpool_compute_1d legacy_pthreadpool_compute_1d +#define pthreadpool_parallelize_1d legacy_pthreadpool_parallelize_1d +#define pthreadpool_compute_1d_tiled legacy_pthreadpool_compute_1d_tiled +#define pthreadpool_compute_2d legacy_pthreadpool_compute_2d +#define pthreadpool_compute_2d_tiled legacy_pthreadpool_compute_2d_tiled +#define pthreadpool_compute_3d_tiled legacy_pthreadpool_compute_3d_tiled +#define pthreadpool_compute_4d_tiled legacy_pthreadpool_compute_4d_tiled + +#endif /* USE_INTERNAL_PTHREADPOOL_IMPL */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // CAFFE2_UTILS_PTHREADPOOL_H_ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/thread_pool_guard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/thread_pool_guard.h new file mode 100644 index 0000000000000000000000000000000000000000..cb76646e6f61bdc1540bacd7dcbf88b4aa09b5f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/thread_pool_guard.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace caffe2 { + +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct TORCH_API _NoPThreadPoolGuard { + static bool is_enabled(); + static void set_enabled(bool enabled); + + _NoPThreadPoolGuard(): prev_mode_(_NoPThreadPoolGuard::is_enabled()) { + _NoPThreadPoolGuard::set_enabled(true); + } + ~_NoPThreadPoolGuard() { + _NoPThreadPoolGuard::set_enabled(prev_mode_); + } + private: + bool prev_mode_; +}; + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..7d54052e15455f00eb41246585ecd9e0470508e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.pb.h @@ -0,0 +1,414 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! 
+// source: google/protobuf/any.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fany_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fany_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fany_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. 
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fany_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fany_2eproto; +PROTOBUF_NAMESPACE_OPEN +class Any; +class AnyDefaultTypeInternal; +PROTOBUF_EXPORT extern AnyDefaultTypeInternal _Any_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Any* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT Any PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Any) */ { + public: + inline Any() : Any(nullptr) {} + virtual ~Any(); + + Any(const Any& from); + Any(Any&& from) noexcept + : Any() { + *this = ::std::move(from); + } + + inline Any& operator=(const Any& from) { + CopyFrom(from); + return *this; + } + inline Any& operator=(Any&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + 
} + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Any& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Any* internal_default_instance() { + return reinterpret_cast( + &_Any_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + // implements Any ----------------------------------------------- + + void PackFrom(const ::PROTOBUF_NAMESPACE_ID::Message& message) { + _any_metadata_.PackFrom(message); + } + void PackFrom(const ::PROTOBUF_NAMESPACE_ID::Message& message, + const std::string& type_url_prefix) { + _any_metadata_.PackFrom(message, type_url_prefix); + } + bool UnpackTo(::PROTOBUF_NAMESPACE_ID::Message* message) const { + return _any_metadata_.UnpackTo(message); + } + static bool GetAnyFieldDescriptors( + const ::PROTOBUF_NAMESPACE_ID::Message& message, + const ::PROTOBUF_NAMESPACE_ID::FieldDescriptor** type_url_field, + const ::PROTOBUF_NAMESPACE_ID::FieldDescriptor** value_field); + template ::value>::type> + void PackFrom(const T& message) { + _any_metadata_.PackFrom(message); + } + template ::value>::type> + void PackFrom(const T& message, + const std::string& type_url_prefix) { + _any_metadata_.PackFrom(message, type_url_prefix);} + template ::value>::type> + bool UnpackTo(T* message) const { + return _any_metadata_.UnpackTo(message); + } + template bool Is() const { + return _any_metadata_.Is(); + } + static bool ParseAnyTypeUrl(const string& type_url, + std::string* full_type_name); + friend void swap(Any& a, Any& b) { + a.Swap(&b); + } + inline void Swap(Any* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Any* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // 
implements Message ---------------------------------------------- + + inline Any* New() const final { + return CreateMaybeMessage(nullptr); + } + + Any* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Any& from); + void MergeFrom(const Any& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Any* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Any"; + } + protected: + explicit Any(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fany_2eproto); + return ::descriptor_table_google_2fprotobuf_2fany_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kTypeUrlFieldNumber = 1, + 
kValueFieldNumber = 2, + }; + // string type_url = 1; + void clear_type_url(); + const std::string& type_url() const; + void set_type_url(const std::string& value); + void set_type_url(std::string&& value); + void set_type_url(const char* value); + void set_type_url(const char* value, size_t size); + std::string* mutable_type_url(); + std::string* release_type_url(); + void set_allocated_type_url(std::string* type_url); + private: + const std::string& _internal_type_url() const; + void _internal_set_type_url(const std::string& value); + std::string* _internal_mutable_type_url(); + public: + + // bytes value = 2; + void clear_value(); + const std::string& value() const; + void set_value(const std::string& value); + void set_value(std::string&& value); + void set_value(const char* value); + void set_value(const void* value, size_t size); + std::string* mutable_value(); + std::string* release_value(); + void set_allocated_value(std::string* value); + private: + const std::string& _internal_value() const; + void _internal_set_value(const std::string& value); + std::string* _internal_mutable_value(); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.Any) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr type_url_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr value_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata _any_metadata_; + friend struct ::TableStruct_google_2fprotobuf_2fany_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// Any + +// string type_url 
= 1; +inline void Any::clear_type_url() { + type_url_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Any::type_url() const { + // @@protoc_insertion_point(field_get:google.protobuf.Any.type_url) + return _internal_type_url(); +} +inline void Any::set_type_url(const std::string& value) { + _internal_set_type_url(value); + // @@protoc_insertion_point(field_set:google.protobuf.Any.type_url) +} +inline std::string* Any::mutable_type_url() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Any.type_url) + return _internal_mutable_type_url(); +} +inline const std::string& Any::_internal_type_url() const { + return type_url_.Get(); +} +inline void Any::_internal_set_type_url(const std::string& value) { + + type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Any::set_type_url(std::string&& value) { + + type_url_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Any.type_url) +} +inline void Any::set_type_url(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Any.type_url) +} +inline void Any::set_type_url(const char* value, + size_t size) { + + type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Any.type_url) +} +inline std::string* Any::_internal_mutable_type_url() { + + return type_url_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Any::release_type_url() { + // 
@@protoc_insertion_point(field_release:google.protobuf.Any.type_url) + return type_url_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Any::set_allocated_type_url(std::string* type_url) { + if (type_url != nullptr) { + + } else { + + } + type_url_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), type_url, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Any.type_url) +} + +// bytes value = 2; +inline void Any::clear_value() { + value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Any::value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Any.value) + return _internal_value(); +} +inline void Any::set_value(const std::string& value) { + _internal_set_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.Any.value) +} +inline std::string* Any::mutable_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Any.value) + return _internal_mutable_value(); +} +inline const std::string& Any::_internal_value() const { + return value_.Get(); +} +inline void Any::_internal_set_value(const std::string& value) { + + value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Any::set_value(std::string&& value) { + + value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Any.value) +} +inline void Any::set_value(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Any.value) +} +inline void Any::set_value(const void* value, + size_t size) { + + 
value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Any.value) +} +inline std::string* Any::_internal_mutable_value() { + + return value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Any::release_value() { + // @@protoc_insertion_point(field_release:google.protobuf.Any.value) + return value_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Any::set_allocated_value(std::string* value) { + if (value != nullptr) { + + } else { + + } + value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Any.value) +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fany_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/api.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/api.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..5b5c902661b1330f34e1ad49c3e7d291d895bda5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/api.pb.h @@ -0,0 +1,1505 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! 
+// source: google/protobuf/api.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fapi_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fapi_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +#include +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fapi_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. 
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fapi_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[3] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fapi_2eproto; +PROTOBUF_NAMESPACE_OPEN +class Api; +class ApiDefaultTypeInternal; +PROTOBUF_EXPORT extern ApiDefaultTypeInternal _Api_default_instance_; +class Method; +class MethodDefaultTypeInternal; +PROTOBUF_EXPORT extern MethodDefaultTypeInternal _Method_default_instance_; +class Mixin; +class MixinDefaultTypeInternal; +PROTOBUF_EXPORT extern MixinDefaultTypeInternal _Mixin_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Api* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Method* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Mixin* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT Api PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Api) */ { + public: + inline Api() : Api(nullptr) {} + virtual ~Api(); + + Api(const Api& from); + Api(Api&& from) noexcept + : Api() { + *this = ::std::move(from); + } + + inline Api& operator=(const Api& 
from) { + CopyFrom(from); + return *this; + } + inline Api& operator=(Api&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Api& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Api* internal_default_instance() { + return reinterpret_cast( + &_Api_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(Api& a, Api& b) { + a.Swap(&b); + } + inline void Swap(Api* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Api* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Api* New() const final { + return CreateMaybeMessage(nullptr); + } + + Api* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Api& from); + void MergeFrom(const Api& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + 
::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Api* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Api"; + } + protected: + explicit Api(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fapi_2eproto); + return ::descriptor_table_google_2fprotobuf_2fapi_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kMethodsFieldNumber = 2, + kOptionsFieldNumber = 3, + kMixinsFieldNumber = 6, + kNameFieldNumber = 1, + kVersionFieldNumber = 4, + kSourceContextFieldNumber = 5, + kSyntaxFieldNumber = 7, + }; + // repeated .google.protobuf.Method methods = 2; + int methods_size() const; + private: + int _internal_methods_size() const; + public: + void clear_methods(); + PROTOBUF_NAMESPACE_ID::Method* mutable_methods(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >* + mutable_methods(); + private: + const PROTOBUF_NAMESPACE_ID::Method& _internal_methods(int index) const; + PROTOBUF_NAMESPACE_ID::Method* _internal_add_methods(); + public: + const PROTOBUF_NAMESPACE_ID::Method& methods(int index) const; + PROTOBUF_NAMESPACE_ID::Method* 
add_methods(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >& + methods() const; + + // repeated .google.protobuf.Option options = 3; + int options_size() const; + private: + int _internal_options_size() const; + public: + void clear_options(); + PROTOBUF_NAMESPACE_ID::Option* mutable_options(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >* + mutable_options(); + private: + const PROTOBUF_NAMESPACE_ID::Option& _internal_options(int index) const; + PROTOBUF_NAMESPACE_ID::Option* _internal_add_options(); + public: + const PROTOBUF_NAMESPACE_ID::Option& options(int index) const; + PROTOBUF_NAMESPACE_ID::Option* add_options(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >& + options() const; + + // repeated .google.protobuf.Mixin mixins = 6; + int mixins_size() const; + private: + int _internal_mixins_size() const; + public: + void clear_mixins(); + PROTOBUF_NAMESPACE_ID::Mixin* mutable_mixins(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >* + mutable_mixins(); + private: + const PROTOBUF_NAMESPACE_ID::Mixin& _internal_mixins(int index) const; + PROTOBUF_NAMESPACE_ID::Mixin* _internal_add_mixins(); + public: + const PROTOBUF_NAMESPACE_ID::Mixin& mixins(int index) const; + PROTOBUF_NAMESPACE_ID::Mixin* add_mixins(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >& + mixins() const; + + // string name = 1; + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + 
public: + + // string version = 4; + void clear_version(); + const std::string& version() const; + void set_version(const std::string& value); + void set_version(std::string&& value); + void set_version(const char* value); + void set_version(const char* value, size_t size); + std::string* mutable_version(); + std::string* release_version(); + void set_allocated_version(std::string* version); + private: + const std::string& _internal_version() const; + void _internal_set_version(const std::string& value); + std::string* _internal_mutable_version(); + public: + + // .google.protobuf.SourceContext source_context = 5; + bool has_source_context() const; + private: + bool _internal_has_source_context() const; + public: + void clear_source_context(); + const PROTOBUF_NAMESPACE_ID::SourceContext& source_context() const; + PROTOBUF_NAMESPACE_ID::SourceContext* release_source_context(); + PROTOBUF_NAMESPACE_ID::SourceContext* mutable_source_context(); + void set_allocated_source_context(PROTOBUF_NAMESPACE_ID::SourceContext* source_context); + private: + const PROTOBUF_NAMESPACE_ID::SourceContext& _internal_source_context() const; + PROTOBUF_NAMESPACE_ID::SourceContext* _internal_mutable_source_context(); + public: + void unsafe_arena_set_allocated_source_context( + PROTOBUF_NAMESPACE_ID::SourceContext* source_context); + PROTOBUF_NAMESPACE_ID::SourceContext* unsafe_arena_release_source_context(); + + // .google.protobuf.Syntax syntax = 7; + void clear_syntax(); + PROTOBUF_NAMESPACE_ID::Syntax syntax() const; + void set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value); + private: + PROTOBUF_NAMESPACE_ID::Syntax _internal_syntax() const; + void _internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.Api) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + 
::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method > methods_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option > options_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin > mixins_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr version_; + PROTOBUF_NAMESPACE_ID::SourceContext* source_context_; + int syntax_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fapi_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT Method PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Method) */ { + public: + inline Method() : Method(nullptr) {} + virtual ~Method(); + + Method(const Method& from); + Method(Method&& from) noexcept + : Method() { + *this = ::std::move(from); + } + + inline Method& operator=(const Method& from) { + CopyFrom(from); + return *this; + } + inline Method& operator=(Method&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Method& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Method* internal_default_instance() { + return reinterpret_cast( + &_Method_default_instance_); + } + static constexpr int kIndexInFileMessages = + 1; + + friend void swap(Method& a, Method& b) { + a.Swap(&b); + } + inline void Swap(Method* other) { + if 
(other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Method* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Method* New() const final { + return CreateMaybeMessage(nullptr); + } + + Method* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Method& from); + void MergeFrom(const Method& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Method* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Method"; + } + protected: + explicit Method(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + 
::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fapi_2eproto); + return ::descriptor_table_google_2fprotobuf_2fapi_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kOptionsFieldNumber = 6, + kNameFieldNumber = 1, + kRequestTypeUrlFieldNumber = 2, + kResponseTypeUrlFieldNumber = 4, + kRequestStreamingFieldNumber = 3, + kResponseStreamingFieldNumber = 5, + kSyntaxFieldNumber = 7, + }; + // repeated .google.protobuf.Option options = 6; + int options_size() const; + private: + int _internal_options_size() const; + public: + void clear_options(); + PROTOBUF_NAMESPACE_ID::Option* mutable_options(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >* + mutable_options(); + private: + const PROTOBUF_NAMESPACE_ID::Option& _internal_options(int index) const; + PROTOBUF_NAMESPACE_ID::Option* _internal_add_options(); + public: + const PROTOBUF_NAMESPACE_ID::Option& options(int index) const; + PROTOBUF_NAMESPACE_ID::Option* add_options(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >& + options() const; + + // string name = 1; + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // string request_type_url = 2; + void clear_request_type_url(); + const std::string& request_type_url() const; + void set_request_type_url(const std::string& value); + void 
set_request_type_url(std::string&& value); + void set_request_type_url(const char* value); + void set_request_type_url(const char* value, size_t size); + std::string* mutable_request_type_url(); + std::string* release_request_type_url(); + void set_allocated_request_type_url(std::string* request_type_url); + private: + const std::string& _internal_request_type_url() const; + void _internal_set_request_type_url(const std::string& value); + std::string* _internal_mutable_request_type_url(); + public: + + // string response_type_url = 4; + void clear_response_type_url(); + const std::string& response_type_url() const; + void set_response_type_url(const std::string& value); + void set_response_type_url(std::string&& value); + void set_response_type_url(const char* value); + void set_response_type_url(const char* value, size_t size); + std::string* mutable_response_type_url(); + std::string* release_response_type_url(); + void set_allocated_response_type_url(std::string* response_type_url); + private: + const std::string& _internal_response_type_url() const; + void _internal_set_response_type_url(const std::string& value); + std::string* _internal_mutable_response_type_url(); + public: + + // bool request_streaming = 3; + void clear_request_streaming(); + bool request_streaming() const; + void set_request_streaming(bool value); + private: + bool _internal_request_streaming() const; + void _internal_set_request_streaming(bool value); + public: + + // bool response_streaming = 5; + void clear_response_streaming(); + bool response_streaming() const; + void set_response_streaming(bool value); + private: + bool _internal_response_streaming() const; + void _internal_set_response_streaming(bool value); + public: + + // .google.protobuf.Syntax syntax = 7; + void clear_syntax(); + PROTOBUF_NAMESPACE_ID::Syntax syntax() const; + void set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value); + private: + PROTOBUF_NAMESPACE_ID::Syntax _internal_syntax() const; + void 
_internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.Method) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option > options_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr request_type_url_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr response_type_url_; + bool request_streaming_; + bool response_streaming_; + int syntax_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fapi_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT Mixin PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Mixin) */ { + public: + inline Mixin() : Mixin(nullptr) {} + virtual ~Mixin(); + + Mixin(const Mixin& from); + Mixin(Mixin&& from) noexcept + : Mixin() { + *this = ::std::move(from); + } + + inline Mixin& operator=(const Mixin& from) { + CopyFrom(from); + return *this; + } + inline Mixin& operator=(Mixin&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Mixin& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Mixin* internal_default_instance() { + 
return reinterpret_cast( + &_Mixin_default_instance_); + } + static constexpr int kIndexInFileMessages = + 2; + + friend void swap(Mixin& a, Mixin& b) { + a.Swap(&b); + } + inline void Swap(Mixin* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Mixin* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Mixin* New() const final { + return CreateMaybeMessage(nullptr); + } + + Mixin* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Mixin& from); + void MergeFrom(const Mixin& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Mixin* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Mixin"; + } + protected: + explicit Mixin(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + 
::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fapi_2eproto); + return ::descriptor_table_google_2fprotobuf_2fapi_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kNameFieldNumber = 1, + kRootFieldNumber = 2, + }; + // string name = 1; + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // string root = 2; + void clear_root(); + const std::string& root() const; + void set_root(const std::string& value); + void set_root(std::string&& value); + void set_root(const char* value); + void set_root(const char* value, size_t size); + std::string* mutable_root(); + std::string* release_root(); + void set_allocated_root(std::string* root); + private: + const std::string& _internal_root() const; + void _internal_set_root(const std::string& value); + std::string* _internal_mutable_root(); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.Mixin) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr root_; + mutable 
::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fapi_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// Api + +// string name = 1; +inline void Api::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Api::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.name) + return _internal_name(); +} +inline void Api::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.Api.name) +} +inline std::string* Api::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Api.name) + return _internal_mutable_name(); +} +inline const std::string& Api::_internal_name() const { + return name_.Get(); +} +inline void Api::_internal_set_name(const std::string& value) { + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Api::set_name(std::string&& value) { + + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Api.name) +} +inline void Api::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Api.name) +} +inline void Api::set_name(const char* value, + size_t size) { + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // 
@@protoc_insertion_point(field_set_pointer:google.protobuf.Api.name) +} +inline std::string* Api::_internal_mutable_name() { + + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Api::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.Api.name) + return name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Api::set_allocated_name(std::string* name) { + if (name != nullptr) { + + } else { + + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Api.name) +} + +// repeated .google.protobuf.Method methods = 2; +inline int Api::_internal_methods_size() const { + return methods_.size(); +} +inline int Api::methods_size() const { + return _internal_methods_size(); +} +inline void Api::clear_methods() { + methods_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::Method* Api::mutable_methods(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.Api.methods) + return methods_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >* +Api::mutable_methods() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.Api.methods) + return &methods_; +} +inline const PROTOBUF_NAMESPACE_ID::Method& Api::_internal_methods(int index) const { + return methods_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::Method& Api::methods(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.methods) + return _internal_methods(index); +} +inline PROTOBUF_NAMESPACE_ID::Method* Api::_internal_add_methods() { + return methods_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::Method* Api::add_methods() { + // @@protoc_insertion_point(field_add:google.protobuf.Api.methods) + return _internal_add_methods(); +} +inline const 
::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >& +Api::methods() const { + // @@protoc_insertion_point(field_list:google.protobuf.Api.methods) + return methods_; +} + +// repeated .google.protobuf.Option options = 3; +inline int Api::_internal_options_size() const { + return options_.size(); +} +inline int Api::options_size() const { + return _internal_options_size(); +} +inline PROTOBUF_NAMESPACE_ID::Option* Api::mutable_options(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.Api.options) + return options_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >* +Api::mutable_options() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.Api.options) + return &options_; +} +inline const PROTOBUF_NAMESPACE_ID::Option& Api::_internal_options(int index) const { + return options_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::Option& Api::options(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.options) + return _internal_options(index); +} +inline PROTOBUF_NAMESPACE_ID::Option* Api::_internal_add_options() { + return options_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::Option* Api::add_options() { + // @@protoc_insertion_point(field_add:google.protobuf.Api.options) + return _internal_add_options(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >& +Api::options() const { + // @@protoc_insertion_point(field_list:google.protobuf.Api.options) + return options_; +} + +// string version = 4; +inline void Api::clear_version() { + version_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Api::version() const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.version) + return _internal_version(); +} +inline void Api::set_version(const std::string& value) { + _internal_set_version(value); + // 
@@protoc_insertion_point(field_set:google.protobuf.Api.version) +} +inline std::string* Api::mutable_version() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Api.version) + return _internal_mutable_version(); +} +inline const std::string& Api::_internal_version() const { + return version_.Get(); +} +inline void Api::_internal_set_version(const std::string& value) { + + version_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Api::set_version(std::string&& value) { + + version_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Api.version) +} +inline void Api::set_version(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + version_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Api.version) +} +inline void Api::set_version(const char* value, + size_t size) { + + version_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Api.version) +} +inline std::string* Api::_internal_mutable_version() { + + return version_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Api::release_version() { + // @@protoc_insertion_point(field_release:google.protobuf.Api.version) + return version_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Api::set_allocated_version(std::string* version) { + if (version != nullptr) { + + } else { + + } + version_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), version, + GetArena()); + // 
@@protoc_insertion_point(field_set_allocated:google.protobuf.Api.version) +} + +// .google.protobuf.SourceContext source_context = 5; +inline bool Api::_internal_has_source_context() const { + return this != internal_default_instance() && source_context_ != nullptr; +} +inline bool Api::has_source_context() const { + return _internal_has_source_context(); +} +inline const PROTOBUF_NAMESPACE_ID::SourceContext& Api::_internal_source_context() const { + const PROTOBUF_NAMESPACE_ID::SourceContext* p = source_context_; + return p != nullptr ? *p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_SourceContext_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::SourceContext& Api::source_context() const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.source_context) + return _internal_source_context(); +} +inline void Api::unsafe_arena_set_allocated_source_context( + PROTOBUF_NAMESPACE_ID::SourceContext* source_context) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_context_); + } + source_context_ = source_context; + if (source_context) { + + } else { + + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.Api.source_context) +} +inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::release_source_context() { + + PROTOBUF_NAMESPACE_ID::SourceContext* temp = source_context_; + source_context_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::unsafe_arena_release_source_context() { + // @@protoc_insertion_point(field_release:google.protobuf.Api.source_context) + + PROTOBUF_NAMESPACE_ID::SourceContext* temp = source_context_; + source_context_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::_internal_mutable_source_context() { + + if (source_context_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + 
source_context_ = p; + } + return source_context_; +} +inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::mutable_source_context() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Api.source_context) + return _internal_mutable_source_context(); +} +inline void Api::set_allocated_source_context(PROTOBUF_NAMESPACE_ID::SourceContext* source_context) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete reinterpret_cast< ::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_context_); + } + if (source_context) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_context)->GetArena(); + if (message_arena != submessage_arena) { + source_context = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, source_context, submessage_arena); + } + + } else { + + } + source_context_ = source_context; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Api.source_context) +} + +// repeated .google.protobuf.Mixin mixins = 6; +inline int Api::_internal_mixins_size() const { + return mixins_.size(); +} +inline int Api::mixins_size() const { + return _internal_mixins_size(); +} +inline void Api::clear_mixins() { + mixins_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::Mixin* Api::mutable_mixins(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.Api.mixins) + return mixins_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >* +Api::mutable_mixins() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.Api.mixins) + return &mixins_; +} +inline const PROTOBUF_NAMESPACE_ID::Mixin& Api::_internal_mixins(int index) const { + return mixins_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::Mixin& Api::mixins(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.mixins) + return _internal_mixins(index); +} +inline 
PROTOBUF_NAMESPACE_ID::Mixin* Api::_internal_add_mixins() { + return mixins_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::Mixin* Api::add_mixins() { + // @@protoc_insertion_point(field_add:google.protobuf.Api.mixins) + return _internal_add_mixins(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >& +Api::mixins() const { + // @@protoc_insertion_point(field_list:google.protobuf.Api.mixins) + return mixins_; +} + +// .google.protobuf.Syntax syntax = 7; +inline void Api::clear_syntax() { + syntax_ = 0; +} +inline PROTOBUF_NAMESPACE_ID::Syntax Api::_internal_syntax() const { + return static_cast< PROTOBUF_NAMESPACE_ID::Syntax >(syntax_); +} +inline PROTOBUF_NAMESPACE_ID::Syntax Api::syntax() const { + // @@protoc_insertion_point(field_get:google.protobuf.Api.syntax) + return _internal_syntax(); +} +inline void Api::_internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) { + + syntax_ = value; +} +inline void Api::set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) { + _internal_set_syntax(value); + // @@protoc_insertion_point(field_set:google.protobuf.Api.syntax) +} + +// ------------------------------------------------------------------- + +// Method + +// string name = 1; +inline void Method::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Method::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.name) + return _internal_name(); +} +inline void Method::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.Method.name) +} +inline std::string* Method::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Method.name) + return _internal_mutable_name(); +} +inline const std::string& Method::_internal_name() const { + return name_.Get(); +} +inline void Method::_internal_set_name(const std::string& value) { + + 
name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Method::set_name(std::string&& value) { + + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Method.name) +} +inline void Method::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Method.name) +} +inline void Method::set_name(const char* value, + size_t size) { + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Method.name) +} +inline std::string* Method::_internal_mutable_name() { + + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Method::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.Method.name) + return name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Method::set_allocated_name(std::string* name) { + if (name != nullptr) { + + } else { + + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Method.name) +} + +// string request_type_url = 2; +inline void Method::clear_request_type_url() { + request_type_url_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Method::request_type_url() const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.request_type_url) + return _internal_request_type_url(); +} +inline 
void Method::set_request_type_url(const std::string& value) { + _internal_set_request_type_url(value); + // @@protoc_insertion_point(field_set:google.protobuf.Method.request_type_url) +} +inline std::string* Method::mutable_request_type_url() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Method.request_type_url) + return _internal_mutable_request_type_url(); +} +inline const std::string& Method::_internal_request_type_url() const { + return request_type_url_.Get(); +} +inline void Method::_internal_set_request_type_url(const std::string& value) { + + request_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Method::set_request_type_url(std::string&& value) { + + request_type_url_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Method.request_type_url) +} +inline void Method::set_request_type_url(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + request_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Method.request_type_url) +} +inline void Method::set_request_type_url(const char* value, + size_t size) { + + request_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Method.request_type_url) +} +inline std::string* Method::_internal_mutable_request_type_url() { + + return request_type_url_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Method::release_request_type_url() { + // @@protoc_insertion_point(field_release:google.protobuf.Method.request_type_url) + return 
request_type_url_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Method::set_allocated_request_type_url(std::string* request_type_url) { + if (request_type_url != nullptr) { + + } else { + + } + request_type_url_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), request_type_url, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Method.request_type_url) +} + +// bool request_streaming = 3; +inline void Method::clear_request_streaming() { + request_streaming_ = false; +} +inline bool Method::_internal_request_streaming() const { + return request_streaming_; +} +inline bool Method::request_streaming() const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.request_streaming) + return _internal_request_streaming(); +} +inline void Method::_internal_set_request_streaming(bool value) { + + request_streaming_ = value; +} +inline void Method::set_request_streaming(bool value) { + _internal_set_request_streaming(value); + // @@protoc_insertion_point(field_set:google.protobuf.Method.request_streaming) +} + +// string response_type_url = 4; +inline void Method::clear_response_type_url() { + response_type_url_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Method::response_type_url() const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.response_type_url) + return _internal_response_type_url(); +} +inline void Method::set_response_type_url(const std::string& value) { + _internal_set_response_type_url(value); + // @@protoc_insertion_point(field_set:google.protobuf.Method.response_type_url) +} +inline std::string* Method::mutable_response_type_url() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Method.response_type_url) + return _internal_mutable_response_type_url(); +} +inline const std::string& Method::_internal_response_type_url() const 
{ + return response_type_url_.Get(); +} +inline void Method::_internal_set_response_type_url(const std::string& value) { + + response_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Method::set_response_type_url(std::string&& value) { + + response_type_url_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Method.response_type_url) +} +inline void Method::set_response_type_url(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + response_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Method.response_type_url) +} +inline void Method::set_response_type_url(const char* value, + size_t size) { + + response_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Method.response_type_url) +} +inline std::string* Method::_internal_mutable_response_type_url() { + + return response_type_url_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Method::release_response_type_url() { + // @@protoc_insertion_point(field_release:google.protobuf.Method.response_type_url) + return response_type_url_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Method::set_allocated_response_type_url(std::string* response_type_url) { + if (response_type_url != nullptr) { + + } else { + + } + response_type_url_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), response_type_url, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Method.response_type_url) 
+} + +// bool response_streaming = 5; +inline void Method::clear_response_streaming() { + response_streaming_ = false; +} +inline bool Method::_internal_response_streaming() const { + return response_streaming_; +} +inline bool Method::response_streaming() const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.response_streaming) + return _internal_response_streaming(); +} +inline void Method::_internal_set_response_streaming(bool value) { + + response_streaming_ = value; +} +inline void Method::set_response_streaming(bool value) { + _internal_set_response_streaming(value); + // @@protoc_insertion_point(field_set:google.protobuf.Method.response_streaming) +} + +// repeated .google.protobuf.Option options = 6; +inline int Method::_internal_options_size() const { + return options_.size(); +} +inline int Method::options_size() const { + return _internal_options_size(); +} +inline PROTOBUF_NAMESPACE_ID::Option* Method::mutable_options(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.Method.options) + return options_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >* +Method::mutable_options() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.Method.options) + return &options_; +} +inline const PROTOBUF_NAMESPACE_ID::Option& Method::_internal_options(int index) const { + return options_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::Option& Method::options(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.options) + return _internal_options(index); +} +inline PROTOBUF_NAMESPACE_ID::Option* Method::_internal_add_options() { + return options_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::Option* Method::add_options() { + // @@protoc_insertion_point(field_add:google.protobuf.Method.options) + return _internal_add_options(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >& +Method::options() 
const { + // @@protoc_insertion_point(field_list:google.protobuf.Method.options) + return options_; +} + +// .google.protobuf.Syntax syntax = 7; +inline void Method::clear_syntax() { + syntax_ = 0; +} +inline PROTOBUF_NAMESPACE_ID::Syntax Method::_internal_syntax() const { + return static_cast< PROTOBUF_NAMESPACE_ID::Syntax >(syntax_); +} +inline PROTOBUF_NAMESPACE_ID::Syntax Method::syntax() const { + // @@protoc_insertion_point(field_get:google.protobuf.Method.syntax) + return _internal_syntax(); +} +inline void Method::_internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) { + + syntax_ = value; +} +inline void Method::set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) { + _internal_set_syntax(value); + // @@protoc_insertion_point(field_set:google.protobuf.Method.syntax) +} + +// ------------------------------------------------------------------- + +// Mixin + +// string name = 1; +inline void Mixin::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Mixin::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.Mixin.name) + return _internal_name(); +} +inline void Mixin::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.Mixin.name) +} +inline std::string* Mixin::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Mixin.name) + return _internal_mutable_name(); +} +inline const std::string& Mixin::_internal_name() const { + return name_.Get(); +} +inline void Mixin::_internal_set_name(const std::string& value) { + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Mixin::set_name(std::string&& value) { + + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Mixin.name) +} 
+inline void Mixin::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Mixin.name) +} +inline void Mixin::set_name(const char* value, + size_t size) { + + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Mixin.name) +} +inline std::string* Mixin::_internal_mutable_name() { + + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Mixin::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.Mixin.name) + return name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Mixin::set_allocated_name(std::string* name) { + if (name != nullptr) { + + } else { + + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Mixin.name) +} + +// string root = 2; +inline void Mixin::clear_root() { + root_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& Mixin::root() const { + // @@protoc_insertion_point(field_get:google.protobuf.Mixin.root) + return _internal_root(); +} +inline void Mixin::set_root(const std::string& value) { + _internal_set_root(value); + // @@protoc_insertion_point(field_set:google.protobuf.Mixin.root) +} +inline std::string* Mixin::mutable_root() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Mixin.root) + return _internal_mutable_root(); +} +inline const std::string& Mixin::_internal_root() const { + return root_.Get(); +} +inline void Mixin::_internal_set_root(const 
std::string& value) { + + root_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Mixin::set_root(std::string&& value) { + + root_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Mixin.root) +} +inline void Mixin::set_root(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + root_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Mixin.root) +} +inline void Mixin::set_root(const char* value, + size_t size) { + + root_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Mixin.root) +} +inline std::string* Mixin::_internal_mutable_root() { + + return root_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Mixin::release_root() { + // @@protoc_insertion_point(field_release:google.protobuf.Mixin.root) + return root_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void Mixin::set_allocated_root(std::string* root) { + if (root != nullptr) { + + } else { + + } + root_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), root, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Mixin.root) +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // 
GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fapi_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/arena.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/arena.h new file mode 100644 index 0000000000000000000000000000000000000000..33adc15cad401fbeb880476d3965a301232a5777 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/arena.h @@ -0,0 +1,741 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file defines an Arena allocator for better allocation performance. + +#ifndef GOOGLE_PROTOBUF_ARENA_H__ +#define GOOGLE_PROTOBUF_ARENA_H__ + + +#include +#include +#include +#ifdef max +#undef max // Visual Studio defines this macro +#endif +#if defined(_MSC_VER) && !defined(_LIBCPP_STD_VER) && !_HAS_EXCEPTIONS +// Work around bugs in MSVC header when _HAS_EXCEPTIONS=0. +#include +#include +namespace std { +using type_info = ::type_info; +} +#else +#include +#endif + +#include +#include +#include + +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { + +struct ArenaOptions; // defined below + +} // namespace protobuf +} // namespace google + +namespace google { +namespace protobuf { + +class Arena; // defined below +class Message; // defined in message.h +class MessageLite; +template +class Map; + +namespace arena_metrics { + +void EnableArenaMetrics(ArenaOptions* options); + +} // namespace arena_metrics + +namespace internal { + +struct ArenaStringPtr; // defined in arenastring.h +class LazyField; // defined in lazy_field.h +class EpsCopyInputStream; // defined in parse_context.h + +template +class GenericTypeHandler; // defined in repeated_field.h + +// Templated cleanup methods. 
+template +void arena_destruct_object(void* object) { + reinterpret_cast(object)->~T(); +} +template +void arena_delete_object(void* object) { + delete reinterpret_cast(object); +} +inline void arena_free(void* object, size_t size) { +#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation) + ::operator delete(object, size); +#else + (void)size; + ::operator delete(object); +#endif +} + +} // namespace internal + +// ArenaOptions provides optional additional parameters to arena construction +// that control its block-allocation behavior. +struct ArenaOptions { + // This defines the size of the first block requested from the system malloc. + // Subsequent block sizes will increase in a geometric series up to a maximum. + size_t start_block_size; + + // This defines the maximum block size requested from system malloc (unless an + // individual arena allocation request occurs with a size larger than this + // maximum). Requested block sizes increase up to this value, then remain + // here. + size_t max_block_size; + + // An initial block of memory for the arena to use, or NULL for none. If + // provided, the block must live at least as long as the arena itself. The + // creator of the Arena retains ownership of the block after the Arena is + // destroyed. + char* initial_block; + + // The size of the initial block, if provided. + size_t initial_block_size; + + // A function pointer to an alloc method that returns memory blocks of size + // requested. By default, it contains a ptr to the malloc function. + // + // NOTE: block_alloc and dealloc functions are expected to behave like + // malloc and free, including Asan poisoning. + void* (*block_alloc)(size_t); + // A function pointer to a dealloc method that takes ownership of the blocks + // from the arena. By default, it contains a ptr to a wrapper function that + // calls free. 
+ void (*block_dealloc)(void*, size_t); + + ArenaOptions() + : start_block_size(kDefaultStartBlockSize), + max_block_size(kDefaultMaxBlockSize), + initial_block(NULL), + initial_block_size(0), + block_alloc(&::operator new), + block_dealloc(&internal::arena_free), + on_arena_init(NULL), + on_arena_reset(NULL), + on_arena_destruction(NULL), + on_arena_allocation(NULL) {} + + private: + // Hooks for adding external functionality such as user-specific metrics + // collection, specific debugging abilities, etc. + // Init hook (if set) will always be called at Arena init time. Init hook may + // return a pointer to a cookie to be stored in the arena. Reset and + // destruction hooks will then be called with the same cookie pointer. This + // allows us to save an external object per arena instance and use it on the + // other hooks (Note: If init hook returns NULL, the other hooks will NOT be + // called on this arena instance). + // on_arena_reset and on_arena_destruction also receive the space used in the + // arena just before the reset. + void* (*on_arena_init)(Arena* arena); + void (*on_arena_reset)(Arena* arena, void* cookie, uint64 space_used); + void (*on_arena_destruction)(Arena* arena, void* cookie, uint64 space_used); + + // type_info is promised to be static - its lifetime extends to + // match program's lifetime (It is given by typeid operator). + // Note: typeid(void) will be passed as allocated_type every time we + // intentionally want to avoid monitoring an allocation. (i.e. internal + // allocations for managing the arena) + void (*on_arena_allocation)(const std::type_info* allocated_type, + uint64 alloc_size, void* cookie); + + // Constants define default starting block size and max block size for + // arena allocator behavior -- see descriptions above. 
+ static const size_t kDefaultStartBlockSize = 256; + static const size_t kDefaultMaxBlockSize = 8192; + + friend void arena_metrics::EnableArenaMetrics(ArenaOptions*); + friend class Arena; + friend class ArenaOptionsTestFriend; +}; + +// Support for non-RTTI environments. (The metrics hooks API uses type +// information.) +#if PROTOBUF_RTTI +#define RTTI_TYPE_ID(type) (&typeid(type)) +#else +#define RTTI_TYPE_ID(type) (NULL) +#endif + +// Arena allocator. Arena allocation replaces ordinary (heap-based) allocation +// with new/delete, and improves performance by aggregating allocations into +// larger blocks and freeing allocations all at once. Protocol messages are +// allocated on an arena by using Arena::CreateMessage(Arena*), below, and +// are automatically freed when the arena is destroyed. +// +// This is a thread-safe implementation: multiple threads may allocate from the +// arena concurrently. Destruction is not thread-safe and the destructing +// thread must synchronize with users of the arena first. +// +// An arena provides two allocation interfaces: CreateMessage, which works +// for arena-enabled proto2 message types as well as other types that satisfy +// the appropriate protocol (described below), and Create, which works for +// any arbitrary type T. CreateMessage is better when the type T supports it, +// because this interface (i) passes the arena pointer to the created object so +// that its sub-objects and internal allocations can use the arena too, and (ii) +// elides the object's destructor call when possible. Create does not place +// any special requirements on the type T, and will invoke the object's +// destructor when the arena is destroyed. +// +// The arena message allocation protocol, required by +// CreateMessage(Arena* arena, Args&&... 
args), is as follows: +// +// - The type T must have (at least) two constructors: a constructor callable +// with `args` (without `arena`), called when a T is allocated on the heap; +// and a constructor callable with `Arena* arena, Args&&... args`, called when +// a T is allocated on an arena. If the second constructor is called with a +// NULL arena pointer, it must be equivalent to invoking the first +// (`args`-only) constructor. +// +// - The type T must have a particular type trait: a nested type +// |InternalArenaConstructable_|. This is usually a typedef to |void|. If no +// such type trait exists, then the instantiation CreateMessage will fail +// to compile. +// +// - The type T *may* have the type trait |DestructorSkippable_|. If this type +// trait is present in the type, then its destructor will not be called if and +// only if it was passed a non-NULL arena pointer. If this type trait is not +// present on the type, then its destructor is always called when the +// containing arena is destroyed. +// +// This protocol is implemented by all arena-enabled proto2 message classes as +// well as protobuf container types like RepeatedPtrField and Map. The protocol +// is internal to protobuf and is not guaranteed to be stable. Non-proto types +// should not rely on this protocol. +class PROTOBUF_EXPORT PROTOBUF_ALIGNAS(8) Arena final { + public: + // Arena constructor taking custom options. See ArenaOptions below for + // descriptions of the options available. + explicit Arena(const ArenaOptions& options) : impl_(options) { + Init(options); + } + + // Block overhead. Use this as a guide for how much to over-allocate the + // initial block if you want an allocation of size N to fit inside it. + // + // WARNING: if you allocate multiple objects, it is difficult to guarantee + // that a series of allocations will fit in the initial block, especially if + // Arena changes its alignment guarantees in the future! 
+ static const size_t kBlockOverhead = internal::ArenaImpl::kBlockHeaderSize + + internal::ArenaImpl::kSerialArenaSize; + + // Default constructor with sensible default options, tuned for average + // use-cases. + Arena() : impl_(ArenaOptions()) { Init(ArenaOptions()); } + + ~Arena() { + if (hooks_cookie_) { + CallDestructorHooks(); + } + } + + void Init(const ArenaOptions& options) { + on_arena_allocation_ = options.on_arena_allocation; + on_arena_reset_ = options.on_arena_reset; + on_arena_destruction_ = options.on_arena_destruction; + // Call the initialization hook + if (options.on_arena_init != NULL) { + hooks_cookie_ = options.on_arena_init(this); + } else { + hooks_cookie_ = NULL; + } + } + + // API to create proto2 message objects on the arena. If the arena passed in + // is NULL, then a heap allocated object is returned. Type T must be a message + // defined in a .proto file with cc_enable_arenas set to true, otherwise a + // compilation error will occur. + // + // RepeatedField and RepeatedPtrField may also be instantiated directly on an + // arena with this method. + // + // This function also accepts any type T that satisfies the arena message + // allocation protocol, documented above. + template + PROTOBUF_ALWAYS_INLINE static T* CreateMessage(Arena* arena, Args&&... args) { + static_assert( + InternalHelper::is_arena_constructable::value, + "CreateMessage can only construct types that are ArenaConstructable"); + // We must delegate to CreateMaybeMessage() and NOT CreateMessageInternal() + // because protobuf generated classes specialize CreateMaybeMessage() and we + // need to use that specialization for code size reasons. + return Arena::CreateMaybeMessage(arena, std::forward(args)...); + } + + // API to create any objects on the arena. Note that only the object will + // be created on the arena; the underlying ptrs (in case of a proto2 message) + // will be still heap allocated. 
Proto messages should usually be allocated + // with CreateMessage() instead. + // + // Note that even if T satisfies the arena message construction protocol + // (InternalArenaConstructable_ trait and optional DestructorSkippable_ + // trait), as described above, this function does not follow the protocol; + // instead, it treats T as a black-box type, just as if it did not have these + // traits. Specifically, T's constructor arguments will always be only those + // passed to Create() -- no additional arena pointer is implicitly added. + // Furthermore, the destructor will always be called at arena destruction time + // (unless the destructor is trivial). Hence, from T's point of view, it is as + // if the object were allocated on the heap (except that the underlying memory + // is obtained from the arena). + template + PROTOBUF_ALWAYS_INLINE static T* Create(Arena* arena, Args&&... args) { + return CreateNoMessage(arena, is_arena_constructable(), + std::forward(args)...); + } + + // Create an array of object type T on the arena *without* invoking the + // constructor of T. If `arena` is null, then the return value should be freed + // with `delete[] x;` (or `::operator delete[](x);`). + // To ensure safe uses, this function checks at compile time + // (when compiled as C++11) that T is trivially default-constructible and + // trivially destructible. 
+ template + PROTOBUF_ALWAYS_INLINE static T* CreateArray(Arena* arena, + size_t num_elements) { + static_assert(std::is_pod::value, + "CreateArray requires a trivially constructible type"); + static_assert(std::is_trivially_destructible::value, + "CreateArray requires a trivially destructible type"); + GOOGLE_CHECK_LE(num_elements, std::numeric_limits::max() / sizeof(T)) + << "Requested size is too large to fit into size_t."; + if (arena == NULL) { + return static_cast(::operator new[](num_elements * sizeof(T))); + } else { + return arena->CreateInternalRawArray(num_elements); + } + } + + // Returns the total space allocated by the arena, which is the sum of the + // sizes of the underlying blocks. This method is relatively fast; a counter + // is kept as blocks are allocated. + uint64 SpaceAllocated() const { return impl_.SpaceAllocated(); } + // Returns the total space used by the arena. Similar to SpaceAllocated but + // does not include free space and block overhead. The total space returned + // may not include space used by other threads executing concurrently with + // the call to this method. + uint64 SpaceUsed() const { return impl_.SpaceUsed(); } + + // Frees all storage allocated by this arena after calling destructors + // registered with OwnDestructor() and freeing objects registered with Own(). + // Any objects allocated on this arena are unusable after this call. It also + // returns the total space used by the arena which is the sums of the sizes + // of the allocated blocks. This method is not thread-safe. + PROTOBUF_NOINLINE uint64 Reset() { + // Call the reset hook + if (on_arena_reset_ != NULL) { + on_arena_reset_(this, hooks_cookie_, impl_.SpaceAllocated()); + } + return impl_.Reset(); + } + + // Adds |object| to a list of heap-allocated objects to be freed with |delete| + // when the arena is destroyed or reset. 
+ template + PROTOBUF_NOINLINE void Own(T* object) { + OwnInternal(object, std::is_convertible()); + } + + // Adds |object| to a list of objects whose destructors will be manually + // called when the arena is destroyed or reset. This differs from Own() in + // that it does not free the underlying memory with |delete|; hence, it is + // normally only used for objects that are placement-newed into + // arena-allocated memory. + template + PROTOBUF_NOINLINE void OwnDestructor(T* object) { + if (object != NULL) { + impl_.AddCleanup(object, &internal::arena_destruct_object); + } + } + + // Adds a custom member function on an object to the list of destructors that + // will be manually called when the arena is destroyed or reset. This differs + // from OwnDestructor() in that any member function may be specified, not only + // the class destructor. + PROTOBUF_NOINLINE void OwnCustomDestructor(void* object, + void (*destruct)(void*)) { + impl_.AddCleanup(object, destruct); + } + + // Retrieves the arena associated with |value| if |value| is an arena-capable + // message, or NULL otherwise. If possible, the call resolves at compile time. + // Note that we can often devirtualize calls to `value->GetArena()` so usually + // calling this method is unnecessary. 
+ template + PROTOBUF_ALWAYS_INLINE static Arena* GetArena(const T* value) { + return GetArenaInternal(value); + } + + template + class InternalHelper { + template + static char DestructorSkippable(const typename U::DestructorSkippable_*); + template + static double DestructorSkippable(...); + + typedef std::integral_constant< + bool, sizeof(DestructorSkippable(static_cast(0))) == + sizeof(char) || + std::is_trivially_destructible::value> + is_destructor_skippable; + + template + static char ArenaConstructable( + const typename U::InternalArenaConstructable_*); + template + static double ArenaConstructable(...); + + typedef std::integral_constant( + static_cast(0))) == + sizeof(char)> + is_arena_constructable; + + template () + .GetArena())>::value, + int>::type = 0> + static char HasGetArena(decltype(&U::GetArena)); + template + static double HasGetArena(...); + + typedef std::integral_constant(nullptr)) == + sizeof(char)> + has_get_arena; + + template + static T* Construct(void* ptr, Args&&... args) { + return new (ptr) T(std::forward(args)...); + } + + static Arena* GetArena(const T* p) { return p->GetArena(); } + + friend class Arena; + }; + + // Helper typetraits that indicates support for arenas in a type T at compile + // time. This is public only to allow construction of higher-level templated + // utilities. + // + // is_arena_constructable::value is true if the message type T has arena + // support enabled, and false otherwise. + // + // is_destructor_skippable::value is true if the message type T has told + // the arena that it is safe to skip the destructor, and false otherwise. + // + // This is inside Arena because only Arena has the friend relationships + // necessary to see the underlying generated code traits. 
+ template + struct is_arena_constructable : InternalHelper::is_arena_constructable {}; + template + struct is_destructor_skippable : InternalHelper::is_destructor_skippable { + }; + + private: + template + struct has_get_arena : InternalHelper::has_get_arena {}; + + template + PROTOBUF_ALWAYS_INLINE static T* CreateMessageInternal(Arena* arena, + Args&&... args) { + static_assert( + InternalHelper::is_arena_constructable::value, + "CreateMessage can only construct types that are ArenaConstructable"); + if (arena == NULL) { + return new T(nullptr, std::forward(args)...); + } else { + return arena->DoCreateMessage(std::forward(args)...); + } + } + + // This specialization for no arguments is necessary, because its behavior is + // slightly different. When the arena pointer is nullptr, it calls T() + // instead of T(nullptr). + template + PROTOBUF_ALWAYS_INLINE static T* CreateMessageInternal(Arena* arena) { + static_assert( + InternalHelper::is_arena_constructable::value, + "CreateMessage can only construct types that are ArenaConstructable"); + if (arena == NULL) { + return new T(); + } else { + return arena->DoCreateMessage(); + } + } + + template + PROTOBUF_ALWAYS_INLINE static T* CreateInternal(Arena* arena, + Args&&... args) { + if (arena == NULL) { + return new T(std::forward(args)...); + } else { + return arena->DoCreate(std::is_trivially_destructible::value, + std::forward(args)...); + } + } + + void CallDestructorHooks(); + void OnArenaAllocation(const std::type_info* allocated_type, size_t n) const; + inline void AllocHook(const std::type_info* allocated_type, size_t n) const { + if (PROTOBUF_PREDICT_FALSE(hooks_cookie_ != NULL)) { + OnArenaAllocation(allocated_type, n); + } + } + + // Allocate and also optionally call on_arena_allocation callback with the + // allocated type info when the hooks are in place in ArenaOptions and + // the cookie is not null. 
+ template + PROTOBUF_ALWAYS_INLINE void* AllocateInternal(bool skip_explicit_ownership) { + static_assert(alignof(T) <= 8, "T is overaligned, see b/151247138"); + const size_t n = internal::AlignUpTo8(sizeof(T)); + AllocHook(RTTI_TYPE_ID(T), n); + // Monitor allocation if needed. + if (skip_explicit_ownership) { + return AllocateAlignedNoHook(n); + } else { + return impl_.AllocateAlignedAndAddCleanup( + n, &internal::arena_destruct_object); + } + } + + // CreateMessage requires that T supports arenas, but this private method + // works whether or not T supports arenas. These are not exposed to user code + // as it can cause confusing API usages, and end up having double free in + // user code. These are used only internally from LazyField and Repeated + // fields, since they are designed to work in all mode combinations. + template + PROTOBUF_ALWAYS_INLINE static Msg* DoCreateMaybeMessage(Arena* arena, + std::true_type, + Args&&... args) { + return CreateMessageInternal(arena, std::forward(args)...); + } + + template + PROTOBUF_ALWAYS_INLINE static T* DoCreateMaybeMessage(Arena* arena, + std::false_type, + Args&&... args) { + return CreateInternal(arena, std::forward(args)...); + } + + template + PROTOBUF_ALWAYS_INLINE static T* CreateMaybeMessage(Arena* arena, + Args&&... args) { + return DoCreateMaybeMessage(arena, is_arena_constructable(), + std::forward(args)...); + } + + template + PROTOBUF_ALWAYS_INLINE static T* CreateNoMessage(Arena* arena, std::true_type, + Args&&... args) { + // User is constructing with Create() despite the fact that T supports arena + // construction. In this case we have to delegate to CreateInternal(), and + // we can't use any CreateMaybeMessage() specialization that may be defined. + return CreateInternal(arena, std::forward(args)...); + } + + template + PROTOBUF_ALWAYS_INLINE static T* CreateNoMessage(Arena* arena, + std::false_type, + Args&&... 
args) { + // User is constructing with Create() and the type does not support arena + // construction. In this case we can delegate to CreateMaybeMessage() and + // use any specialization that may be available for that. + return CreateMaybeMessage(arena, std::forward(args)...); + } + + // Just allocate the required size for the given type assuming the + // type has a trivial constructor. + template + PROTOBUF_ALWAYS_INLINE T* CreateInternalRawArray(size_t num_elements) { + GOOGLE_CHECK_LE(num_elements, std::numeric_limits::max() / sizeof(T)) + << "Requested size is too large to fit into size_t."; + const size_t n = internal::AlignUpTo8(sizeof(T) * num_elements); + // Monitor allocation if needed. + AllocHook(RTTI_TYPE_ID(T), n); + return static_cast(AllocateAlignedNoHook(n)); + } + + template + PROTOBUF_ALWAYS_INLINE T* DoCreate(bool skip_explicit_ownership, + Args&&... args) { + return new (AllocateInternal(skip_explicit_ownership)) + T(std::forward(args)...); + } + template + PROTOBUF_ALWAYS_INLINE T* DoCreateMessage(Args&&... args) { + return InternalHelper::Construct( + AllocateInternal(InternalHelper::is_destructor_skippable::value), + this, std::forward(args)...); + } + + // CreateInArenaStorage is used to implement map field. Without it, + // Map need to call generated message's protected arena constructor, + // which needs to declare Map as friend of generated message. + template + static void CreateInArenaStorage(T* ptr, Arena* arena, Args&&... args) { + CreateInArenaStorageInternal(ptr, arena, + typename is_arena_constructable::type(), + std::forward(args)...); + RegisterDestructorInternal( + ptr, arena, + typename InternalHelper::is_destructor_skippable::type()); + } + + template + static void CreateInArenaStorageInternal(T* ptr, Arena* arena, + std::true_type, Args&&... args) { + InternalHelper::Construct(ptr, arena, std::forward(args)...); + } + template + static void CreateInArenaStorageInternal(T* ptr, Arena* /* arena */, + std::false_type, Args&&... 
args) { + new (ptr) T(std::forward(args)...); + } + + template + static void RegisterDestructorInternal(T* /* ptr */, Arena* /* arena */, + std::true_type) {} + template + static void RegisterDestructorInternal(T* ptr, Arena* arena, + std::false_type) { + arena->OwnDestructor(ptr); + } + + // These implement Own(), which registers an object for deletion (destructor + // call and operator delete()). The second parameter has type 'true_type' if T + // is a subtype of Message and 'false_type' otherwise. Collapsing + // all template instantiations to one for generic Message reduces code size, + // using the virtual destructor instead. + template + PROTOBUF_ALWAYS_INLINE void OwnInternal(T* object, std::true_type) { + if (object != NULL) { + impl_.AddCleanup(object, &internal::arena_delete_object); + } + } + template + PROTOBUF_ALWAYS_INLINE void OwnInternal(T* object, std::false_type) { + if (object != NULL) { + impl_.AddCleanup(object, &internal::arena_delete_object); + } + } + + // Implementation for GetArena(). Only message objects with + // InternalArenaConstructable_ tags can be associated with an arena, and such + // objects must implement a GetArena() method. + template ::value, int>::type = 0> + PROTOBUF_ALWAYS_INLINE static Arena* GetArenaInternal(const T* value) { + return InternalHelper::GetArena(value); + } + template ::value && + has_get_arena::value, + int>::type = 0> + PROTOBUF_ALWAYS_INLINE static Arena* GetArenaInternal(const T* value) { + return value->GetArena(); + } + template ::value && + !has_get_arena::value, + int>::type = 0> + PROTOBUF_ALWAYS_INLINE static Arena* GetArenaInternal(const T* value) { + (void)value; + return nullptr; + } + + // For friends of arena. 
+ void* AllocateAligned(size_t n) { + AllocHook(NULL, n); + return AllocateAlignedNoHook(internal::AlignUpTo8(n)); + } + template + void* AllocateAlignedTo(size_t n) { + static_assert(Align > 0, "Alignment must be greater than 0"); + static_assert((Align & (Align - 1)) == 0, "Alignment must be power of two"); + if (Align <= 8) return AllocateAligned(n); + // TODO(b/151247138): if the pointer would have been aligned already, + // this is wasting space. We should pass the alignment down. + uintptr_t ptr = reinterpret_cast(AllocateAligned(n + Align - 8)); + ptr = (ptr + Align - 1) & -Align; + return reinterpret_cast(ptr); + } + + void* AllocateAlignedNoHook(size_t n); + + internal::ArenaImpl impl_; + + void (*on_arena_allocation_)(const std::type_info* allocated_type, + uint64 alloc_size, void* cookie); + void (*on_arena_reset_)(Arena* arena, void* cookie, uint64 space_used); + void (*on_arena_destruction_)(Arena* arena, void* cookie, uint64 space_used); + + // The arena may save a cookie it receives from the external on_init hook + // and then use it when calling the on_reset and on_destruction hooks. + void* hooks_cookie_; + + template + friend class internal::GenericTypeHandler; + friend struct internal::ArenaStringPtr; // For AllocateAligned. + friend class internal::LazyField; // For CreateMaybeMessage. + friend class internal::EpsCopyInputStream; // For parser performance + friend class MessageLite; + template + friend class Map; +}; + +// Defined above for supporting environments without RTTI. +#undef RTTI_TYPE_ID + +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_ARENA_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/descriptor.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/descriptor.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..9eb2b2e55df09165755d5977cef55d75725ab2d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/descriptor.pb.h @@ -0,0 +1,12958 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: google/protobuf/descriptor.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fdescriptor_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fdescriptor_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fdescriptor_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. 
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fdescriptor_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[27] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fdescriptor_2eproto; +PROTOBUF_NAMESPACE_OPEN +class DescriptorProto; +class DescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern DescriptorProtoDefaultTypeInternal _DescriptorProto_default_instance_; +class DescriptorProto_ExtensionRange; +class DescriptorProto_ExtensionRangeDefaultTypeInternal; +PROTOBUF_EXPORT extern DescriptorProto_ExtensionRangeDefaultTypeInternal _DescriptorProto_ExtensionRange_default_instance_; +class DescriptorProto_ReservedRange; +class DescriptorProto_ReservedRangeDefaultTypeInternal; +PROTOBUF_EXPORT extern DescriptorProto_ReservedRangeDefaultTypeInternal _DescriptorProto_ReservedRange_default_instance_; +class EnumDescriptorProto; +class EnumDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern EnumDescriptorProtoDefaultTypeInternal _EnumDescriptorProto_default_instance_; +class EnumDescriptorProto_EnumReservedRange; +class EnumDescriptorProto_EnumReservedRangeDefaultTypeInternal; +PROTOBUF_EXPORT extern EnumDescriptorProto_EnumReservedRangeDefaultTypeInternal _EnumDescriptorProto_EnumReservedRange_default_instance_; +class EnumOptions; +class EnumOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern EnumOptionsDefaultTypeInternal 
_EnumOptions_default_instance_; +class EnumValueDescriptorProto; +class EnumValueDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern EnumValueDescriptorProtoDefaultTypeInternal _EnumValueDescriptorProto_default_instance_; +class EnumValueOptions; +class EnumValueOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern EnumValueOptionsDefaultTypeInternal _EnumValueOptions_default_instance_; +class ExtensionRangeOptions; +class ExtensionRangeOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern ExtensionRangeOptionsDefaultTypeInternal _ExtensionRangeOptions_default_instance_; +class FieldDescriptorProto; +class FieldDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern FieldDescriptorProtoDefaultTypeInternal _FieldDescriptorProto_default_instance_; +class FieldOptions; +class FieldOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern FieldOptionsDefaultTypeInternal _FieldOptions_default_instance_; +class FileDescriptorProto; +class FileDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern FileDescriptorProtoDefaultTypeInternal _FileDescriptorProto_default_instance_; +class FileDescriptorSet; +class FileDescriptorSetDefaultTypeInternal; +PROTOBUF_EXPORT extern FileDescriptorSetDefaultTypeInternal _FileDescriptorSet_default_instance_; +class FileOptions; +class FileOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern FileOptionsDefaultTypeInternal _FileOptions_default_instance_; +class GeneratedCodeInfo; +class GeneratedCodeInfoDefaultTypeInternal; +PROTOBUF_EXPORT extern GeneratedCodeInfoDefaultTypeInternal _GeneratedCodeInfo_default_instance_; +class GeneratedCodeInfo_Annotation; +class GeneratedCodeInfo_AnnotationDefaultTypeInternal; +PROTOBUF_EXPORT extern GeneratedCodeInfo_AnnotationDefaultTypeInternal _GeneratedCodeInfo_Annotation_default_instance_; +class MessageOptions; +class MessageOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern MessageOptionsDefaultTypeInternal _MessageOptions_default_instance_; +class MethodDescriptorProto; +class 
MethodDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern MethodDescriptorProtoDefaultTypeInternal _MethodDescriptorProto_default_instance_; +class MethodOptions; +class MethodOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern MethodOptionsDefaultTypeInternal _MethodOptions_default_instance_; +class OneofDescriptorProto; +class OneofDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern OneofDescriptorProtoDefaultTypeInternal _OneofDescriptorProto_default_instance_; +class OneofOptions; +class OneofOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern OneofOptionsDefaultTypeInternal _OneofOptions_default_instance_; +class ServiceDescriptorProto; +class ServiceDescriptorProtoDefaultTypeInternal; +PROTOBUF_EXPORT extern ServiceDescriptorProtoDefaultTypeInternal _ServiceDescriptorProto_default_instance_; +class ServiceOptions; +class ServiceOptionsDefaultTypeInternal; +PROTOBUF_EXPORT extern ServiceOptionsDefaultTypeInternal _ServiceOptions_default_instance_; +class SourceCodeInfo; +class SourceCodeInfoDefaultTypeInternal; +PROTOBUF_EXPORT extern SourceCodeInfoDefaultTypeInternal _SourceCodeInfo_default_instance_; +class SourceCodeInfo_Location; +class SourceCodeInfo_LocationDefaultTypeInternal; +PROTOBUF_EXPORT extern SourceCodeInfo_LocationDefaultTypeInternal _SourceCodeInfo_Location_default_instance_; +class UninterpretedOption; +class UninterpretedOptionDefaultTypeInternal; +PROTOBUF_EXPORT extern UninterpretedOptionDefaultTypeInternal _UninterpretedOption_default_instance_; +class UninterpretedOption_NamePart; +class UninterpretedOption_NamePartDefaultTypeInternal; +PROTOBUF_EXPORT extern UninterpretedOption_NamePartDefaultTypeInternal _UninterpretedOption_NamePart_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::DescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* Arena::CreateMaybeMessage(Arena*); 
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumValueOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FieldOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FileDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FileDescriptorSet* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FileOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::MessageOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::MethodOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::OneofOptions* 
Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ServiceOptions* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::SourceCodeInfo* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::UninterpretedOption* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +enum FieldDescriptorProto_Type : int { + FieldDescriptorProto_Type_TYPE_DOUBLE = 1, + FieldDescriptorProto_Type_TYPE_FLOAT = 2, + FieldDescriptorProto_Type_TYPE_INT64 = 3, + FieldDescriptorProto_Type_TYPE_UINT64 = 4, + FieldDescriptorProto_Type_TYPE_INT32 = 5, + FieldDescriptorProto_Type_TYPE_FIXED64 = 6, + FieldDescriptorProto_Type_TYPE_FIXED32 = 7, + FieldDescriptorProto_Type_TYPE_BOOL = 8, + FieldDescriptorProto_Type_TYPE_STRING = 9, + FieldDescriptorProto_Type_TYPE_GROUP = 10, + FieldDescriptorProto_Type_TYPE_MESSAGE = 11, + FieldDescriptorProto_Type_TYPE_BYTES = 12, + FieldDescriptorProto_Type_TYPE_UINT32 = 13, + FieldDescriptorProto_Type_TYPE_ENUM = 14, + FieldDescriptorProto_Type_TYPE_SFIXED32 = 15, + FieldDescriptorProto_Type_TYPE_SFIXED64 = 16, + FieldDescriptorProto_Type_TYPE_SINT32 = 17, + FieldDescriptorProto_Type_TYPE_SINT64 = 18 +}; +PROTOBUF_EXPORT bool FieldDescriptorProto_Type_IsValid(int value); +constexpr FieldDescriptorProto_Type FieldDescriptorProto_Type_Type_MIN = FieldDescriptorProto_Type_TYPE_DOUBLE; +constexpr FieldDescriptorProto_Type FieldDescriptorProto_Type_Type_MAX = FieldDescriptorProto_Type_TYPE_SINT64; +constexpr int FieldDescriptorProto_Type_Type_ARRAYSIZE = FieldDescriptorProto_Type_Type_MAX + 1; + 
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldDescriptorProto_Type_descriptor(); +template +inline const std::string& FieldDescriptorProto_Type_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function FieldDescriptorProto_Type_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + FieldDescriptorProto_Type_descriptor(), enum_t_value); +} +inline bool FieldDescriptorProto_Type_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldDescriptorProto_Type* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + FieldDescriptorProto_Type_descriptor(), name, value); +} +enum FieldDescriptorProto_Label : int { + FieldDescriptorProto_Label_LABEL_OPTIONAL = 1, + FieldDescriptorProto_Label_LABEL_REQUIRED = 2, + FieldDescriptorProto_Label_LABEL_REPEATED = 3 +}; +PROTOBUF_EXPORT bool FieldDescriptorProto_Label_IsValid(int value); +constexpr FieldDescriptorProto_Label FieldDescriptorProto_Label_Label_MIN = FieldDescriptorProto_Label_LABEL_OPTIONAL; +constexpr FieldDescriptorProto_Label FieldDescriptorProto_Label_Label_MAX = FieldDescriptorProto_Label_LABEL_REPEATED; +constexpr int FieldDescriptorProto_Label_Label_ARRAYSIZE = FieldDescriptorProto_Label_Label_MAX + 1; + +PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldDescriptorProto_Label_descriptor(); +template +inline const std::string& FieldDescriptorProto_Label_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function FieldDescriptorProto_Label_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + FieldDescriptorProto_Label_descriptor(), enum_t_value); +} +inline bool FieldDescriptorProto_Label_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldDescriptorProto_Label* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + FieldDescriptorProto_Label_descriptor(), name, 
value); +} +enum FileOptions_OptimizeMode : int { + FileOptions_OptimizeMode_SPEED = 1, + FileOptions_OptimizeMode_CODE_SIZE = 2, + FileOptions_OptimizeMode_LITE_RUNTIME = 3 +}; +PROTOBUF_EXPORT bool FileOptions_OptimizeMode_IsValid(int value); +constexpr FileOptions_OptimizeMode FileOptions_OptimizeMode_OptimizeMode_MIN = FileOptions_OptimizeMode_SPEED; +constexpr FileOptions_OptimizeMode FileOptions_OptimizeMode_OptimizeMode_MAX = FileOptions_OptimizeMode_LITE_RUNTIME; +constexpr int FileOptions_OptimizeMode_OptimizeMode_ARRAYSIZE = FileOptions_OptimizeMode_OptimizeMode_MAX + 1; + +PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FileOptions_OptimizeMode_descriptor(); +template +inline const std::string& FileOptions_OptimizeMode_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function FileOptions_OptimizeMode_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + FileOptions_OptimizeMode_descriptor(), enum_t_value); +} +inline bool FileOptions_OptimizeMode_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FileOptions_OptimizeMode* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + FileOptions_OptimizeMode_descriptor(), name, value); +} +enum FieldOptions_CType : int { + FieldOptions_CType_STRING = 0, + FieldOptions_CType_CORD = 1, + FieldOptions_CType_STRING_PIECE = 2 +}; +PROTOBUF_EXPORT bool FieldOptions_CType_IsValid(int value); +constexpr FieldOptions_CType FieldOptions_CType_CType_MIN = FieldOptions_CType_STRING; +constexpr FieldOptions_CType FieldOptions_CType_CType_MAX = FieldOptions_CType_STRING_PIECE; +constexpr int FieldOptions_CType_CType_ARRAYSIZE = FieldOptions_CType_CType_MAX + 1; + +PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldOptions_CType_descriptor(); +template +inline const std::string& FieldOptions_CType_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + 
"Incorrect type passed to function FieldOptions_CType_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + FieldOptions_CType_descriptor(), enum_t_value); +} +inline bool FieldOptions_CType_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldOptions_CType* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + FieldOptions_CType_descriptor(), name, value); +} +enum FieldOptions_JSType : int { + FieldOptions_JSType_JS_NORMAL = 0, + FieldOptions_JSType_JS_STRING = 1, + FieldOptions_JSType_JS_NUMBER = 2 +}; +PROTOBUF_EXPORT bool FieldOptions_JSType_IsValid(int value); +constexpr FieldOptions_JSType FieldOptions_JSType_JSType_MIN = FieldOptions_JSType_JS_NORMAL; +constexpr FieldOptions_JSType FieldOptions_JSType_JSType_MAX = FieldOptions_JSType_JS_NUMBER; +constexpr int FieldOptions_JSType_JSType_ARRAYSIZE = FieldOptions_JSType_JSType_MAX + 1; + +PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldOptions_JSType_descriptor(); +template +inline const std::string& FieldOptions_JSType_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function FieldOptions_JSType_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + FieldOptions_JSType_descriptor(), enum_t_value); +} +inline bool FieldOptions_JSType_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldOptions_JSType* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + FieldOptions_JSType_descriptor(), name, value); +} +enum MethodOptions_IdempotencyLevel : int { + MethodOptions_IdempotencyLevel_IDEMPOTENCY_UNKNOWN = 0, + MethodOptions_IdempotencyLevel_NO_SIDE_EFFECTS = 1, + MethodOptions_IdempotencyLevel_IDEMPOTENT = 2 +}; +PROTOBUF_EXPORT bool MethodOptions_IdempotencyLevel_IsValid(int value); +constexpr MethodOptions_IdempotencyLevel MethodOptions_IdempotencyLevel_IdempotencyLevel_MIN = MethodOptions_IdempotencyLevel_IDEMPOTENCY_UNKNOWN; +constexpr 
MethodOptions_IdempotencyLevel MethodOptions_IdempotencyLevel_IdempotencyLevel_MAX = MethodOptions_IdempotencyLevel_IDEMPOTENT; +constexpr int MethodOptions_IdempotencyLevel_IdempotencyLevel_ARRAYSIZE = MethodOptions_IdempotencyLevel_IdempotencyLevel_MAX + 1; + +PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* MethodOptions_IdempotencyLevel_descriptor(); +template +inline const std::string& MethodOptions_IdempotencyLevel_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function MethodOptions_IdempotencyLevel_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + MethodOptions_IdempotencyLevel_descriptor(), enum_t_value); +} +inline bool MethodOptions_IdempotencyLevel_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, MethodOptions_IdempotencyLevel* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + MethodOptions_IdempotencyLevel_descriptor(), name, value); +} +// =================================================================== + +class PROTOBUF_EXPORT FileDescriptorSet PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FileDescriptorSet) */ { + public: + inline FileDescriptorSet() : FileDescriptorSet(nullptr) {} + virtual ~FileDescriptorSet(); + + FileDescriptorSet(const FileDescriptorSet& from); + FileDescriptorSet(FileDescriptorSet&& from) noexcept + : FileDescriptorSet() { + *this = ::std::move(from); + } + + inline FileDescriptorSet& operator=(const FileDescriptorSet& from) { + CopyFrom(from); + return *this; + } + inline FileDescriptorSet& operator=(FileDescriptorSet&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return 
_internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const FileDescriptorSet& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const FileDescriptorSet* internal_default_instance() { + return reinterpret_cast( + &_FileDescriptorSet_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(FileDescriptorSet& a, FileDescriptorSet& b) { + a.Swap(&b); + } + inline void Swap(FileDescriptorSet* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(FileDescriptorSet* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline FileDescriptorSet* New() const final { + return CreateMaybeMessage(nullptr); + } + + FileDescriptorSet* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const FileDescriptorSet& from); + void MergeFrom(const FileDescriptorSet& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() 
final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(FileDescriptorSet* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.FileDescriptorSet"; + } + protected: + explicit FileDescriptorSet(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kFileFieldNumber = 1, + }; + // repeated .google.protobuf.FileDescriptorProto file = 1; + int file_size() const; + private: + int _internal_file_size() const; + public: + void clear_file(); + PROTOBUF_NAMESPACE_ID::FileDescriptorProto* mutable_file(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >* + mutable_file(); + private: + const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& _internal_file(int index) const; + 
PROTOBUF_NAMESPACE_ID::FileDescriptorProto* _internal_add_file(); + public: + const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& file(int index) const; + PROTOBUF_NAMESPACE_ID::FileDescriptorProto* add_file(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >& + file() const; + + // @@protoc_insertion_point(class_scope:google.protobuf.FileDescriptorSet) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto > file_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT FileDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FileDescriptorProto) */ { + public: + inline FileDescriptorProto() : FileDescriptorProto(nullptr) {} + virtual ~FileDescriptorProto(); + + FileDescriptorProto(const FileDescriptorProto& from); + FileDescriptorProto(FileDescriptorProto&& from) noexcept + : FileDescriptorProto() { + *this = ::std::move(from); + } + + inline FileDescriptorProto& operator=(const FileDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline FileDescriptorProto& operator=(FileDescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* 
mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const FileDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const FileDescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_FileDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 1; + + friend void swap(FileDescriptorProto& a, FileDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(FileDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(FileDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline FileDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + FileDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const FileDescriptorProto& from); + void MergeFrom(const FileDescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, 
::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(FileDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.FileDescriptorProto"; + } + protected: + explicit FileDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kDependencyFieldNumber = 3, + kMessageTypeFieldNumber = 4, + kEnumTypeFieldNumber = 5, + kServiceFieldNumber = 6, + kExtensionFieldNumber = 7, + kPublicDependencyFieldNumber = 10, + kWeakDependencyFieldNumber = 11, + kNameFieldNumber = 1, + kPackageFieldNumber = 2, + kSyntaxFieldNumber = 12, + kOptionsFieldNumber = 8, + kSourceCodeInfoFieldNumber = 9, + }; + // repeated string dependency = 3; + int dependency_size() const; + private: + int _internal_dependency_size() const; + public: + void clear_dependency(); + const std::string& dependency(int index) const; + std::string* 
mutable_dependency(int index); + void set_dependency(int index, const std::string& value); + void set_dependency(int index, std::string&& value); + void set_dependency(int index, const char* value); + void set_dependency(int index, const char* value, size_t size); + std::string* add_dependency(); + void add_dependency(const std::string& value); + void add_dependency(std::string&& value); + void add_dependency(const char* value); + void add_dependency(const char* value, size_t size); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& dependency() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* mutable_dependency(); + private: + const std::string& _internal_dependency(int index) const; + std::string* _internal_add_dependency(); + public: + + // repeated .google.protobuf.DescriptorProto message_type = 4; + int message_type_size() const; + private: + int _internal_message_type_size() const; + public: + void clear_message_type(); + PROTOBUF_NAMESPACE_ID::DescriptorProto* mutable_message_type(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >* + mutable_message_type(); + private: + const PROTOBUF_NAMESPACE_ID::DescriptorProto& _internal_message_type(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto* _internal_add_message_type(); + public: + const PROTOBUF_NAMESPACE_ID::DescriptorProto& message_type(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto* add_message_type(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >& + message_type() const; + + // repeated .google.protobuf.EnumDescriptorProto enum_type = 5; + int enum_type_size() const; + private: + int _internal_enum_type_size() const; + public: + void clear_enum_type(); + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* mutable_enum_type(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >* + mutable_enum_type(); + private: + const 
PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& _internal_enum_type(int index) const; + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* _internal_add_enum_type(); + public: + const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& enum_type(int index) const; + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* add_enum_type(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >& + enum_type() const; + + // repeated .google.protobuf.ServiceDescriptorProto service = 6; + int service_size() const; + private: + int _internal_service_size() const; + public: + void clear_service(); + PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* mutable_service(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >* + mutable_service(); + private: + const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& _internal_service(int index) const; + PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* _internal_add_service(); + public: + const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& service(int index) const; + PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* add_service(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >& + service() const; + + // repeated .google.protobuf.FieldDescriptorProto extension = 7; + int extension_size() const; + private: + int _internal_extension_size() const; + public: + void clear_extension(); + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* mutable_extension(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >* + mutable_extension(); + private: + const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& _internal_extension(int index) const; + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* _internal_add_extension(); + public: + const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& extension(int index) const; + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* add_extension(); + const 
::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >& + extension() const; + + // repeated int32 public_dependency = 10; + int public_dependency_size() const; + private: + int _internal_public_dependency_size() const; + public: + void clear_public_dependency(); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_public_dependency(int index) const; + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + _internal_public_dependency() const; + void _internal_add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value); + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + _internal_mutable_public_dependency(); + public: + ::PROTOBUF_NAMESPACE_ID::int32 public_dependency(int index) const; + void set_public_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value); + void add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value); + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + public_dependency() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + mutable_public_dependency(); + + // repeated int32 weak_dependency = 11; + int weak_dependency_size() const; + private: + int _internal_weak_dependency_size() const; + public: + void clear_weak_dependency(); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_weak_dependency(int index) const; + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + _internal_weak_dependency() const; + void _internal_add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value); + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + _internal_mutable_weak_dependency(); + public: + ::PROTOBUF_NAMESPACE_ID::int32 weak_dependency(int index) const; + void set_weak_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value); + void add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value); + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< 
::PROTOBUF_NAMESPACE_ID::int32 >& + weak_dependency() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + mutable_weak_dependency(); + + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional string package = 2; + bool has_package() const; + private: + bool _internal_has_package() const; + public: + void clear_package(); + const std::string& package() const; + void set_package(const std::string& value); + void set_package(std::string&& value); + void set_package(const char* value); + void set_package(const char* value, size_t size); + std::string* mutable_package(); + std::string* release_package(); + void set_allocated_package(std::string* package); + private: + const std::string& _internal_package() const; + void _internal_set_package(const std::string& value); + std::string* _internal_mutable_package(); + public: + + // optional string syntax = 12; + bool has_syntax() const; + private: + bool _internal_has_syntax() const; + public: + void clear_syntax(); + const std::string& syntax() const; + void set_syntax(const std::string& value); + void set_syntax(std::string&& value); + void set_syntax(const char* value); + void set_syntax(const char* value, size_t size); + std::string* mutable_syntax(); + std::string* release_syntax(); + void set_allocated_syntax(std::string* syntax); + private: + const std::string& _internal_syntax() const; + void _internal_set_syntax(const std::string& 
value); + std::string* _internal_mutable_syntax(); + public: + + // optional .google.protobuf.FileOptions options = 8; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::FileOptions& options() const; + PROTOBUF_NAMESPACE_ID::FileOptions* release_options(); + PROTOBUF_NAMESPACE_ID::FileOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::FileOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::FileOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::FileOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::FileOptions* options); + PROTOBUF_NAMESPACE_ID::FileOptions* unsafe_arena_release_options(); + + // optional .google.protobuf.SourceCodeInfo source_code_info = 9; + bool has_source_code_info() const; + private: + bool _internal_has_source_code_info() const; + public: + void clear_source_code_info(); + const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& source_code_info() const; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* release_source_code_info(); + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* mutable_source_code_info(); + void set_allocated_source_code_info(PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info); + private: + const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& _internal_source_code_info() const; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* _internal_mutable_source_code_info(); + public: + void unsafe_arena_set_allocated_source_code_info( + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info); + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* unsafe_arena_release_source_code_info(); + + // @@protoc_insertion_point(class_scope:google.protobuf.FileDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + 
::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField dependency_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto > message_type_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto > enum_type_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto > service_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto > extension_; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > public_dependency_; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > weak_dependency_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr package_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr syntax_; + PROTOBUF_NAMESPACE_ID::FileOptions* options_; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT DescriptorProto_ExtensionRange PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.DescriptorProto.ExtensionRange) */ { + public: + inline DescriptorProto_ExtensionRange() : DescriptorProto_ExtensionRange(nullptr) {} + virtual ~DescriptorProto_ExtensionRange(); + + DescriptorProto_ExtensionRange(const DescriptorProto_ExtensionRange& from); + DescriptorProto_ExtensionRange(DescriptorProto_ExtensionRange&& from) noexcept + : DescriptorProto_ExtensionRange() { + *this = ::std::move(from); + } + + inline DescriptorProto_ExtensionRange& operator=(const DescriptorProto_ExtensionRange& from) { + CopyFrom(from); + return *this; + } + inline DescriptorProto_ExtensionRange& 
operator=(DescriptorProto_ExtensionRange&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const DescriptorProto_ExtensionRange& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const DescriptorProto_ExtensionRange* internal_default_instance() { + return reinterpret_cast( + &_DescriptorProto_ExtensionRange_default_instance_); + } + static constexpr int kIndexInFileMessages = + 2; + + friend void swap(DescriptorProto_ExtensionRange& a, DescriptorProto_ExtensionRange& b) { + a.Swap(&b); + } + inline void Swap(DescriptorProto_ExtensionRange* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(DescriptorProto_ExtensionRange* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline DescriptorProto_ExtensionRange* New() const final { + return CreateMaybeMessage(nullptr); + } + + 
DescriptorProto_ExtensionRange* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const DescriptorProto_ExtensionRange& from); + void MergeFrom(const DescriptorProto_ExtensionRange& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(DescriptorProto_ExtensionRange* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.DescriptorProto.ExtensionRange"; + } + protected: + explicit DescriptorProto_ExtensionRange(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { 
+ kOptionsFieldNumber = 3, + kStartFieldNumber = 1, + kEndFieldNumber = 2, + }; + // optional .google.protobuf.ExtensionRangeOptions options = 3; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& options() const; + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* release_options(); + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options); + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* unsafe_arena_release_options(); + + // optional int32 start = 1; + bool has_start() const; + private: + bool _internal_has_start() const; + public: + void clear_start(); + ::PROTOBUF_NAMESPACE_ID::int32 start() const; + void set_start(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_start() const; + void _internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // optional int32 end = 2; + bool has_end() const; + private: + bool _internal_has_end() const; + public: + void clear_end(); + ::PROTOBUF_NAMESPACE_ID::int32 end() const; + void set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const; + void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.DescriptorProto.ExtensionRange) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> 
_has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options_; + ::PROTOBUF_NAMESPACE_ID::int32 start_; + ::PROTOBUF_NAMESPACE_ID::int32 end_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT DescriptorProto_ReservedRange PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.DescriptorProto.ReservedRange) */ { + public: + inline DescriptorProto_ReservedRange() : DescriptorProto_ReservedRange(nullptr) {} + virtual ~DescriptorProto_ReservedRange(); + + DescriptorProto_ReservedRange(const DescriptorProto_ReservedRange& from); + DescriptorProto_ReservedRange(DescriptorProto_ReservedRange&& from) noexcept + : DescriptorProto_ReservedRange() { + *this = ::std::move(from); + } + + inline DescriptorProto_ReservedRange& operator=(const DescriptorProto_ReservedRange& from) { + CopyFrom(from); + return *this; + } + inline DescriptorProto_ReservedRange& operator=(DescriptorProto_ReservedRange&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* 
GetReflection() { + return GetMetadataStatic().reflection; + } + static const DescriptorProto_ReservedRange& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const DescriptorProto_ReservedRange* internal_default_instance() { + return reinterpret_cast( + &_DescriptorProto_ReservedRange_default_instance_); + } + static constexpr int kIndexInFileMessages = + 3; + + friend void swap(DescriptorProto_ReservedRange& a, DescriptorProto_ReservedRange& b) { + a.Swap(&b); + } + inline void Swap(DescriptorProto_ReservedRange* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(DescriptorProto_ReservedRange* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline DescriptorProto_ReservedRange* New() const final { + return CreateMaybeMessage(nullptr); + } + + DescriptorProto_ReservedRange* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const DescriptorProto_ReservedRange& from); + void MergeFrom(const DescriptorProto_ReservedRange& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void 
SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(DescriptorProto_ReservedRange* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.DescriptorProto.ReservedRange"; + } + protected: + explicit DescriptorProto_ReservedRange(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kStartFieldNumber = 1, + kEndFieldNumber = 2, + }; + // optional int32 start = 1; + bool has_start() const; + private: + bool _internal_has_start() const; + public: + void clear_start(); + ::PROTOBUF_NAMESPACE_ID::int32 start() const; + void set_start(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_start() const; + void _internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // optional int32 end = 2; + bool has_end() const; + private: + bool _internal_has_end() const; + public: + void clear_end(); + ::PROTOBUF_NAMESPACE_ID::int32 end() const; + void set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const; + void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // 
@@protoc_insertion_point(class_scope:google.protobuf.DescriptorProto.ReservedRange) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::int32 start_; + ::PROTOBUF_NAMESPACE_ID::int32 end_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT DescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.DescriptorProto) */ { + public: + inline DescriptorProto() : DescriptorProto(nullptr) {} + virtual ~DescriptorProto(); + + DescriptorProto(const DescriptorProto& from); + DescriptorProto(DescriptorProto&& from) noexcept + : DescriptorProto() { + *this = ::std::move(from); + } + + inline DescriptorProto& operator=(const DescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline DescriptorProto& operator=(DescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; 
+ } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const DescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const DescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_DescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 4; + + friend void swap(DescriptorProto& a, DescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(DescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(DescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline DescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + DescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const DescriptorProto& from); + void MergeFrom(const DescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void 
InternalSwap(DescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.DescriptorProto"; + } + protected: + explicit DescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef DescriptorProto_ExtensionRange ExtensionRange; + typedef DescriptorProto_ReservedRange ReservedRange; + + // accessors ------------------------------------------------------- + + enum : int { + kFieldFieldNumber = 2, + kNestedTypeFieldNumber = 3, + kEnumTypeFieldNumber = 4, + kExtensionRangeFieldNumber = 5, + kExtensionFieldNumber = 6, + kOneofDeclFieldNumber = 8, + kReservedRangeFieldNumber = 9, + kReservedNameFieldNumber = 10, + kNameFieldNumber = 1, + kOptionsFieldNumber = 7, + }; + // repeated .google.protobuf.FieldDescriptorProto field = 2; + int field_size() const; + private: + int _internal_field_size() const; + public: + void clear_field(); + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* mutable_field(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >* + mutable_field(); + private: + const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& _internal_field(int index) const; + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* _internal_add_field(); + public: + const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& field(int index) const; + 
PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* add_field(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >& + field() const; + + // repeated .google.protobuf.DescriptorProto nested_type = 3; + int nested_type_size() const; + private: + int _internal_nested_type_size() const; + public: + void clear_nested_type(); + PROTOBUF_NAMESPACE_ID::DescriptorProto* mutable_nested_type(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >* + mutable_nested_type(); + private: + const PROTOBUF_NAMESPACE_ID::DescriptorProto& _internal_nested_type(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto* _internal_add_nested_type(); + public: + const PROTOBUF_NAMESPACE_ID::DescriptorProto& nested_type(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto* add_nested_type(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >& + nested_type() const; + + // repeated .google.protobuf.EnumDescriptorProto enum_type = 4; + int enum_type_size() const; + private: + int _internal_enum_type_size() const; + public: + void clear_enum_type(); + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* mutable_enum_type(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >* + mutable_enum_type(); + private: + const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& _internal_enum_type(int index) const; + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* _internal_add_enum_type(); + public: + const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& enum_type(int index) const; + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* add_enum_type(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >& + enum_type() const; + + // repeated .google.protobuf.DescriptorProto.ExtensionRange extension_range = 5; + int extension_range_size() const; + private: + int _internal_extension_range_size() const; + public: + void 
clear_extension_range(); + PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* mutable_extension_range(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >* + mutable_extension_range(); + private: + const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& _internal_extension_range(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* _internal_add_extension_range(); + public: + const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& extension_range(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* add_extension_range(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >& + extension_range() const; + + // repeated .google.protobuf.FieldDescriptorProto extension = 6; + int extension_size() const; + private: + int _internal_extension_size() const; + public: + void clear_extension(); + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* mutable_extension(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >* + mutable_extension(); + private: + const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& _internal_extension(int index) const; + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* _internal_add_extension(); + public: + const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& extension(int index) const; + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* add_extension(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >& + extension() const; + + // repeated .google.protobuf.OneofDescriptorProto oneof_decl = 8; + int oneof_decl_size() const; + private: + int _internal_oneof_decl_size() const; + public: + void clear_oneof_decl(); + PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* mutable_oneof_decl(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >* + mutable_oneof_decl(); + 
private: + const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& _internal_oneof_decl(int index) const; + PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* _internal_add_oneof_decl(); + public: + const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& oneof_decl(int index) const; + PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* add_oneof_decl(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >& + oneof_decl() const; + + // repeated .google.protobuf.DescriptorProto.ReservedRange reserved_range = 9; + int reserved_range_size() const; + private: + int _internal_reserved_range_size() const; + public: + void clear_reserved_range(); + PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* mutable_reserved_range(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >* + mutable_reserved_range(); + private: + const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& _internal_reserved_range(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* _internal_add_reserved_range(); + public: + const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& reserved_range(int index) const; + PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* add_reserved_range(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >& + reserved_range() const; + + // repeated string reserved_name = 10; + int reserved_name_size() const; + private: + int _internal_reserved_name_size() const; + public: + void clear_reserved_name(); + const std::string& reserved_name(int index) const; + std::string* mutable_reserved_name(int index); + void set_reserved_name(int index, const std::string& value); + void set_reserved_name(int index, std::string&& value); + void set_reserved_name(int index, const char* value); + void set_reserved_name(int index, const char* value, size_t size); + std::string* add_reserved_name(); + void add_reserved_name(const 
std::string& value); + void add_reserved_name(std::string&& value); + void add_reserved_name(const char* value); + void add_reserved_name(const char* value, size_t size); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& reserved_name() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* mutable_reserved_name(); + private: + const std::string& _internal_reserved_name(int index) const; + std::string* _internal_add_reserved_name(); + public: + + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional .google.protobuf.MessageOptions options = 7; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::MessageOptions& options() const; + PROTOBUF_NAMESPACE_ID::MessageOptions* release_options(); + PROTOBUF_NAMESPACE_ID::MessageOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::MessageOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::MessageOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::MessageOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::MessageOptions* options); + PROTOBUF_NAMESPACE_ID::MessageOptions* unsafe_arena_release_options(); + + // @@protoc_insertion_point(class_scope:google.protobuf.DescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + 
typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto > field_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto > nested_type_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto > enum_type_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange > extension_range_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto > extension_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto > oneof_decl_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange > reserved_range_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField reserved_name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + PROTOBUF_NAMESPACE_ID::MessageOptions* options_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT ExtensionRangeOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ExtensionRangeOptions) */ { + public: + inline ExtensionRangeOptions() : ExtensionRangeOptions(nullptr) {} + virtual ~ExtensionRangeOptions(); + + ExtensionRangeOptions(const ExtensionRangeOptions& from); + ExtensionRangeOptions(ExtensionRangeOptions&& from) noexcept + : ExtensionRangeOptions() { + *this = ::std::move(from); + } + + inline ExtensionRangeOptions& operator=(const ExtensionRangeOptions& from) { + CopyFrom(from); + return *this; + } + inline ExtensionRangeOptions& operator=(ExtensionRangeOptions&& from) noexcept { + if (GetArena() == 
from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const ExtensionRangeOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const ExtensionRangeOptions* internal_default_instance() { + return reinterpret_cast( + &_ExtensionRangeOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 5; + + friend void swap(ExtensionRangeOptions& a, ExtensionRangeOptions& b) { + a.Swap(&b); + } + inline void Swap(ExtensionRangeOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(ExtensionRangeOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline ExtensionRangeOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + ExtensionRangeOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const 
::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const ExtensionRangeOptions& from); + void MergeFrom(const ExtensionRangeOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(ExtensionRangeOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.ExtensionRangeOptions"; + } + protected: + explicit ExtensionRangeOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int 
_internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(ExtensionRangeOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.ExtensionRangeOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT FieldDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FieldDescriptorProto) */ { + public: + inline FieldDescriptorProto() : FieldDescriptorProto(nullptr) {} + virtual ~FieldDescriptorProto(); + + FieldDescriptorProto(const FieldDescriptorProto& from); + FieldDescriptorProto(FieldDescriptorProto&& from) noexcept + : FieldDescriptorProto() { + *this = 
::std::move(from); + } + + inline FieldDescriptorProto& operator=(const FieldDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline FieldDescriptorProto& operator=(FieldDescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const FieldDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const FieldDescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_FieldDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 6; + + friend void swap(FieldDescriptorProto& a, FieldDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(FieldDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(FieldDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline 
FieldDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + FieldDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const FieldDescriptorProto& from); + void MergeFrom(const FieldDescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(FieldDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.FieldDescriptorProto"; + } + protected: + explicit FieldDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef FieldDescriptorProto_Type Type; + static constexpr 
Type TYPE_DOUBLE = + FieldDescriptorProto_Type_TYPE_DOUBLE; + static constexpr Type TYPE_FLOAT = + FieldDescriptorProto_Type_TYPE_FLOAT; + static constexpr Type TYPE_INT64 = + FieldDescriptorProto_Type_TYPE_INT64; + static constexpr Type TYPE_UINT64 = + FieldDescriptorProto_Type_TYPE_UINT64; + static constexpr Type TYPE_INT32 = + FieldDescriptorProto_Type_TYPE_INT32; + static constexpr Type TYPE_FIXED64 = + FieldDescriptorProto_Type_TYPE_FIXED64; + static constexpr Type TYPE_FIXED32 = + FieldDescriptorProto_Type_TYPE_FIXED32; + static constexpr Type TYPE_BOOL = + FieldDescriptorProto_Type_TYPE_BOOL; + static constexpr Type TYPE_STRING = + FieldDescriptorProto_Type_TYPE_STRING; + static constexpr Type TYPE_GROUP = + FieldDescriptorProto_Type_TYPE_GROUP; + static constexpr Type TYPE_MESSAGE = + FieldDescriptorProto_Type_TYPE_MESSAGE; + static constexpr Type TYPE_BYTES = + FieldDescriptorProto_Type_TYPE_BYTES; + static constexpr Type TYPE_UINT32 = + FieldDescriptorProto_Type_TYPE_UINT32; + static constexpr Type TYPE_ENUM = + FieldDescriptorProto_Type_TYPE_ENUM; + static constexpr Type TYPE_SFIXED32 = + FieldDescriptorProto_Type_TYPE_SFIXED32; + static constexpr Type TYPE_SFIXED64 = + FieldDescriptorProto_Type_TYPE_SFIXED64; + static constexpr Type TYPE_SINT32 = + FieldDescriptorProto_Type_TYPE_SINT32; + static constexpr Type TYPE_SINT64 = + FieldDescriptorProto_Type_TYPE_SINT64; + static inline bool Type_IsValid(int value) { + return FieldDescriptorProto_Type_IsValid(value); + } + static constexpr Type Type_MIN = + FieldDescriptorProto_Type_Type_MIN; + static constexpr Type Type_MAX = + FieldDescriptorProto_Type_Type_MAX; + static constexpr int Type_ARRAYSIZE = + FieldDescriptorProto_Type_Type_ARRAYSIZE; + static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* + Type_descriptor() { + return FieldDescriptorProto_Type_descriptor(); + } + template + static inline const std::string& Type_Name(T enum_t_value) { + static_assert(::std::is_same::value || + 
::std::is_integral::value, + "Incorrect type passed to function Type_Name."); + return FieldDescriptorProto_Type_Name(enum_t_value); + } + static inline bool Type_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name, + Type* value) { + return FieldDescriptorProto_Type_Parse(name, value); + } + + typedef FieldDescriptorProto_Label Label; + static constexpr Label LABEL_OPTIONAL = + FieldDescriptorProto_Label_LABEL_OPTIONAL; + static constexpr Label LABEL_REQUIRED = + FieldDescriptorProto_Label_LABEL_REQUIRED; + static constexpr Label LABEL_REPEATED = + FieldDescriptorProto_Label_LABEL_REPEATED; + static inline bool Label_IsValid(int value) { + return FieldDescriptorProto_Label_IsValid(value); + } + static constexpr Label Label_MIN = + FieldDescriptorProto_Label_Label_MIN; + static constexpr Label Label_MAX = + FieldDescriptorProto_Label_Label_MAX; + static constexpr int Label_ARRAYSIZE = + FieldDescriptorProto_Label_Label_ARRAYSIZE; + static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* + Label_descriptor() { + return FieldDescriptorProto_Label_descriptor(); + } + template + static inline const std::string& Label_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function Label_Name."); + return FieldDescriptorProto_Label_Name(enum_t_value); + } + static inline bool Label_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name, + Label* value) { + return FieldDescriptorProto_Label_Parse(name, value); + } + + // accessors ------------------------------------------------------- + + enum : int { + kNameFieldNumber = 1, + kExtendeeFieldNumber = 2, + kTypeNameFieldNumber = 6, + kDefaultValueFieldNumber = 7, + kJsonNameFieldNumber = 10, + kOptionsFieldNumber = 8, + kNumberFieldNumber = 3, + kOneofIndexFieldNumber = 9, + kProto3OptionalFieldNumber = 17, + kLabelFieldNumber = 4, + kTypeFieldNumber = 5, + }; + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() 
const; + public: + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional string extendee = 2; + bool has_extendee() const; + private: + bool _internal_has_extendee() const; + public: + void clear_extendee(); + const std::string& extendee() const; + void set_extendee(const std::string& value); + void set_extendee(std::string&& value); + void set_extendee(const char* value); + void set_extendee(const char* value, size_t size); + std::string* mutable_extendee(); + std::string* release_extendee(); + void set_allocated_extendee(std::string* extendee); + private: + const std::string& _internal_extendee() const; + void _internal_set_extendee(const std::string& value); + std::string* _internal_mutable_extendee(); + public: + + // optional string type_name = 6; + bool has_type_name() const; + private: + bool _internal_has_type_name() const; + public: + void clear_type_name(); + const std::string& type_name() const; + void set_type_name(const std::string& value); + void set_type_name(std::string&& value); + void set_type_name(const char* value); + void set_type_name(const char* value, size_t size); + std::string* mutable_type_name(); + std::string* release_type_name(); + void set_allocated_type_name(std::string* type_name); + private: + const std::string& _internal_type_name() const; + void _internal_set_type_name(const std::string& value); + std::string* _internal_mutable_type_name(); + public: + + // optional string default_value = 7; + bool has_default_value() const; + private: + bool _internal_has_default_value() const; + public: 
+ void clear_default_value(); + const std::string& default_value() const; + void set_default_value(const std::string& value); + void set_default_value(std::string&& value); + void set_default_value(const char* value); + void set_default_value(const char* value, size_t size); + std::string* mutable_default_value(); + std::string* release_default_value(); + void set_allocated_default_value(std::string* default_value); + private: + const std::string& _internal_default_value() const; + void _internal_set_default_value(const std::string& value); + std::string* _internal_mutable_default_value(); + public: + + // optional string json_name = 10; + bool has_json_name() const; + private: + bool _internal_has_json_name() const; + public: + void clear_json_name(); + const std::string& json_name() const; + void set_json_name(const std::string& value); + void set_json_name(std::string&& value); + void set_json_name(const char* value); + void set_json_name(const char* value, size_t size); + std::string* mutable_json_name(); + std::string* release_json_name(); + void set_allocated_json_name(std::string* json_name); + private: + const std::string& _internal_json_name() const; + void _internal_set_json_name(const std::string& value); + std::string* _internal_mutable_json_name(); + public: + + // optional .google.protobuf.FieldOptions options = 8; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::FieldOptions& options() const; + PROTOBUF_NAMESPACE_ID::FieldOptions* release_options(); + PROTOBUF_NAMESPACE_ID::FieldOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::FieldOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::FieldOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::FieldOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::FieldOptions* options); + 
PROTOBUF_NAMESPACE_ID::FieldOptions* unsafe_arena_release_options(); + + // optional int32 number = 3; + bool has_number() const; + private: + bool _internal_has_number() const; + public: + void clear_number(); + ::PROTOBUF_NAMESPACE_ID::int32 number() const; + void set_number(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_number() const; + void _internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // optional int32 oneof_index = 9; + bool has_oneof_index() const; + private: + bool _internal_has_oneof_index() const; + public: + void clear_oneof_index(); + ::PROTOBUF_NAMESPACE_ID::int32 oneof_index() const; + void set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_oneof_index() const; + void _internal_set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // optional bool proto3_optional = 17; + bool has_proto3_optional() const; + private: + bool _internal_has_proto3_optional() const; + public: + void clear_proto3_optional(); + bool proto3_optional() const; + void set_proto3_optional(bool value); + private: + bool _internal_proto3_optional() const; + void _internal_set_proto3_optional(bool value); + public: + + // optional .google.protobuf.FieldDescriptorProto.Label label = 4; + bool has_label() const; + private: + bool _internal_has_label() const; + public: + void clear_label(); + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label label() const; + void set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value); + private: + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label _internal_label() const; + void _internal_set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value); + public: + + // optional .google.protobuf.FieldDescriptorProto.Type type = 5; + bool has_type() const; + private: + bool _internal_has_type() const; + public: + void clear_type(); + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type type() const; 
+ void set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value); + private: + PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type _internal_type() const; + void _internal_set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.FieldDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr extendee_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr type_name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr default_value_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr json_name_; + PROTOBUF_NAMESPACE_ID::FieldOptions* options_; + ::PROTOBUF_NAMESPACE_ID::int32 number_; + ::PROTOBUF_NAMESPACE_ID::int32 oneof_index_; + bool proto3_optional_; + int label_; + int type_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT OneofDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.OneofDescriptorProto) */ { + public: + inline OneofDescriptorProto() : OneofDescriptorProto(nullptr) {} + virtual ~OneofDescriptorProto(); + + OneofDescriptorProto(const OneofDescriptorProto& from); + OneofDescriptorProto(OneofDescriptorProto&& from) noexcept + : OneofDescriptorProto() { + *this = ::std::move(from); + } + + inline OneofDescriptorProto& operator=(const OneofDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline OneofDescriptorProto& operator=(OneofDescriptorProto&& from) noexcept { + if 
(GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const OneofDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const OneofDescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_OneofDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 7; + + friend void swap(OneofDescriptorProto& a, OneofDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(OneofDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(OneofDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline OneofDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + OneofDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const 
::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const OneofDescriptorProto& from); + void MergeFrom(const OneofDescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(OneofDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.OneofDescriptorProto"; + } + protected: + explicit OneofDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kNameFieldNumber = 1, + kOptionsFieldNumber = 2, + }; + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const 
std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional .google.protobuf.OneofOptions options = 2; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::OneofOptions& options() const; + PROTOBUF_NAMESPACE_ID::OneofOptions* release_options(); + PROTOBUF_NAMESPACE_ID::OneofOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::OneofOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::OneofOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::OneofOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::OneofOptions* options); + PROTOBUF_NAMESPACE_ID::OneofOptions* unsafe_arena_release_options(); + + // @@protoc_insertion_point(class_scope:google.protobuf.OneofDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + PROTOBUF_NAMESPACE_ID::OneofOptions* options_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT EnumDescriptorProto_EnumReservedRange PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* 
@@protoc_insertion_point(class_definition:google.protobuf.EnumDescriptorProto.EnumReservedRange) */ { + public: + inline EnumDescriptorProto_EnumReservedRange() : EnumDescriptorProto_EnumReservedRange(nullptr) {} + virtual ~EnumDescriptorProto_EnumReservedRange(); + + EnumDescriptorProto_EnumReservedRange(const EnumDescriptorProto_EnumReservedRange& from); + EnumDescriptorProto_EnumReservedRange(EnumDescriptorProto_EnumReservedRange&& from) noexcept + : EnumDescriptorProto_EnumReservedRange() { + *this = ::std::move(from); + } + + inline EnumDescriptorProto_EnumReservedRange& operator=(const EnumDescriptorProto_EnumReservedRange& from) { + CopyFrom(from); + return *this; + } + inline EnumDescriptorProto_EnumReservedRange& operator=(EnumDescriptorProto_EnumReservedRange&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const EnumDescriptorProto_EnumReservedRange& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const EnumDescriptorProto_EnumReservedRange* internal_default_instance() { + return reinterpret_cast( + &_EnumDescriptorProto_EnumReservedRange_default_instance_); + } 
+ static constexpr int kIndexInFileMessages = + 8; + + friend void swap(EnumDescriptorProto_EnumReservedRange& a, EnumDescriptorProto_EnumReservedRange& b) { + a.Swap(&b); + } + inline void Swap(EnumDescriptorProto_EnumReservedRange* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(EnumDescriptorProto_EnumReservedRange* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline EnumDescriptorProto_EnumReservedRange* New() const final { + return CreateMaybeMessage(nullptr); + } + + EnumDescriptorProto_EnumReservedRange* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const EnumDescriptorProto_EnumReservedRange& from); + void MergeFrom(const EnumDescriptorProto_EnumReservedRange& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(EnumDescriptorProto_EnumReservedRange* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return 
"google.protobuf.EnumDescriptorProto.EnumReservedRange"; + } + protected: + explicit EnumDescriptorProto_EnumReservedRange(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kStartFieldNumber = 1, + kEndFieldNumber = 2, + }; + // optional int32 start = 1; + bool has_start() const; + private: + bool _internal_has_start() const; + public: + void clear_start(); + ::PROTOBUF_NAMESPACE_ID::int32 start() const; + void set_start(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_start() const; + void _internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // optional int32 end = 2; + bool has_end() const; + private: + bool _internal_has_end() const; + public: + void clear_end(); + ::PROTOBUF_NAMESPACE_ID::int32 end() const; + void set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const; + void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.EnumDescriptorProto.EnumReservedRange) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + 
mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::int32 start_; + ::PROTOBUF_NAMESPACE_ID::int32 end_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT EnumDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumDescriptorProto) */ { + public: + inline EnumDescriptorProto() : EnumDescriptorProto(nullptr) {} + virtual ~EnumDescriptorProto(); + + EnumDescriptorProto(const EnumDescriptorProto& from); + EnumDescriptorProto(EnumDescriptorProto&& from) noexcept + : EnumDescriptorProto() { + *this = ::std::move(from); + } + + inline EnumDescriptorProto& operator=(const EnumDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline EnumDescriptorProto& operator=(EnumDescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const EnumDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const 
EnumDescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_EnumDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 9; + + friend void swap(EnumDescriptorProto& a, EnumDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(EnumDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(EnumDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline EnumDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + EnumDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const EnumDescriptorProto& from); + void MergeFrom(const EnumDescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(EnumDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.EnumDescriptorProto"; + } + 
protected: + explicit EnumDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef EnumDescriptorProto_EnumReservedRange EnumReservedRange; + + // accessors ------------------------------------------------------- + + enum : int { + kValueFieldNumber = 2, + kReservedRangeFieldNumber = 4, + kReservedNameFieldNumber = 5, + kNameFieldNumber = 1, + kOptionsFieldNumber = 3, + }; + // repeated .google.protobuf.EnumValueDescriptorProto value = 2; + int value_size() const; + private: + int _internal_value_size() const; + public: + void clear_value(); + PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* mutable_value(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >* + mutable_value(); + private: + const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& _internal_value(int index) const; + PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* _internal_add_value(); + public: + const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& value(int index) const; + PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* add_value(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >& + value() const; + + // repeated .google.protobuf.EnumDescriptorProto.EnumReservedRange reserved_range = 4; + int reserved_range_size() const; + private: + int _internal_reserved_range_size() const; + public: + void 
clear_reserved_range(); + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* mutable_reserved_range(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >* + mutable_reserved_range(); + private: + const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange& _internal_reserved_range(int index) const; + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* _internal_add_reserved_range(); + public: + const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange& reserved_range(int index) const; + PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* add_reserved_range(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >& + reserved_range() const; + + // repeated string reserved_name = 5; + int reserved_name_size() const; + private: + int _internal_reserved_name_size() const; + public: + void clear_reserved_name(); + const std::string& reserved_name(int index) const; + std::string* mutable_reserved_name(int index); + void set_reserved_name(int index, const std::string& value); + void set_reserved_name(int index, std::string&& value); + void set_reserved_name(int index, const char* value); + void set_reserved_name(int index, const char* value, size_t size); + std::string* add_reserved_name(); + void add_reserved_name(const std::string& value); + void add_reserved_name(std::string&& value); + void add_reserved_name(const char* value); + void add_reserved_name(const char* value, size_t size); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& reserved_name() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* mutable_reserved_name(); + private: + const std::string& _internal_reserved_name(int index) const; + std::string* _internal_add_reserved_name(); + public: + + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const 
std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional .google.protobuf.EnumOptions options = 3; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::EnumOptions& options() const; + PROTOBUF_NAMESPACE_ID::EnumOptions* release_options(); + PROTOBUF_NAMESPACE_ID::EnumOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::EnumOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::EnumOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::EnumOptions* options); + PROTOBUF_NAMESPACE_ID::EnumOptions* unsafe_arena_release_options(); + + // @@protoc_insertion_point(class_scope:google.protobuf.EnumDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto > value_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange > reserved_range_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField reserved_name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + PROTOBUF_NAMESPACE_ID::EnumOptions* options_; + 
friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT EnumValueDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumValueDescriptorProto) */ { + public: + inline EnumValueDescriptorProto() : EnumValueDescriptorProto(nullptr) {} + virtual ~EnumValueDescriptorProto(); + + EnumValueDescriptorProto(const EnumValueDescriptorProto& from); + EnumValueDescriptorProto(EnumValueDescriptorProto&& from) noexcept + : EnumValueDescriptorProto() { + *this = ::std::move(from); + } + + inline EnumValueDescriptorProto& operator=(const EnumValueDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline EnumValueDescriptorProto& operator=(EnumValueDescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const EnumValueDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const EnumValueDescriptorProto* internal_default_instance() { + return reinterpret_cast( + 
&_EnumValueDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 10; + + friend void swap(EnumValueDescriptorProto& a, EnumValueDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(EnumValueDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(EnumValueDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline EnumValueDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + EnumValueDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const EnumValueDescriptorProto& from); + void MergeFrom(const EnumValueDescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(EnumValueDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.EnumValueDescriptorProto"; + } + protected: + explicit 
EnumValueDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kNameFieldNumber = 1, + kOptionsFieldNumber = 3, + kNumberFieldNumber = 2, + }; + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional .google.protobuf.EnumValueOptions options = 3; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::EnumValueOptions& options() const; + PROTOBUF_NAMESPACE_ID::EnumValueOptions* release_options(); + PROTOBUF_NAMESPACE_ID::EnumValueOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumValueOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::EnumValueOptions& _internal_options() const; + 
PROTOBUF_NAMESPACE_ID::EnumValueOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::EnumValueOptions* options); + PROTOBUF_NAMESPACE_ID::EnumValueOptions* unsafe_arena_release_options(); + + // optional int32 number = 2; + bool has_number() const; + private: + bool _internal_has_number() const; + public: + void clear_number(); + ::PROTOBUF_NAMESPACE_ID::int32 number() const; + void set_number(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_number() const; + void _internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.EnumValueDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + PROTOBUF_NAMESPACE_ID::EnumValueOptions* options_; + ::PROTOBUF_NAMESPACE_ID::int32 number_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT ServiceDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ServiceDescriptorProto) */ { + public: + inline ServiceDescriptorProto() : ServiceDescriptorProto(nullptr) {} + virtual ~ServiceDescriptorProto(); + + ServiceDescriptorProto(const ServiceDescriptorProto& from); + ServiceDescriptorProto(ServiceDescriptorProto&& from) noexcept + : ServiceDescriptorProto() { + *this = ::std::move(from); + } + + inline ServiceDescriptorProto& operator=(const ServiceDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline 
ServiceDescriptorProto& operator=(ServiceDescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const ServiceDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const ServiceDescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_ServiceDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 11; + + friend void swap(ServiceDescriptorProto& a, ServiceDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(ServiceDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(ServiceDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline ServiceDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + ServiceDescriptorProto* 
New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const ServiceDescriptorProto& from); + void MergeFrom(const ServiceDescriptorProto& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(ServiceDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.ServiceDescriptorProto"; + } + protected: + explicit ServiceDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kMethodFieldNumber = 2, + kNameFieldNumber = 1, + kOptionsFieldNumber 
= 3, + }; + // repeated .google.protobuf.MethodDescriptorProto method = 2; + int method_size() const; + private: + int _internal_method_size() const; + public: + void clear_method(); + PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* mutable_method(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >* + mutable_method(); + private: + const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto& _internal_method(int index) const; + PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* _internal_add_method(); + public: + const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto& method(int index) const; + PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* add_method(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >& + method() const; + + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional .google.protobuf.ServiceOptions options = 3; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const PROTOBUF_NAMESPACE_ID::ServiceOptions& options() const; + PROTOBUF_NAMESPACE_ID::ServiceOptions* release_options(); + PROTOBUF_NAMESPACE_ID::ServiceOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::ServiceOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::ServiceOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::ServiceOptions* 
_internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::ServiceOptions* options); + PROTOBUF_NAMESPACE_ID::ServiceOptions* unsafe_arena_release_options(); + + // @@protoc_insertion_point(class_scope:google.protobuf.ServiceDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto > method_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + PROTOBUF_NAMESPACE_ID::ServiceOptions* options_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT MethodDescriptorProto PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.MethodDescriptorProto) */ { + public: + inline MethodDescriptorProto() : MethodDescriptorProto(nullptr) {} + virtual ~MethodDescriptorProto(); + + MethodDescriptorProto(const MethodDescriptorProto& from); + MethodDescriptorProto(MethodDescriptorProto&& from) noexcept + : MethodDescriptorProto() { + *this = ::std::move(from); + } + + inline MethodDescriptorProto& operator=(const MethodDescriptorProto& from) { + CopyFrom(from); + return *this; + } + inline MethodDescriptorProto& operator=(MethodDescriptorProto&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return 
_internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const MethodDescriptorProto& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const MethodDescriptorProto* internal_default_instance() { + return reinterpret_cast( + &_MethodDescriptorProto_default_instance_); + } + static constexpr int kIndexInFileMessages = + 12; + + friend void swap(MethodDescriptorProto& a, MethodDescriptorProto& b) { + a.Swap(&b); + } + inline void Swap(MethodDescriptorProto* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(MethodDescriptorProto* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline MethodDescriptorProto* New() const final { + return CreateMaybeMessage(nullptr); + } + + MethodDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const MethodDescriptorProto& from); + void MergeFrom(const MethodDescriptorProto& from); + 
PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(MethodDescriptorProto* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.MethodDescriptorProto"; + } + protected: + explicit MethodDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kNameFieldNumber = 1, + kInputTypeFieldNumber = 2, + kOutputTypeFieldNumber = 3, + kOptionsFieldNumber = 4, + kClientStreamingFieldNumber = 5, + kServerStreamingFieldNumber = 6, + }; + // optional string name = 1; + bool has_name() const; + private: + bool _internal_has_name() const; + public: + void clear_name(); + const std::string& name() const; + void set_name(const std::string& value); + void 
set_name(std::string&& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + std::string* mutable_name(); + std::string* release_name(); + void set_allocated_name(std::string* name); + private: + const std::string& _internal_name() const; + void _internal_set_name(const std::string& value); + std::string* _internal_mutable_name(); + public: + + // optional string input_type = 2; + bool has_input_type() const; + private: + bool _internal_has_input_type() const; + public: + void clear_input_type(); + const std::string& input_type() const; + void set_input_type(const std::string& value); + void set_input_type(std::string&& value); + void set_input_type(const char* value); + void set_input_type(const char* value, size_t size); + std::string* mutable_input_type(); + std::string* release_input_type(); + void set_allocated_input_type(std::string* input_type); + private: + const std::string& _internal_input_type() const; + void _internal_set_input_type(const std::string& value); + std::string* _internal_mutable_input_type(); + public: + + // optional string output_type = 3; + bool has_output_type() const; + private: + bool _internal_has_output_type() const; + public: + void clear_output_type(); + const std::string& output_type() const; + void set_output_type(const std::string& value); + void set_output_type(std::string&& value); + void set_output_type(const char* value); + void set_output_type(const char* value, size_t size); + std::string* mutable_output_type(); + std::string* release_output_type(); + void set_allocated_output_type(std::string* output_type); + private: + const std::string& _internal_output_type() const; + void _internal_set_output_type(const std::string& value); + std::string* _internal_mutable_output_type(); + public: + + // optional .google.protobuf.MethodOptions options = 4; + bool has_options() const; + private: + bool _internal_has_options() const; + public: + void clear_options(); + const 
PROTOBUF_NAMESPACE_ID::MethodOptions& options() const; + PROTOBUF_NAMESPACE_ID::MethodOptions* release_options(); + PROTOBUF_NAMESPACE_ID::MethodOptions* mutable_options(); + void set_allocated_options(PROTOBUF_NAMESPACE_ID::MethodOptions* options); + private: + const PROTOBUF_NAMESPACE_ID::MethodOptions& _internal_options() const; + PROTOBUF_NAMESPACE_ID::MethodOptions* _internal_mutable_options(); + public: + void unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::MethodOptions* options); + PROTOBUF_NAMESPACE_ID::MethodOptions* unsafe_arena_release_options(); + + // optional bool client_streaming = 5 [default = false]; + bool has_client_streaming() const; + private: + bool _internal_has_client_streaming() const; + public: + void clear_client_streaming(); + bool client_streaming() const; + void set_client_streaming(bool value); + private: + bool _internal_client_streaming() const; + void _internal_set_client_streaming(bool value); + public: + + // optional bool server_streaming = 6 [default = false]; + bool has_server_streaming() const; + private: + bool _internal_has_server_streaming() const; + public: + void clear_server_streaming(); + bool server_streaming() const; + void set_server_streaming(bool value); + private: + bool _internal_server_streaming() const; + void _internal_set_server_streaming(bool value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.MethodDescriptorProto) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr input_type_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr output_type_; + PROTOBUF_NAMESPACE_ID::MethodOptions* options_; + bool 
client_streaming_; + bool server_streaming_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT FileOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FileOptions) */ { + public: + inline FileOptions() : FileOptions(nullptr) {} + virtual ~FileOptions(); + + FileOptions(const FileOptions& from); + FileOptions(FileOptions&& from) noexcept + : FileOptions() { + *this = ::std::move(from); + } + + inline FileOptions& operator=(const FileOptions& from) { + CopyFrom(from); + return *this; + } + inline FileOptions& operator=(FileOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const FileOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const FileOptions* internal_default_instance() { + return reinterpret_cast( + &_FileOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 13; + + friend void swap(FileOptions& a, FileOptions& b) { + a.Swap(&b); + } 
+ inline void Swap(FileOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(FileOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline FileOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + FileOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const FileOptions& from); + void MergeFrom(const FileOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(FileOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.FileOptions"; + } + protected: + explicit FileOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata 
GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef FileOptions_OptimizeMode OptimizeMode; + static constexpr OptimizeMode SPEED = + FileOptions_OptimizeMode_SPEED; + static constexpr OptimizeMode CODE_SIZE = + FileOptions_OptimizeMode_CODE_SIZE; + static constexpr OptimizeMode LITE_RUNTIME = + FileOptions_OptimizeMode_LITE_RUNTIME; + static inline bool OptimizeMode_IsValid(int value) { + return FileOptions_OptimizeMode_IsValid(value); + } + static constexpr OptimizeMode OptimizeMode_MIN = + FileOptions_OptimizeMode_OptimizeMode_MIN; + static constexpr OptimizeMode OptimizeMode_MAX = + FileOptions_OptimizeMode_OptimizeMode_MAX; + static constexpr int OptimizeMode_ARRAYSIZE = + FileOptions_OptimizeMode_OptimizeMode_ARRAYSIZE; + static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* + OptimizeMode_descriptor() { + return FileOptions_OptimizeMode_descriptor(); + } + template + static inline const std::string& OptimizeMode_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function OptimizeMode_Name."); + return FileOptions_OptimizeMode_Name(enum_t_value); + } + static inline bool OptimizeMode_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name, + OptimizeMode* value) { + return FileOptions_OptimizeMode_Parse(name, value); + } + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kJavaPackageFieldNumber = 1, + kJavaOuterClassnameFieldNumber = 8, + kGoPackageFieldNumber = 11, + kObjcClassPrefixFieldNumber = 36, + kCsharpNamespaceFieldNumber = 37, + kSwiftPrefixFieldNumber = 39, + kPhpClassPrefixFieldNumber = 40, + 
kPhpNamespaceFieldNumber = 41, + kPhpMetadataNamespaceFieldNumber = 44, + kRubyPackageFieldNumber = 45, + kJavaMultipleFilesFieldNumber = 10, + kJavaGenerateEqualsAndHashFieldNumber = 20, + kJavaStringCheckUtf8FieldNumber = 27, + kCcGenericServicesFieldNumber = 16, + kJavaGenericServicesFieldNumber = 17, + kPyGenericServicesFieldNumber = 18, + kPhpGenericServicesFieldNumber = 42, + kDeprecatedFieldNumber = 23, + kOptimizeForFieldNumber = 9, + kCcEnableArenasFieldNumber = 31, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional string java_package = 1; + bool has_java_package() const; + private: + bool _internal_has_java_package() const; + public: + void clear_java_package(); + const std::string& java_package() const; + void set_java_package(const std::string& value); + void set_java_package(std::string&& value); + void set_java_package(const char* value); + void set_java_package(const char* value, size_t size); + std::string* mutable_java_package(); + std::string* release_java_package(); + void set_allocated_java_package(std::string* java_package); + private: + const 
std::string& _internal_java_package() const; + void _internal_set_java_package(const std::string& value); + std::string* _internal_mutable_java_package(); + public: + + // optional string java_outer_classname = 8; + bool has_java_outer_classname() const; + private: + bool _internal_has_java_outer_classname() const; + public: + void clear_java_outer_classname(); + const std::string& java_outer_classname() const; + void set_java_outer_classname(const std::string& value); + void set_java_outer_classname(std::string&& value); + void set_java_outer_classname(const char* value); + void set_java_outer_classname(const char* value, size_t size); + std::string* mutable_java_outer_classname(); + std::string* release_java_outer_classname(); + void set_allocated_java_outer_classname(std::string* java_outer_classname); + private: + const std::string& _internal_java_outer_classname() const; + void _internal_set_java_outer_classname(const std::string& value); + std::string* _internal_mutable_java_outer_classname(); + public: + + // optional string go_package = 11; + bool has_go_package() const; + private: + bool _internal_has_go_package() const; + public: + void clear_go_package(); + const std::string& go_package() const; + void set_go_package(const std::string& value); + void set_go_package(std::string&& value); + void set_go_package(const char* value); + void set_go_package(const char* value, size_t size); + std::string* mutable_go_package(); + std::string* release_go_package(); + void set_allocated_go_package(std::string* go_package); + private: + const std::string& _internal_go_package() const; + void _internal_set_go_package(const std::string& value); + std::string* _internal_mutable_go_package(); + public: + + // optional string objc_class_prefix = 36; + bool has_objc_class_prefix() const; + private: + bool _internal_has_objc_class_prefix() const; + public: + void clear_objc_class_prefix(); + const std::string& objc_class_prefix() const; + void set_objc_class_prefix(const 
std::string& value); + void set_objc_class_prefix(std::string&& value); + void set_objc_class_prefix(const char* value); + void set_objc_class_prefix(const char* value, size_t size); + std::string* mutable_objc_class_prefix(); + std::string* release_objc_class_prefix(); + void set_allocated_objc_class_prefix(std::string* objc_class_prefix); + private: + const std::string& _internal_objc_class_prefix() const; + void _internal_set_objc_class_prefix(const std::string& value); + std::string* _internal_mutable_objc_class_prefix(); + public: + + // optional string csharp_namespace = 37; + bool has_csharp_namespace() const; + private: + bool _internal_has_csharp_namespace() const; + public: + void clear_csharp_namespace(); + const std::string& csharp_namespace() const; + void set_csharp_namespace(const std::string& value); + void set_csharp_namespace(std::string&& value); + void set_csharp_namespace(const char* value); + void set_csharp_namespace(const char* value, size_t size); + std::string* mutable_csharp_namespace(); + std::string* release_csharp_namespace(); + void set_allocated_csharp_namespace(std::string* csharp_namespace); + private: + const std::string& _internal_csharp_namespace() const; + void _internal_set_csharp_namespace(const std::string& value); + std::string* _internal_mutable_csharp_namespace(); + public: + + // optional string swift_prefix = 39; + bool has_swift_prefix() const; + private: + bool _internal_has_swift_prefix() const; + public: + void clear_swift_prefix(); + const std::string& swift_prefix() const; + void set_swift_prefix(const std::string& value); + void set_swift_prefix(std::string&& value); + void set_swift_prefix(const char* value); + void set_swift_prefix(const char* value, size_t size); + std::string* mutable_swift_prefix(); + std::string* release_swift_prefix(); + void set_allocated_swift_prefix(std::string* swift_prefix); + private: + const std::string& _internal_swift_prefix() const; + void _internal_set_swift_prefix(const 
std::string& value); + std::string* _internal_mutable_swift_prefix(); + public: + + // optional string php_class_prefix = 40; + bool has_php_class_prefix() const; + private: + bool _internal_has_php_class_prefix() const; + public: + void clear_php_class_prefix(); + const std::string& php_class_prefix() const; + void set_php_class_prefix(const std::string& value); + void set_php_class_prefix(std::string&& value); + void set_php_class_prefix(const char* value); + void set_php_class_prefix(const char* value, size_t size); + std::string* mutable_php_class_prefix(); + std::string* release_php_class_prefix(); + void set_allocated_php_class_prefix(std::string* php_class_prefix); + private: + const std::string& _internal_php_class_prefix() const; + void _internal_set_php_class_prefix(const std::string& value); + std::string* _internal_mutable_php_class_prefix(); + public: + + // optional string php_namespace = 41; + bool has_php_namespace() const; + private: + bool _internal_has_php_namespace() const; + public: + void clear_php_namespace(); + const std::string& php_namespace() const; + void set_php_namespace(const std::string& value); + void set_php_namespace(std::string&& value); + void set_php_namespace(const char* value); + void set_php_namespace(const char* value, size_t size); + std::string* mutable_php_namespace(); + std::string* release_php_namespace(); + void set_allocated_php_namespace(std::string* php_namespace); + private: + const std::string& _internal_php_namespace() const; + void _internal_set_php_namespace(const std::string& value); + std::string* _internal_mutable_php_namespace(); + public: + + // optional string php_metadata_namespace = 44; + bool has_php_metadata_namespace() const; + private: + bool _internal_has_php_metadata_namespace() const; + public: + void clear_php_metadata_namespace(); + const std::string& php_metadata_namespace() const; + void set_php_metadata_namespace(const std::string& value); + void set_php_metadata_namespace(std::string&& 
value); + void set_php_metadata_namespace(const char* value); + void set_php_metadata_namespace(const char* value, size_t size); + std::string* mutable_php_metadata_namespace(); + std::string* release_php_metadata_namespace(); + void set_allocated_php_metadata_namespace(std::string* php_metadata_namespace); + private: + const std::string& _internal_php_metadata_namespace() const; + void _internal_set_php_metadata_namespace(const std::string& value); + std::string* _internal_mutable_php_metadata_namespace(); + public: + + // optional string ruby_package = 45; + bool has_ruby_package() const; + private: + bool _internal_has_ruby_package() const; + public: + void clear_ruby_package(); + const std::string& ruby_package() const; + void set_ruby_package(const std::string& value); + void set_ruby_package(std::string&& value); + void set_ruby_package(const char* value); + void set_ruby_package(const char* value, size_t size); + std::string* mutable_ruby_package(); + std::string* release_ruby_package(); + void set_allocated_ruby_package(std::string* ruby_package); + private: + const std::string& _internal_ruby_package() const; + void _internal_set_ruby_package(const std::string& value); + std::string* _internal_mutable_ruby_package(); + public: + + // optional bool java_multiple_files = 10 [default = false]; + bool has_java_multiple_files() const; + private: + bool _internal_has_java_multiple_files() const; + public: + void clear_java_multiple_files(); + bool java_multiple_files() const; + void set_java_multiple_files(bool value); + private: + bool _internal_java_multiple_files() const; + void _internal_set_java_multiple_files(bool value); + public: + + // optional bool java_generate_equals_and_hash = 20 [deprecated = true]; + PROTOBUF_DEPRECATED bool has_java_generate_equals_and_hash() const; + private: + bool _internal_has_java_generate_equals_and_hash() const; + public: + PROTOBUF_DEPRECATED void clear_java_generate_equals_and_hash(); + PROTOBUF_DEPRECATED bool 
java_generate_equals_and_hash() const; + PROTOBUF_DEPRECATED void set_java_generate_equals_and_hash(bool value); + private: + bool _internal_java_generate_equals_and_hash() const; + void _internal_set_java_generate_equals_and_hash(bool value); + public: + + // optional bool java_string_check_utf8 = 27 [default = false]; + bool has_java_string_check_utf8() const; + private: + bool _internal_has_java_string_check_utf8() const; + public: + void clear_java_string_check_utf8(); + bool java_string_check_utf8() const; + void set_java_string_check_utf8(bool value); + private: + bool _internal_java_string_check_utf8() const; + void _internal_set_java_string_check_utf8(bool value); + public: + + // optional bool cc_generic_services = 16 [default = false]; + bool has_cc_generic_services() const; + private: + bool _internal_has_cc_generic_services() const; + public: + void clear_cc_generic_services(); + bool cc_generic_services() const; + void set_cc_generic_services(bool value); + private: + bool _internal_cc_generic_services() const; + void _internal_set_cc_generic_services(bool value); + public: + + // optional bool java_generic_services = 17 [default = false]; + bool has_java_generic_services() const; + private: + bool _internal_has_java_generic_services() const; + public: + void clear_java_generic_services(); + bool java_generic_services() const; + void set_java_generic_services(bool value); + private: + bool _internal_java_generic_services() const; + void _internal_set_java_generic_services(bool value); + public: + + // optional bool py_generic_services = 18 [default = false]; + bool has_py_generic_services() const; + private: + bool _internal_has_py_generic_services() const; + public: + void clear_py_generic_services(); + bool py_generic_services() const; + void set_py_generic_services(bool value); + private: + bool _internal_py_generic_services() const; + void _internal_set_py_generic_services(bool value); + public: + + // optional bool php_generic_services = 42 
[default = false]; + bool has_php_generic_services() const; + private: + bool _internal_has_php_generic_services() const; + public: + void clear_php_generic_services(); + bool php_generic_services() const; + void set_php_generic_services(bool value); + private: + bool _internal_php_generic_services() const; + void _internal_set_php_generic_services(bool value); + public: + + // optional bool deprecated = 23 [default = false]; + bool has_deprecated() const; + private: + bool _internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + // optional .google.protobuf.FileOptions.OptimizeMode optimize_for = 9 [default = SPEED]; + bool has_optimize_for() const; + private: + bool _internal_has_optimize_for() const; + public: + void clear_optimize_for(); + PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode optimize_for() const; + void set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value); + private: + PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode _internal_optimize_for() const; + void _internal_set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value); + public: + + // optional bool cc_enable_arenas = 31 [default = true]; + bool has_cc_enable_arenas() const; + private: + bool _internal_has_cc_enable_arenas() const; + public: + void clear_cc_enable_arenas(); + bool cc_enable_arenas() const; + void set_cc_enable_arenas(bool value); + private: + bool _internal_cc_enable_arenas() const; + void _internal_set_cc_enable_arenas(bool value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(FileOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.FileOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void 
InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr java_package_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr java_outer_classname_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr go_package_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr objc_class_prefix_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr csharp_namespace_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr swift_prefix_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr php_class_prefix_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr php_namespace_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr php_metadata_namespace_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr ruby_package_; + bool java_multiple_files_; + bool java_generate_equals_and_hash_; + bool java_string_check_utf8_; + bool cc_generic_services_; + bool java_generic_services_; + bool py_generic_services_; + bool php_generic_services_; + bool deprecated_; + int optimize_for_; + bool cc_enable_arenas_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT MessageOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.MessageOptions) */ { + public: + inline MessageOptions() : MessageOptions(nullptr) {} + virtual ~MessageOptions(); + + MessageOptions(const MessageOptions& from); + MessageOptions(MessageOptions&& from) noexcept + : MessageOptions() { + *this = ::std::move(from); + } + + inline MessageOptions& operator=(const MessageOptions& from) { + CopyFrom(from); + return *this; + } + inline MessageOptions& 
operator=(MessageOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const MessageOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const MessageOptions* internal_default_instance() { + return reinterpret_cast( + &_MessageOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 14; + + friend void swap(MessageOptions& a, MessageOptions& b) { + a.Swap(&b); + } + inline void Swap(MessageOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(MessageOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline MessageOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + MessageOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const 
::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const MessageOptions& from); + void MergeFrom(const MessageOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(MessageOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.MessageOptions"; + } + protected: + explicit MessageOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kMessageSetWireFormatFieldNumber = 1, + kNoStandardDescriptorAccessorFieldNumber = 2, + kDeprecatedFieldNumber = 3, + kMapEntryFieldNumber = 7, + }; + // repeated 
.google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional bool message_set_wire_format = 1 [default = false]; + bool has_message_set_wire_format() const; + private: + bool _internal_has_message_set_wire_format() const; + public: + void clear_message_set_wire_format(); + bool message_set_wire_format() const; + void set_message_set_wire_format(bool value); + private: + bool _internal_message_set_wire_format() const; + void _internal_set_message_set_wire_format(bool value); + public: + + // optional bool no_standard_descriptor_accessor = 2 [default = false]; + bool has_no_standard_descriptor_accessor() const; + private: + bool _internal_has_no_standard_descriptor_accessor() const; + public: + void clear_no_standard_descriptor_accessor(); + bool no_standard_descriptor_accessor() const; + void set_no_standard_descriptor_accessor(bool value); + private: + bool _internal_no_standard_descriptor_accessor() const; + void _internal_set_no_standard_descriptor_accessor(bool value); + public: + + // optional bool deprecated = 3 [default = false]; + bool has_deprecated() const; + private: + bool 
_internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + // optional bool map_entry = 7; + bool has_map_entry() const; + private: + bool _internal_has_map_entry() const; + public: + void clear_map_entry(); + bool map_entry() const; + void set_map_entry(bool value); + private: + bool _internal_map_entry() const; + void _internal_set_map_entry(bool value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(MessageOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.MessageOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + bool message_set_wire_format_; + bool no_standard_descriptor_accessor_; + bool deprecated_; + bool map_entry_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT FieldOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FieldOptions) */ { + public: + inline FieldOptions() : FieldOptions(nullptr) {} + virtual ~FieldOptions(); + + FieldOptions(const FieldOptions& from); + FieldOptions(FieldOptions&& from) noexcept + : FieldOptions() { + *this = ::std::move(from); + } + + inline FieldOptions& operator=(const FieldOptions& from) { + CopyFrom(from); + return *this; + } + inline FieldOptions& operator=(FieldOptions&& from) 
noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const FieldOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const FieldOptions* internal_default_instance() { + return reinterpret_cast( + &_FieldOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 15; + + friend void swap(FieldOptions& a, FieldOptions& b) { + a.Swap(&b); + } + inline void Swap(FieldOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(FieldOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline FieldOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + FieldOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const 
::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const FieldOptions& from); + void MergeFrom(const FieldOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(FieldOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.FieldOptions"; + } + protected: + explicit FieldOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef FieldOptions_CType CType; + static constexpr CType STRING = + FieldOptions_CType_STRING; + static constexpr CType CORD = + FieldOptions_CType_CORD; + static constexpr CType STRING_PIECE = + FieldOptions_CType_STRING_PIECE; + static inline bool CType_IsValid(int value) { + return FieldOptions_CType_IsValid(value); + } + static constexpr CType CType_MIN = + FieldOptions_CType_CType_MIN; + 
static constexpr CType CType_MAX = + FieldOptions_CType_CType_MAX; + static constexpr int CType_ARRAYSIZE = + FieldOptions_CType_CType_ARRAYSIZE; + static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* + CType_descriptor() { + return FieldOptions_CType_descriptor(); + } + template + static inline const std::string& CType_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function CType_Name."); + return FieldOptions_CType_Name(enum_t_value); + } + static inline bool CType_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name, + CType* value) { + return FieldOptions_CType_Parse(name, value); + } + + typedef FieldOptions_JSType JSType; + static constexpr JSType JS_NORMAL = + FieldOptions_JSType_JS_NORMAL; + static constexpr JSType JS_STRING = + FieldOptions_JSType_JS_STRING; + static constexpr JSType JS_NUMBER = + FieldOptions_JSType_JS_NUMBER; + static inline bool JSType_IsValid(int value) { + return FieldOptions_JSType_IsValid(value); + } + static constexpr JSType JSType_MIN = + FieldOptions_JSType_JSType_MIN; + static constexpr JSType JSType_MAX = + FieldOptions_JSType_JSType_MAX; + static constexpr int JSType_ARRAYSIZE = + FieldOptions_JSType_JSType_ARRAYSIZE; + static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* + JSType_descriptor() { + return FieldOptions_JSType_descriptor(); + } + template + static inline const std::string& JSType_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function JSType_Name."); + return FieldOptions_JSType_Name(enum_t_value); + } + static inline bool JSType_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name, + JSType* value) { + return FieldOptions_JSType_Parse(name, value); + } + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kCtypeFieldNumber = 1, + kPackedFieldNumber = 2, + kLazyFieldNumber = 
5, + kDeprecatedFieldNumber = 3, + kWeakFieldNumber = 10, + kJstypeFieldNumber = 6, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional .google.protobuf.FieldOptions.CType ctype = 1 [default = STRING]; + bool has_ctype() const; + private: + bool _internal_has_ctype() const; + public: + void clear_ctype(); + PROTOBUF_NAMESPACE_ID::FieldOptions_CType ctype() const; + void set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value); + private: + PROTOBUF_NAMESPACE_ID::FieldOptions_CType _internal_ctype() const; + void _internal_set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value); + public: + + // optional bool packed = 2; + bool has_packed() const; + private: + bool _internal_has_packed() const; + public: + void clear_packed(); + bool packed() const; + void set_packed(bool value); + private: + bool _internal_packed() const; + void _internal_set_packed(bool value); + public: + + // optional bool lazy = 5 [default = false]; + bool has_lazy() const; + private: + bool _internal_has_lazy() const; + public: + void clear_lazy(); + bool lazy() const; + void set_lazy(bool 
value); + private: + bool _internal_lazy() const; + void _internal_set_lazy(bool value); + public: + + // optional bool deprecated = 3 [default = false]; + bool has_deprecated() const; + private: + bool _internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + // optional bool weak = 10 [default = false]; + bool has_weak() const; + private: + bool _internal_has_weak() const; + public: + void clear_weak(); + bool weak() const; + void set_weak(bool value); + private: + bool _internal_weak() const; + void _internal_set_weak(bool value); + public: + + // optional .google.protobuf.FieldOptions.JSType jstype = 6 [default = JS_NORMAL]; + bool has_jstype() const; + private: + bool _internal_has_jstype() const; + public: + void clear_jstype(); + PROTOBUF_NAMESPACE_ID::FieldOptions_JSType jstype() const; + void set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value); + private: + PROTOBUF_NAMESPACE_ID::FieldOptions_JSType _internal_jstype() const; + void _internal_set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(FieldOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.FieldOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + int ctype_; + bool packed_; + bool lazy_; + bool deprecated_; + bool weak_; + int jstype_; + friend struct 
::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT OneofOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.OneofOptions) */ { + public: + inline OneofOptions() : OneofOptions(nullptr) {} + virtual ~OneofOptions(); + + OneofOptions(const OneofOptions& from); + OneofOptions(OneofOptions&& from) noexcept + : OneofOptions() { + *this = ::std::move(from); + } + + inline OneofOptions& operator=(const OneofOptions& from) { + CopyFrom(from); + return *this; + } + inline OneofOptions& operator=(OneofOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const OneofOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const OneofOptions* internal_default_instance() { + return reinterpret_cast( + &_OneofOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 16; + + friend void swap(OneofOptions& a, OneofOptions& b) { + a.Swap(&b); + } + inline void Swap(OneofOptions* other) { 
+ if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(OneofOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline OneofOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + OneofOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const OneofOptions& from); + void MergeFrom(const OneofOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(OneofOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.OneofOptions"; + } + protected: + explicit OneofOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + 
::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(OneofOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.OneofOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct 
::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT EnumOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumOptions) */ { + public: + inline EnumOptions() : EnumOptions(nullptr) {} + virtual ~EnumOptions(); + + EnumOptions(const EnumOptions& from); + EnumOptions(EnumOptions&& from) noexcept + : EnumOptions() { + *this = ::std::move(from); + } + + inline EnumOptions& operator=(const EnumOptions& from) { + CopyFrom(from); + return *this; + } + inline EnumOptions& operator=(EnumOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const EnumOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const EnumOptions* internal_default_instance() { + return reinterpret_cast( + &_EnumOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 17; + + friend void swap(EnumOptions& a, EnumOptions& b) { + a.Swap(&b); + } + inline void Swap(EnumOptions* other) { + if (other == this) 
return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(EnumOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline EnumOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + EnumOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const EnumOptions& from); + void MergeFrom(const EnumOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(EnumOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.EnumOptions"; + } + protected: + explicit EnumOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + 
::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kAllowAliasFieldNumber = 2, + kDeprecatedFieldNumber = 3, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional bool allow_alias = 2; + bool has_allow_alias() const; + private: + bool _internal_has_allow_alias() const; + public: + void clear_allow_alias(); + bool allow_alias() const; + void set_allow_alias(bool value); + private: + bool _internal_allow_alias() const; + void _internal_set_allow_alias(bool value); + public: + + // optional bool deprecated = 3 [default = false]; + bool has_deprecated() const; + private: + bool _internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void 
set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(EnumOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.EnumOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + bool allow_alias_; + bool deprecated_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT EnumValueOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumValueOptions) */ { + public: + inline EnumValueOptions() : EnumValueOptions(nullptr) {} + virtual ~EnumValueOptions(); + + EnumValueOptions(const EnumValueOptions& from); + EnumValueOptions(EnumValueOptions&& from) noexcept + : EnumValueOptions() { + *this = ::std::move(from); + } + + inline EnumValueOptions& operator=(const EnumValueOptions& from) { + CopyFrom(from); + return *this; + } + inline EnumValueOptions& operator=(EnumValueOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* 
mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const EnumValueOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const EnumValueOptions* internal_default_instance() { + return reinterpret_cast( + &_EnumValueOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 18; + + friend void swap(EnumValueOptions& a, EnumValueOptions& b) { + a.Swap(&b); + } + inline void Swap(EnumValueOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(EnumValueOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline EnumValueOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + EnumValueOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const EnumValueOptions& from); + void MergeFrom(const EnumValueOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + 
::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(EnumValueOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.EnumValueOptions"; + } + protected: + explicit EnumValueOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kDeprecatedFieldNumber = 1, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + 
PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional bool deprecated = 1 [default = false]; + bool has_deprecated() const; + private: + bool _internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(EnumValueOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.EnumValueOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + bool deprecated_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT ServiceOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ServiceOptions) */ { + public: + inline ServiceOptions() : ServiceOptions(nullptr) {} + virtual ~ServiceOptions(); + + ServiceOptions(const ServiceOptions& from); + ServiceOptions(ServiceOptions&& from) noexcept + : ServiceOptions() { + *this = ::std::move(from); + } + + inline ServiceOptions& operator=(const 
ServiceOptions& from) { + CopyFrom(from); + return *this; + } + inline ServiceOptions& operator=(ServiceOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const ServiceOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const ServiceOptions* internal_default_instance() { + return reinterpret_cast( + &_ServiceOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 19; + + friend void swap(ServiceOptions& a, ServiceOptions& b) { + a.Swap(&b); + } + inline void Swap(ServiceOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(ServiceOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline ServiceOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + ServiceOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const 
final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const ServiceOptions& from); + void MergeFrom(const ServiceOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(ServiceOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.ServiceOptions"; + } + protected: + explicit ServiceOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kDeprecatedFieldNumber = 33, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int 
uninterpreted_option_size() const; + private: + int _internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional bool deprecated = 33 [default = false]; + bool has_deprecated() const; + private: + bool _internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(ServiceOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.ServiceOptions) + private: + class _Internal; + + ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + bool deprecated_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class 
PROTOBUF_EXPORT MethodOptions PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.MethodOptions) */ { + public: + inline MethodOptions() : MethodOptions(nullptr) {} + virtual ~MethodOptions(); + + MethodOptions(const MethodOptions& from); + MethodOptions(MethodOptions&& from) noexcept + : MethodOptions() { + *this = ::std::move(from); + } + + inline MethodOptions& operator=(const MethodOptions& from) { + CopyFrom(from); + return *this; + } + inline MethodOptions& operator=(MethodOptions&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const MethodOptions& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const MethodOptions* internal_default_instance() { + return reinterpret_cast( + &_MethodOptions_default_instance_); + } + static constexpr int kIndexInFileMessages = + 20; + + friend void swap(MethodOptions& a, MethodOptions& b) { + a.Swap(&b); + } + inline void Swap(MethodOptions* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + 
::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(MethodOptions* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline MethodOptions* New() const final { + return CreateMaybeMessage(nullptr); + } + + MethodOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const MethodOptions& from); + void MergeFrom(const MethodOptions& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(MethodOptions* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.MethodOptions"; + } + protected: + explicit MethodOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + 
return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef MethodOptions_IdempotencyLevel IdempotencyLevel; + static constexpr IdempotencyLevel IDEMPOTENCY_UNKNOWN = + MethodOptions_IdempotencyLevel_IDEMPOTENCY_UNKNOWN; + static constexpr IdempotencyLevel NO_SIDE_EFFECTS = + MethodOptions_IdempotencyLevel_NO_SIDE_EFFECTS; + static constexpr IdempotencyLevel IDEMPOTENT = + MethodOptions_IdempotencyLevel_IDEMPOTENT; + static inline bool IdempotencyLevel_IsValid(int value) { + return MethodOptions_IdempotencyLevel_IsValid(value); + } + static constexpr IdempotencyLevel IdempotencyLevel_MIN = + MethodOptions_IdempotencyLevel_IdempotencyLevel_MIN; + static constexpr IdempotencyLevel IdempotencyLevel_MAX = + MethodOptions_IdempotencyLevel_IdempotencyLevel_MAX; + static constexpr int IdempotencyLevel_ARRAYSIZE = + MethodOptions_IdempotencyLevel_IdempotencyLevel_ARRAYSIZE; + static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* + IdempotencyLevel_descriptor() { + return MethodOptions_IdempotencyLevel_descriptor(); + } + template + static inline const std::string& IdempotencyLevel_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function IdempotencyLevel_Name."); + return MethodOptions_IdempotencyLevel_Name(enum_t_value); + } + static inline bool IdempotencyLevel_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name, + IdempotencyLevel* value) { + return MethodOptions_IdempotencyLevel_Parse(name, value); + } + + // accessors ------------------------------------------------------- + + enum : int { + kUninterpretedOptionFieldNumber = 999, + kDeprecatedFieldNumber = 33, + kIdempotencyLevelFieldNumber = 34, + }; + // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; + int uninterpreted_option_size() const; + private: + int 
_internal_uninterpreted_option_size() const; + public: + void clear_uninterpreted_option(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* + mutable_uninterpreted_option(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& + uninterpreted_option() const; + + // optional bool deprecated = 33 [default = false]; + bool has_deprecated() const; + private: + bool _internal_has_deprecated() const; + public: + void clear_deprecated(); + bool deprecated() const; + void set_deprecated(bool value); + private: + bool _internal_deprecated() const; + void _internal_set_deprecated(bool value); + public: + + // optional .google.protobuf.MethodOptions.IdempotencyLevel idempotency_level = 34 [default = IDEMPOTENCY_UNKNOWN]; + bool has_idempotency_level() const; + private: + bool _internal_has_idempotency_level() const; + public: + void clear_idempotency_level(); + PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel idempotency_level() const; + void set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value); + private: + PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel _internal_idempotency_level() const; + void _internal_set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value); + public: + + GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(MethodOptions) + // @@protoc_insertion_point(class_scope:google.protobuf.MethodOptions) + private: + class _Internal; + + 
::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_; + bool deprecated_; + int idempotency_level_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT UninterpretedOption_NamePart PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.UninterpretedOption.NamePart) */ { + public: + inline UninterpretedOption_NamePart() : UninterpretedOption_NamePart(nullptr) {} + virtual ~UninterpretedOption_NamePart(); + + UninterpretedOption_NamePart(const UninterpretedOption_NamePart& from); + UninterpretedOption_NamePart(UninterpretedOption_NamePart&& from) noexcept + : UninterpretedOption_NamePart() { + *this = ::std::move(from); + } + + inline UninterpretedOption_NamePart& operator=(const UninterpretedOption_NamePart& from) { + CopyFrom(from); + return *this; + } + inline UninterpretedOption_NamePart& operator=(UninterpretedOption_NamePart&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } 
+ + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const UninterpretedOption_NamePart& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const UninterpretedOption_NamePart* internal_default_instance() { + return reinterpret_cast( + &_UninterpretedOption_NamePart_default_instance_); + } + static constexpr int kIndexInFileMessages = + 21; + + friend void swap(UninterpretedOption_NamePart& a, UninterpretedOption_NamePart& b) { + a.Swap(&b); + } + inline void Swap(UninterpretedOption_NamePart* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(UninterpretedOption_NamePart* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline UninterpretedOption_NamePart* New() const final { + return CreateMaybeMessage(nullptr); + } + + UninterpretedOption_NamePart* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const UninterpretedOption_NamePart& from); + void MergeFrom(const UninterpretedOption_NamePart& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + 
::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(UninterpretedOption_NamePart* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.UninterpretedOption.NamePart"; + } + protected: + explicit UninterpretedOption_NamePart(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kNamePartFieldNumber = 1, + kIsExtensionFieldNumber = 2, + }; + // required string name_part = 1; + bool has_name_part() const; + private: + bool _internal_has_name_part() const; + public: + void clear_name_part(); + const std::string& name_part() const; + void set_name_part(const std::string& value); + void set_name_part(std::string&& value); + void set_name_part(const char* value); + void set_name_part(const char* value, size_t size); + std::string* mutable_name_part(); + std::string* release_name_part(); + void set_allocated_name_part(std::string* name_part); + private: + const std::string& _internal_name_part() const; + 
void _internal_set_name_part(const std::string& value); + std::string* _internal_mutable_name_part(); + public: + + // required bool is_extension = 2; + bool has_is_extension() const; + private: + bool _internal_has_is_extension() const; + public: + void clear_is_extension(); + bool is_extension() const; + void set_is_extension(bool value); + private: + bool _internal_is_extension() const; + void _internal_set_is_extension(bool value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.UninterpretedOption.NamePart) + private: + class _Internal; + + // helper for ByteSizeLong() + size_t RequiredFieldsByteSizeFallback() const; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_part_; + bool is_extension_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT UninterpretedOption PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.UninterpretedOption) */ { + public: + inline UninterpretedOption() : UninterpretedOption(nullptr) {} + virtual ~UninterpretedOption(); + + UninterpretedOption(const UninterpretedOption& from); + UninterpretedOption(UninterpretedOption&& from) noexcept + : UninterpretedOption() { + *this = ::std::move(from); + } + + inline UninterpretedOption& operator=(const UninterpretedOption& from) { + CopyFrom(from); + return *this; + } + inline UninterpretedOption& operator=(UninterpretedOption&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const 
::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const UninterpretedOption& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const UninterpretedOption* internal_default_instance() { + return reinterpret_cast( + &_UninterpretedOption_default_instance_); + } + static constexpr int kIndexInFileMessages = + 22; + + friend void swap(UninterpretedOption& a, UninterpretedOption& b) { + a.Swap(&b); + } + inline void Swap(UninterpretedOption* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(UninterpretedOption* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline UninterpretedOption* New() const final { + return CreateMaybeMessage(nullptr); + } + + UninterpretedOption* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const UninterpretedOption& from); + 
void MergeFrom(const UninterpretedOption& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(UninterpretedOption* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.UninterpretedOption"; + } + protected: + explicit UninterpretedOption(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef UninterpretedOption_NamePart NamePart; + + // accessors ------------------------------------------------------- + + enum : int { + kNameFieldNumber = 2, + kIdentifierValueFieldNumber = 3, + kStringValueFieldNumber = 7, + kAggregateValueFieldNumber = 8, + kPositiveIntValueFieldNumber = 4, + kNegativeIntValueFieldNumber = 5, + kDoubleValueFieldNumber = 6, + }; + // repeated .google.protobuf.UninterpretedOption.NamePart name = 2; + int name_size() 
const; + private: + int _internal_name_size() const; + public: + void clear_name(); + PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* mutable_name(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >* + mutable_name(); + private: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& _internal_name(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* _internal_add_name(); + public: + const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& name(int index) const; + PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* add_name(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >& + name() const; + + // optional string identifier_value = 3; + bool has_identifier_value() const; + private: + bool _internal_has_identifier_value() const; + public: + void clear_identifier_value(); + const std::string& identifier_value() const; + void set_identifier_value(const std::string& value); + void set_identifier_value(std::string&& value); + void set_identifier_value(const char* value); + void set_identifier_value(const char* value, size_t size); + std::string* mutable_identifier_value(); + std::string* release_identifier_value(); + void set_allocated_identifier_value(std::string* identifier_value); + private: + const std::string& _internal_identifier_value() const; + void _internal_set_identifier_value(const std::string& value); + std::string* _internal_mutable_identifier_value(); + public: + + // optional bytes string_value = 7; + bool has_string_value() const; + private: + bool _internal_has_string_value() const; + public: + void clear_string_value(); + const std::string& string_value() const; + void set_string_value(const std::string& value); + void set_string_value(std::string&& value); + void set_string_value(const char* value); + void set_string_value(const void* value, size_t size); + std::string* mutable_string_value(); + 
std::string* release_string_value(); + void set_allocated_string_value(std::string* string_value); + private: + const std::string& _internal_string_value() const; + void _internal_set_string_value(const std::string& value); + std::string* _internal_mutable_string_value(); + public: + + // optional string aggregate_value = 8; + bool has_aggregate_value() const; + private: + bool _internal_has_aggregate_value() const; + public: + void clear_aggregate_value(); + const std::string& aggregate_value() const; + void set_aggregate_value(const std::string& value); + void set_aggregate_value(std::string&& value); + void set_aggregate_value(const char* value); + void set_aggregate_value(const char* value, size_t size); + std::string* mutable_aggregate_value(); + std::string* release_aggregate_value(); + void set_allocated_aggregate_value(std::string* aggregate_value); + private: + const std::string& _internal_aggregate_value() const; + void _internal_set_aggregate_value(const std::string& value); + std::string* _internal_mutable_aggregate_value(); + public: + + // optional uint64 positive_int_value = 4; + bool has_positive_int_value() const; + private: + bool _internal_has_positive_int_value() const; + public: + void clear_positive_int_value(); + ::PROTOBUF_NAMESPACE_ID::uint64 positive_int_value() const; + void set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value); + private: + ::PROTOBUF_NAMESPACE_ID::uint64 _internal_positive_int_value() const; + void _internal_set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value); + public: + + // optional int64 negative_int_value = 5; + bool has_negative_int_value() const; + private: + bool _internal_has_negative_int_value() const; + public: + void clear_negative_int_value(); + ::PROTOBUF_NAMESPACE_ID::int64 negative_int_value() const; + void set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value); + private: + ::PROTOBUF_NAMESPACE_ID::int64 _internal_negative_int_value() const; + void 
_internal_set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value); + public: + + // optional double double_value = 6; + bool has_double_value() const; + private: + bool _internal_has_double_value() const; + public: + void clear_double_value(); + double double_value() const; + void set_double_value(double value); + private: + double _internal_double_value() const; + void _internal_set_double_value(double value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.UninterpretedOption) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart > name_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr identifier_value_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr string_value_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr aggregate_value_; + ::PROTOBUF_NAMESPACE_ID::uint64 positive_int_value_; + ::PROTOBUF_NAMESPACE_ID::int64 negative_int_value_; + double double_value_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT SourceCodeInfo_Location PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.SourceCodeInfo.Location) */ { + public: + inline SourceCodeInfo_Location() : SourceCodeInfo_Location(nullptr) {} + virtual ~SourceCodeInfo_Location(); + + SourceCodeInfo_Location(const SourceCodeInfo_Location& from); + SourceCodeInfo_Location(SourceCodeInfo_Location&& from) noexcept + : SourceCodeInfo_Location() { + *this = ::std::move(from); + } + + inline SourceCodeInfo_Location& operator=(const 
SourceCodeInfo_Location& from) { + CopyFrom(from); + return *this; + } + inline SourceCodeInfo_Location& operator=(SourceCodeInfo_Location&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const SourceCodeInfo_Location& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const SourceCodeInfo_Location* internal_default_instance() { + return reinterpret_cast( + &_SourceCodeInfo_Location_default_instance_); + } + static constexpr int kIndexInFileMessages = + 23; + + friend void swap(SourceCodeInfo_Location& a, SourceCodeInfo_Location& b) { + a.Swap(&b); + } + inline void Swap(SourceCodeInfo_Location* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(SourceCodeInfo_Location* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline SourceCodeInfo_Location* New() const final { + return 
CreateMaybeMessage(nullptr); + } + + SourceCodeInfo_Location* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const SourceCodeInfo_Location& from); + void MergeFrom(const SourceCodeInfo_Location& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(SourceCodeInfo_Location* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.SourceCodeInfo.Location"; + } + protected: + explicit SourceCodeInfo_Location(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + 
kPathFieldNumber = 1, + kSpanFieldNumber = 2, + kLeadingDetachedCommentsFieldNumber = 6, + kLeadingCommentsFieldNumber = 3, + kTrailingCommentsFieldNumber = 4, + }; + // repeated int32 path = 1 [packed = true]; + int path_size() const; + private: + int _internal_path_size() const; + public: + void clear_path(); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_path(int index) const; + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + _internal_path() const; + void _internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value); + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + _internal_mutable_path(); + public: + ::PROTOBUF_NAMESPACE_ID::int32 path(int index) const; + void set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value); + void add_path(::PROTOBUF_NAMESPACE_ID::int32 value); + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + path() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + mutable_path(); + + // repeated int32 span = 2 [packed = true]; + int span_size() const; + private: + int _internal_span_size() const; + public: + void clear_span(); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_span(int index) const; + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + _internal_span() const; + void _internal_add_span(::PROTOBUF_NAMESPACE_ID::int32 value); + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + _internal_mutable_span(); + public: + ::PROTOBUF_NAMESPACE_ID::int32 span(int index) const; + void set_span(int index, ::PROTOBUF_NAMESPACE_ID::int32 value); + void add_span(::PROTOBUF_NAMESPACE_ID::int32 value); + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + span() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + mutable_span(); + + // repeated string leading_detached_comments = 6; + int 
leading_detached_comments_size() const; + private: + int _internal_leading_detached_comments_size() const; + public: + void clear_leading_detached_comments(); + const std::string& leading_detached_comments(int index) const; + std::string* mutable_leading_detached_comments(int index); + void set_leading_detached_comments(int index, const std::string& value); + void set_leading_detached_comments(int index, std::string&& value); + void set_leading_detached_comments(int index, const char* value); + void set_leading_detached_comments(int index, const char* value, size_t size); + std::string* add_leading_detached_comments(); + void add_leading_detached_comments(const std::string& value); + void add_leading_detached_comments(std::string&& value); + void add_leading_detached_comments(const char* value); + void add_leading_detached_comments(const char* value, size_t size); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& leading_detached_comments() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* mutable_leading_detached_comments(); + private: + const std::string& _internal_leading_detached_comments(int index) const; + std::string* _internal_add_leading_detached_comments(); + public: + + // optional string leading_comments = 3; + bool has_leading_comments() const; + private: + bool _internal_has_leading_comments() const; + public: + void clear_leading_comments(); + const std::string& leading_comments() const; + void set_leading_comments(const std::string& value); + void set_leading_comments(std::string&& value); + void set_leading_comments(const char* value); + void set_leading_comments(const char* value, size_t size); + std::string* mutable_leading_comments(); + std::string* release_leading_comments(); + void set_allocated_leading_comments(std::string* leading_comments); + private: + const std::string& _internal_leading_comments() const; + void _internal_set_leading_comments(const std::string& value); + std::string* _internal_mutable_leading_comments(); + public: + + 
// optional string trailing_comments = 4; + bool has_trailing_comments() const; + private: + bool _internal_has_trailing_comments() const; + public: + void clear_trailing_comments(); + const std::string& trailing_comments() const; + void set_trailing_comments(const std::string& value); + void set_trailing_comments(std::string&& value); + void set_trailing_comments(const char* value); + void set_trailing_comments(const char* value, size_t size); + std::string* mutable_trailing_comments(); + std::string* release_trailing_comments(); + void set_allocated_trailing_comments(std::string* trailing_comments); + private: + const std::string& _internal_trailing_comments() const; + void _internal_set_trailing_comments(const std::string& value); + std::string* _internal_mutable_trailing_comments(); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.SourceCodeInfo.Location) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > path_; + mutable std::atomic _path_cached_byte_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > span_; + mutable std::atomic _span_cached_byte_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField leading_detached_comments_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr leading_comments_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr trailing_comments_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT SourceCodeInfo PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* 
@@protoc_insertion_point(class_definition:google.protobuf.SourceCodeInfo) */ { + public: + inline SourceCodeInfo() : SourceCodeInfo(nullptr) {} + virtual ~SourceCodeInfo(); + + SourceCodeInfo(const SourceCodeInfo& from); + SourceCodeInfo(SourceCodeInfo&& from) noexcept + : SourceCodeInfo() { + *this = ::std::move(from); + } + + inline SourceCodeInfo& operator=(const SourceCodeInfo& from) { + CopyFrom(from); + return *this; + } + inline SourceCodeInfo& operator=(SourceCodeInfo&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const SourceCodeInfo& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const SourceCodeInfo* internal_default_instance() { + return reinterpret_cast( + &_SourceCodeInfo_default_instance_); + } + static constexpr int kIndexInFileMessages = + 24; + + friend void swap(SourceCodeInfo& a, SourceCodeInfo& b) { + a.Swap(&b); + } + inline void Swap(SourceCodeInfo* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void 
UnsafeArenaSwap(SourceCodeInfo* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline SourceCodeInfo* New() const final { + return CreateMaybeMessage(nullptr); + } + + SourceCodeInfo* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const SourceCodeInfo& from); + void MergeFrom(const SourceCodeInfo& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(SourceCodeInfo* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.SourceCodeInfo"; + } + protected: + explicit SourceCodeInfo(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return 
::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef SourceCodeInfo_Location Location; + + // accessors ------------------------------------------------------- + + enum : int { + kLocationFieldNumber = 1, + }; + // repeated .google.protobuf.SourceCodeInfo.Location location = 1; + int location_size() const; + private: + int _internal_location_size() const; + public: + void clear_location(); + PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* mutable_location(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >* + mutable_location(); + private: + const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& _internal_location(int index) const; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* _internal_add_location(); + public: + const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& location(int index) const; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* add_location(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >& + location() const; + + // @@protoc_insertion_point(class_scope:google.protobuf.SourceCodeInfo) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location > location_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT GeneratedCodeInfo_Annotation PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.GeneratedCodeInfo.Annotation) */ { + public: 
+ inline GeneratedCodeInfo_Annotation() : GeneratedCodeInfo_Annotation(nullptr) {} + virtual ~GeneratedCodeInfo_Annotation(); + + GeneratedCodeInfo_Annotation(const GeneratedCodeInfo_Annotation& from); + GeneratedCodeInfo_Annotation(GeneratedCodeInfo_Annotation&& from) noexcept + : GeneratedCodeInfo_Annotation() { + *this = ::std::move(from); + } + + inline GeneratedCodeInfo_Annotation& operator=(const GeneratedCodeInfo_Annotation& from) { + CopyFrom(from); + return *this; + } + inline GeneratedCodeInfo_Annotation& operator=(GeneratedCodeInfo_Annotation&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const GeneratedCodeInfo_Annotation& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const GeneratedCodeInfo_Annotation* internal_default_instance() { + return reinterpret_cast( + &_GeneratedCodeInfo_Annotation_default_instance_); + } + static constexpr int kIndexInFileMessages = + 25; + + friend void swap(GeneratedCodeInfo_Annotation& a, GeneratedCodeInfo_Annotation& b) { + a.Swap(&b); + } + inline void Swap(GeneratedCodeInfo_Annotation* other) { + if (other == this) return; 
+ if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(GeneratedCodeInfo_Annotation* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline GeneratedCodeInfo_Annotation* New() const final { + return CreateMaybeMessage(nullptr); + } + + GeneratedCodeInfo_Annotation* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const GeneratedCodeInfo_Annotation& from); + void MergeFrom(const GeneratedCodeInfo_Annotation& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(GeneratedCodeInfo_Annotation* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.GeneratedCodeInfo.Annotation"; + } + protected: + explicit GeneratedCodeInfo_Annotation(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; 
+ private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kPathFieldNumber = 1, + kSourceFileFieldNumber = 2, + kBeginFieldNumber = 3, + kEndFieldNumber = 4, + }; + // repeated int32 path = 1 [packed = true]; + int path_size() const; + private: + int _internal_path_size() const; + public: + void clear_path(); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_path(int index) const; + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + _internal_path() const; + void _internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value); + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + _internal_mutable_path(); + public: + ::PROTOBUF_NAMESPACE_ID::int32 path(int index) const; + void set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value); + void add_path(::PROTOBUF_NAMESPACE_ID::int32 value); + const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& + path() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* + mutable_path(); + + // optional string source_file = 2; + bool has_source_file() const; + private: + bool _internal_has_source_file() const; + public: + void clear_source_file(); + const std::string& source_file() const; + void set_source_file(const std::string& value); + void set_source_file(std::string&& value); + void set_source_file(const char* value); + void set_source_file(const char* value, size_t size); + std::string* mutable_source_file(); + std::string* release_source_file(); + void set_allocated_source_file(std::string* 
source_file); + private: + const std::string& _internal_source_file() const; + void _internal_set_source_file(const std::string& value); + std::string* _internal_mutable_source_file(); + public: + + // optional int32 begin = 3; + bool has_begin() const; + private: + bool _internal_has_begin() const; + public: + void clear_begin(); + ::PROTOBUF_NAMESPACE_ID::int32 begin() const; + void set_begin(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_begin() const; + void _internal_set_begin(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // optional int32 end = 4; + bool has_end() const; + private: + bool _internal_has_end() const; + public: + void clear_end(); + ::PROTOBUF_NAMESPACE_ID::int32 end() const; + void set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const; + void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.GeneratedCodeInfo.Annotation) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > path_; + mutable std::atomic _path_cached_byte_size_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr source_file_; + ::PROTOBUF_NAMESPACE_ID::int32 begin_; + ::PROTOBUF_NAMESPACE_ID::int32 end_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT GeneratedCodeInfo PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.GeneratedCodeInfo) */ { + public: + inline 
GeneratedCodeInfo() : GeneratedCodeInfo(nullptr) {} + virtual ~GeneratedCodeInfo(); + + GeneratedCodeInfo(const GeneratedCodeInfo& from); + GeneratedCodeInfo(GeneratedCodeInfo&& from) noexcept + : GeneratedCodeInfo() { + *this = ::std::move(from); + } + + inline GeneratedCodeInfo& operator=(const GeneratedCodeInfo& from) { + CopyFrom(from); + return *this; + } + inline GeneratedCodeInfo& operator=(GeneratedCodeInfo&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance); + } + inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const GeneratedCodeInfo& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const GeneratedCodeInfo* internal_default_instance() { + return reinterpret_cast( + &_GeneratedCodeInfo_default_instance_); + } + static constexpr int kIndexInFileMessages = + 26; + + friend void swap(GeneratedCodeInfo& a, GeneratedCodeInfo& b) { + a.Swap(&b); + } + inline void Swap(GeneratedCodeInfo* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(GeneratedCodeInfo* other) { + if 
(other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline GeneratedCodeInfo* New() const final { + return CreateMaybeMessage(nullptr); + } + + GeneratedCodeInfo* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const GeneratedCodeInfo& from); + void MergeFrom(const GeneratedCodeInfo& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(GeneratedCodeInfo* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.GeneratedCodeInfo"; + } + protected: + explicit GeneratedCodeInfo(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto); + return 
::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + typedef GeneratedCodeInfo_Annotation Annotation; + + // accessors ------------------------------------------------------- + + enum : int { + kAnnotationFieldNumber = 1, + }; + // repeated .google.protobuf.GeneratedCodeInfo.Annotation annotation = 1; + int annotation_size() const; + private: + int _internal_annotation_size() const; + public: + void clear_annotation(); + PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* mutable_annotation(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >* + mutable_annotation(); + private: + const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& _internal_annotation(int index) const; + PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* _internal_add_annotation(); + public: + const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& annotation(int index) const; + PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* add_annotation(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >& + annotation() const; + + // @@protoc_insertion_point(class_scope:google.protobuf.GeneratedCodeInfo) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation > annotation_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma 
GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// FileDescriptorSet + +// repeated .google.protobuf.FileDescriptorProto file = 1; +inline int FileDescriptorSet::_internal_file_size() const { + return file_.size(); +} +inline int FileDescriptorSet::file_size() const { + return _internal_file_size(); +} +inline void FileDescriptorSet::clear_file() { + file_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::FileDescriptorProto* FileDescriptorSet::mutable_file(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorSet.file) + return file_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >* +FileDescriptorSet::mutable_file() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorSet.file) + return &file_; +} +inline const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& FileDescriptorSet::_internal_file(int index) const { + return file_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& FileDescriptorSet::file(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorSet.file) + return _internal_file(index); +} +inline PROTOBUF_NAMESPACE_ID::FileDescriptorProto* FileDescriptorSet::_internal_add_file() { + return file_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::FileDescriptorProto* FileDescriptorSet::add_file() { + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorSet.file) + return _internal_add_file(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >& +FileDescriptorSet::file() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorSet.file) + return file_; +} + +// ------------------------------------------------------------------- + +// FileDescriptorProto + +// optional string name = 1; +inline bool FileDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 
0; + return value; +} +inline bool FileDescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void FileDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& FileDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.name) + return _internal_name(); +} +inline void FileDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.name) +} +inline std::string* FileDescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& FileDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void FileDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileDescriptorProto.name) +} +inline void FileDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.name) +} +inline void FileDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + 
reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.name) +} +inline std::string* FileDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.name) +} + +// optional string package = 2; +inline bool FileDescriptorProto::_internal_has_package() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool FileDescriptorProto::has_package() const { + return _internal_has_package(); +} +inline void FileDescriptorProto::clear_package() { + package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000002u; +} +inline const std::string& FileDescriptorProto::package() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.package) + return _internal_package(); +} +inline void FileDescriptorProto::set_package(const std::string& value) { + _internal_set_package(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.package) +} +inline std::string* FileDescriptorProto::mutable_package() { + // 
@@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.package) + return _internal_mutable_package(); +} +inline const std::string& FileDescriptorProto::_internal_package() const { + return package_.Get(); +} +inline void FileDescriptorProto::_internal_set_package(const std::string& value) { + _has_bits_[0] |= 0x00000002u; + package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileDescriptorProto::set_package(std::string&& value) { + _has_bits_[0] |= 0x00000002u; + package_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileDescriptorProto.package) +} +inline void FileDescriptorProto::set_package(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000002u; + package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.package) +} +inline void FileDescriptorProto::set_package(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000002u; + package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.package) +} +inline std::string* FileDescriptorProto::_internal_mutable_package() { + _has_bits_[0] |= 0x00000002u; + return package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileDescriptorProto::release_package() { + // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.package) + if (!_internal_has_package()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000002u; + return 
package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileDescriptorProto::set_allocated_package(std::string* package) { + if (package != nullptr) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), package, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.package) +} + +// repeated string dependency = 3; +inline int FileDescriptorProto::_internal_dependency_size() const { + return dependency_.size(); +} +inline int FileDescriptorProto::dependency_size() const { + return _internal_dependency_size(); +} +inline void FileDescriptorProto::clear_dependency() { + dependency_.Clear(); +} +inline std::string* FileDescriptorProto::add_dependency() { + // @@protoc_insertion_point(field_add_mutable:google.protobuf.FileDescriptorProto.dependency) + return _internal_add_dependency(); +} +inline const std::string& FileDescriptorProto::_internal_dependency(int index) const { + return dependency_.Get(index); +} +inline const std::string& FileDescriptorProto::dependency(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.dependency) + return _internal_dependency(index); +} +inline std::string* FileDescriptorProto::mutable_dependency(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.dependency) + return dependency_.Mutable(index); +} +inline void FileDescriptorProto::set_dependency(int index, const std::string& value) { + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.dependency) + dependency_.Mutable(index)->assign(value); +} +inline void FileDescriptorProto::set_dependency(int index, std::string&& value) { + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.dependency) + 
dependency_.Mutable(index)->assign(std::move(value)); +} +inline void FileDescriptorProto::set_dependency(int index, const char* value) { + GOOGLE_DCHECK(value != nullptr); + dependency_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.dependency) +} +inline void FileDescriptorProto::set_dependency(int index, const char* value, size_t size) { + dependency_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.dependency) +} +inline std::string* FileDescriptorProto::_internal_add_dependency() { + return dependency_.Add(); +} +inline void FileDescriptorProto::add_dependency(const std::string& value) { + dependency_.Add()->assign(value); + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.dependency) +} +inline void FileDescriptorProto::add_dependency(std::string&& value) { + dependency_.Add(std::move(value)); + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.dependency) +} +inline void FileDescriptorProto::add_dependency(const char* value) { + GOOGLE_DCHECK(value != nullptr); + dependency_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:google.protobuf.FileDescriptorProto.dependency) +} +inline void FileDescriptorProto::add_dependency(const char* value, size_t size) { + dependency_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:google.protobuf.FileDescriptorProto.dependency) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& +FileDescriptorProto::dependency() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.dependency) + return dependency_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* +FileDescriptorProto::mutable_dependency() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.dependency) + return &dependency_; +} + 
+// repeated int32 public_dependency = 10; +inline int FileDescriptorProto::_internal_public_dependency_size() const { + return public_dependency_.size(); +} +inline int FileDescriptorProto::public_dependency_size() const { + return _internal_public_dependency_size(); +} +inline void FileDescriptorProto::clear_public_dependency() { + public_dependency_.Clear(); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::_internal_public_dependency(int index) const { + return public_dependency_.Get(index); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::public_dependency(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.public_dependency) + return _internal_public_dependency(index); +} +inline void FileDescriptorProto::set_public_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) { + public_dependency_.Set(index, value); + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.public_dependency) +} +inline void FileDescriptorProto::_internal_add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) { + public_dependency_.Add(value); +} +inline void FileDescriptorProto::add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_add_public_dependency(value); + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.public_dependency) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +FileDescriptorProto::_internal_public_dependency() const { + return public_dependency_; +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +FileDescriptorProto::public_dependency() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.public_dependency) + return _internal_public_dependency(); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +FileDescriptorProto::_internal_mutable_public_dependency() { + return 
&public_dependency_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +FileDescriptorProto::mutable_public_dependency() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.public_dependency) + return _internal_mutable_public_dependency(); +} + +// repeated int32 weak_dependency = 11; +inline int FileDescriptorProto::_internal_weak_dependency_size() const { + return weak_dependency_.size(); +} +inline int FileDescriptorProto::weak_dependency_size() const { + return _internal_weak_dependency_size(); +} +inline void FileDescriptorProto::clear_weak_dependency() { + weak_dependency_.Clear(); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::_internal_weak_dependency(int index) const { + return weak_dependency_.Get(index); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::weak_dependency(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.weak_dependency) + return _internal_weak_dependency(index); +} +inline void FileDescriptorProto::set_weak_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) { + weak_dependency_.Set(index, value); + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.weak_dependency) +} +inline void FileDescriptorProto::_internal_add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) { + weak_dependency_.Add(value); +} +inline void FileDescriptorProto::add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_add_weak_dependency(value); + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.weak_dependency) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +FileDescriptorProto::_internal_weak_dependency() const { + return weak_dependency_; +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +FileDescriptorProto::weak_dependency() const { + // 
@@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.weak_dependency) + return _internal_weak_dependency(); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +FileDescriptorProto::_internal_mutable_weak_dependency() { + return &weak_dependency_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +FileDescriptorProto::mutable_weak_dependency() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.weak_dependency) + return _internal_mutable_weak_dependency(); +} + +// repeated .google.protobuf.DescriptorProto message_type = 4; +inline int FileDescriptorProto::_internal_message_type_size() const { + return message_type_.size(); +} +inline int FileDescriptorProto::message_type_size() const { + return _internal_message_type_size(); +} +inline void FileDescriptorProto::clear_message_type() { + message_type_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto* FileDescriptorProto::mutable_message_type(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.message_type) + return message_type_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >* +FileDescriptorProto::mutable_message_type() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.message_type) + return &message_type_; +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& FileDescriptorProto::_internal_message_type(int index) const { + return message_type_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& FileDescriptorProto::message_type(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.message_type) + return _internal_message_type(index); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto* FileDescriptorProto::_internal_add_message_type() { + return message_type_.Add(); +} +inline 
PROTOBUF_NAMESPACE_ID::DescriptorProto* FileDescriptorProto::add_message_type() { + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.message_type) + return _internal_add_message_type(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >& +FileDescriptorProto::message_type() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.message_type) + return message_type_; +} + +// repeated .google.protobuf.EnumDescriptorProto enum_type = 5; +inline int FileDescriptorProto::_internal_enum_type_size() const { + return enum_type_.size(); +} +inline int FileDescriptorProto::enum_type_size() const { + return _internal_enum_type_size(); +} +inline void FileDescriptorProto::clear_enum_type() { + enum_type_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* FileDescriptorProto::mutable_enum_type(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.enum_type) + return enum_type_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >* +FileDescriptorProto::mutable_enum_type() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.enum_type) + return &enum_type_; +} +inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& FileDescriptorProto::_internal_enum_type(int index) const { + return enum_type_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& FileDescriptorProto::enum_type(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.enum_type) + return _internal_enum_type(index); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* FileDescriptorProto::_internal_add_enum_type() { + return enum_type_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* FileDescriptorProto::add_enum_type() { + // 
@@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.enum_type) + return _internal_add_enum_type(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >& +FileDescriptorProto::enum_type() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.enum_type) + return enum_type_; +} + +// repeated .google.protobuf.ServiceDescriptorProto service = 6; +inline int FileDescriptorProto::_internal_service_size() const { + return service_.size(); +} +inline int FileDescriptorProto::service_size() const { + return _internal_service_size(); +} +inline void FileDescriptorProto::clear_service() { + service_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* FileDescriptorProto::mutable_service(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.service) + return service_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >* +FileDescriptorProto::mutable_service() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.service) + return &service_; +} +inline const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& FileDescriptorProto::_internal_service(int index) const { + return service_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& FileDescriptorProto::service(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.service) + return _internal_service(index); +} +inline PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* FileDescriptorProto::_internal_add_service() { + return service_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* FileDescriptorProto::add_service() { + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.service) + return _internal_add_service(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< 
PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >& +FileDescriptorProto::service() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.service) + return service_; +} + +// repeated .google.protobuf.FieldDescriptorProto extension = 7; +inline int FileDescriptorProto::_internal_extension_size() const { + return extension_.size(); +} +inline int FileDescriptorProto::extension_size() const { + return _internal_extension_size(); +} +inline void FileDescriptorProto::clear_extension() { + extension_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* FileDescriptorProto::mutable_extension(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.extension) + return extension_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >* +FileDescriptorProto::mutable_extension() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.extension) + return &extension_; +} +inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& FileDescriptorProto::_internal_extension(int index) const { + return extension_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& FileDescriptorProto::extension(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.extension) + return _internal_extension(index); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* FileDescriptorProto::_internal_add_extension() { + return extension_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* FileDescriptorProto::add_extension() { + // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.extension) + return _internal_add_extension(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >& +FileDescriptorProto::extension() const { + // 
@@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.extension) + return extension_; +} + +// optional .google.protobuf.FileOptions options = 8; +inline bool FileDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000008u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool FileDescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void FileDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000008u; +} +inline const PROTOBUF_NAMESPACE_ID::FileOptions& FileDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::FileOptions* p = options_; + return p != nullptr ? *p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_FileOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::FileOptions& FileDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.options) + return _internal_options(); +} +inline void FileDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::FileOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000008u; + } else { + _has_bits_[0] &= ~0x00000008u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.FileDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000008u; + PROTOBUF_NAMESPACE_ID::FileOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::unsafe_arena_release_options() { + // 
@@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.options) + _has_bits_[0] &= ~0x00000008u; + PROTOBUF_NAMESPACE_ID::FileOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000008u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.options) + return _internal_mutable_options(); +} +inline void FileDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::FileOptions* options) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000008u; + } else { + _has_bits_[0] &= ~0x00000008u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.options) +} + +// optional .google.protobuf.SourceCodeInfo source_code_info = 9; +inline bool FileDescriptorProto::_internal_has_source_code_info() const { + bool value = (_has_bits_[0] & 0x00000010u) != 0; + PROTOBUF_ASSUME(!value || source_code_info_ != nullptr); + return value; +} +inline bool FileDescriptorProto::has_source_code_info() const { + return _internal_has_source_code_info(); +} +inline void FileDescriptorProto::clear_source_code_info() { + if (source_code_info_ != nullptr) source_code_info_->Clear(); + _has_bits_[0] &= ~0x00000010u; +} +inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& 
FileDescriptorProto::_internal_source_code_info() const { + const PROTOBUF_NAMESPACE_ID::SourceCodeInfo* p = source_code_info_; + return p != nullptr ? *p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_SourceCodeInfo_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& FileDescriptorProto::source_code_info() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.source_code_info) + return _internal_source_code_info(); +} +inline void FileDescriptorProto::unsafe_arena_set_allocated_source_code_info( + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_code_info_); + } + source_code_info_ = source_code_info; + if (source_code_info) { + _has_bits_[0] |= 0x00000010u; + } else { + _has_bits_[0] &= ~0x00000010u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.FileDescriptorProto.source_code_info) +} +inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::release_source_code_info() { + _has_bits_[0] &= ~0x00000010u; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* temp = source_code_info_; + source_code_info_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::unsafe_arena_release_source_code_info() { + // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.source_code_info) + _has_bits_[0] &= ~0x00000010u; + PROTOBUF_NAMESPACE_ID::SourceCodeInfo* temp = source_code_info_; + source_code_info_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::_internal_mutable_source_code_info() { + _has_bits_[0] |= 0x00000010u; + if (source_code_info_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + source_code_info_ = p; + } + return source_code_info_; +} 
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::mutable_source_code_info() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.source_code_info) + return _internal_mutable_source_code_info(); +} +inline void FileDescriptorProto::set_allocated_source_code_info(PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete source_code_info_; + } + if (source_code_info) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(source_code_info); + if (message_arena != submessage_arena) { + source_code_info = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, source_code_info, submessage_arena); + } + _has_bits_[0] |= 0x00000010u; + } else { + _has_bits_[0] &= ~0x00000010u; + } + source_code_info_ = source_code_info; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.source_code_info) +} + +// optional string syntax = 12; +inline bool FileDescriptorProto::_internal_has_syntax() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool FileDescriptorProto::has_syntax() const { + return _internal_has_syntax(); +} +inline void FileDescriptorProto::clear_syntax() { + syntax_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000004u; +} +inline const std::string& FileDescriptorProto::syntax() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.syntax) + return _internal_syntax(); +} +inline void FileDescriptorProto::set_syntax(const std::string& value) { + _internal_set_syntax(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.syntax) +} +inline std::string* FileDescriptorProto::mutable_syntax() { + // 
@@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.syntax) + return _internal_mutable_syntax(); +} +inline const std::string& FileDescriptorProto::_internal_syntax() const { + return syntax_.Get(); +} +inline void FileDescriptorProto::_internal_set_syntax(const std::string& value) { + _has_bits_[0] |= 0x00000004u; + syntax_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileDescriptorProto::set_syntax(std::string&& value) { + _has_bits_[0] |= 0x00000004u; + syntax_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileDescriptorProto.syntax) +} +inline void FileDescriptorProto::set_syntax(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000004u; + syntax_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.syntax) +} +inline void FileDescriptorProto::set_syntax(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000004u; + syntax_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.syntax) +} +inline std::string* FileDescriptorProto::_internal_mutable_syntax() { + _has_bits_[0] |= 0x00000004u; + return syntax_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileDescriptorProto::release_syntax() { + // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.syntax) + if (!_internal_has_syntax()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000004u; + return syntax_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), 
GetArena()); +} +inline void FileDescriptorProto::set_allocated_syntax(std::string* syntax) { + if (syntax != nullptr) { + _has_bits_[0] |= 0x00000004u; + } else { + _has_bits_[0] &= ~0x00000004u; + } + syntax_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), syntax, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.syntax) +} + +// ------------------------------------------------------------------- + +// DescriptorProto_ExtensionRange + +// optional int32 start = 1; +inline bool DescriptorProto_ExtensionRange::_internal_has_start() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool DescriptorProto_ExtensionRange::has_start() const { + return _internal_has_start(); +} +inline void DescriptorProto_ExtensionRange::clear_start() { + start_ = 0; + _has_bits_[0] &= ~0x00000002u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::_internal_start() const { + return start_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::start() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ExtensionRange.start) + return _internal_start(); +} +inline void DescriptorProto_ExtensionRange::_internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000002u; + start_ = value; +} +inline void DescriptorProto_ExtensionRange::set_start(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_start(value); + // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ExtensionRange.start) +} + +// optional int32 end = 2; +inline bool DescriptorProto_ExtensionRange::_internal_has_end() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool DescriptorProto_ExtensionRange::has_end() const { + return _internal_has_end(); +} +inline void DescriptorProto_ExtensionRange::clear_end() { + end_ = 0; + _has_bits_[0] &= ~0x00000004u; +} 
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::_internal_end() const { + return end_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::end() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ExtensionRange.end) + return _internal_end(); +} +inline void DescriptorProto_ExtensionRange::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000004u; + end_ = value; +} +inline void DescriptorProto_ExtensionRange::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_end(value); + // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ExtensionRange.end) +} + +// optional .google.protobuf.ExtensionRangeOptions options = 3; +inline bool DescriptorProto_ExtensionRange::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool DescriptorProto_ExtensionRange::has_options() const { + return _internal_has_options(); +} +inline void DescriptorProto_ExtensionRange::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000001u; +} +inline const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& DescriptorProto_ExtensionRange::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* p = options_; + return p != nullptr ? 
*p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_ExtensionRangeOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& DescriptorProto_ExtensionRange::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ExtensionRange.options) + return _internal_options(); +} +inline void DescriptorProto_ExtensionRange::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.DescriptorProto.ExtensionRange.options) +} +inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::release_options() { + _has_bits_[0] &= ~0x00000001u; + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.DescriptorProto.ExtensionRange.options) + _has_bits_[0] &= ~0x00000001u; + PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::_internal_mutable_options() { + _has_bits_[0] |= 0x00000001u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::mutable_options() { + // 
@@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.ExtensionRange.options) + return _internal_mutable_options(); +} +inline void DescriptorProto_ExtensionRange::set_allocated_options(PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.DescriptorProto.ExtensionRange.options) +} + +// ------------------------------------------------------------------- + +// DescriptorProto_ReservedRange + +// optional int32 start = 1; +inline bool DescriptorProto_ReservedRange::_internal_has_start() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool DescriptorProto_ReservedRange::has_start() const { + return _internal_has_start(); +} +inline void DescriptorProto_ReservedRange::clear_start() { + start_ = 0; + _has_bits_[0] &= ~0x00000001u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::_internal_start() const { + return start_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::start() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ReservedRange.start) + return _internal_start(); +} +inline void DescriptorProto_ReservedRange::_internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000001u; + start_ = value; +} +inline void DescriptorProto_ReservedRange::set_start(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_start(value); + // 
@@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ReservedRange.start) +} + +// optional int32 end = 2; +inline bool DescriptorProto_ReservedRange::_internal_has_end() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool DescriptorProto_ReservedRange::has_end() const { + return _internal_has_end(); +} +inline void DescriptorProto_ReservedRange::clear_end() { + end_ = 0; + _has_bits_[0] &= ~0x00000002u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::_internal_end() const { + return end_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::end() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ReservedRange.end) + return _internal_end(); +} +inline void DescriptorProto_ReservedRange::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000002u; + end_ = value; +} +inline void DescriptorProto_ReservedRange::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_end(value); + // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ReservedRange.end) +} + +// ------------------------------------------------------------------- + +// DescriptorProto + +// optional string name = 1; +inline bool DescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool DescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void DescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& DescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.name) + return _internal_name(); +} +inline void DescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // 
@@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.name) +} +inline std::string* DescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& DescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void DescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void DescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.DescriptorProto.name) +} +inline void DescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.DescriptorProto.name) +} +inline void DescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.DescriptorProto.name) +} +inline std::string* DescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* DescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.DescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return 
name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void DescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.DescriptorProto.name) +} + +// repeated .google.protobuf.FieldDescriptorProto field = 2; +inline int DescriptorProto::_internal_field_size() const { + return field_.size(); +} +inline int DescriptorProto::field_size() const { + return _internal_field_size(); +} +inline void DescriptorProto::clear_field() { + field_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::mutable_field(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.field) + return field_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >* +DescriptorProto::mutable_field() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.field) + return &field_; +} +inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::_internal_field(int index) const { + return field_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::field(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.field) + return _internal_field(index); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::_internal_add_field() { + return field_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::add_field() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.field) + return _internal_add_field(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< 
PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >& +DescriptorProto::field() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.field) + return field_; +} + +// repeated .google.protobuf.FieldDescriptorProto extension = 6; +inline int DescriptorProto::_internal_extension_size() const { + return extension_.size(); +} +inline int DescriptorProto::extension_size() const { + return _internal_extension_size(); +} +inline void DescriptorProto::clear_extension() { + extension_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::mutable_extension(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.extension) + return extension_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >* +DescriptorProto::mutable_extension() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.extension) + return &extension_; +} +inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::_internal_extension(int index) const { + return extension_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::extension(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.extension) + return _internal_extension(index); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::_internal_add_extension() { + return extension_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::add_extension() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.extension) + return _internal_add_extension(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >& +DescriptorProto::extension() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.extension) + return extension_; +} + +// repeated 
.google.protobuf.DescriptorProto nested_type = 3; +inline int DescriptorProto::_internal_nested_type_size() const { + return nested_type_.size(); +} +inline int DescriptorProto::nested_type_size() const { + return _internal_nested_type_size(); +} +inline void DescriptorProto::clear_nested_type() { + nested_type_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto* DescriptorProto::mutable_nested_type(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.nested_type) + return nested_type_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >* +DescriptorProto::mutable_nested_type() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.nested_type) + return &nested_type_; +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& DescriptorProto::_internal_nested_type(int index) const { + return nested_type_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& DescriptorProto::nested_type(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.nested_type) + return _internal_nested_type(index); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto* DescriptorProto::_internal_add_nested_type() { + return nested_type_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto* DescriptorProto::add_nested_type() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.nested_type) + return _internal_add_nested_type(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >& +DescriptorProto::nested_type() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.nested_type) + return nested_type_; +} + +// repeated .google.protobuf.EnumDescriptorProto enum_type = 4; +inline int DescriptorProto::_internal_enum_type_size() const { + return enum_type_.size(); +} +inline int DescriptorProto::enum_type_size() const { + 
return _internal_enum_type_size(); +} +inline void DescriptorProto::clear_enum_type() { + enum_type_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* DescriptorProto::mutable_enum_type(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.enum_type) + return enum_type_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >* +DescriptorProto::mutable_enum_type() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.enum_type) + return &enum_type_; +} +inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& DescriptorProto::_internal_enum_type(int index) const { + return enum_type_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& DescriptorProto::enum_type(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.enum_type) + return _internal_enum_type(index); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* DescriptorProto::_internal_add_enum_type() { + return enum_type_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* DescriptorProto::add_enum_type() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.enum_type) + return _internal_add_enum_type(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >& +DescriptorProto::enum_type() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.enum_type) + return enum_type_; +} + +// repeated .google.protobuf.DescriptorProto.ExtensionRange extension_range = 5; +inline int DescriptorProto::_internal_extension_range_size() const { + return extension_range_.size(); +} +inline int DescriptorProto::extension_range_size() const { + return _internal_extension_range_size(); +} +inline void DescriptorProto::clear_extension_range() { + extension_range_.Clear(); +} +inline 
PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* DescriptorProto::mutable_extension_range(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.extension_range) + return extension_range_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >* +DescriptorProto::mutable_extension_range() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.extension_range) + return &extension_range_; +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& DescriptorProto::_internal_extension_range(int index) const { + return extension_range_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& DescriptorProto::extension_range(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.extension_range) + return _internal_extension_range(index); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* DescriptorProto::_internal_add_extension_range() { + return extension_range_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* DescriptorProto::add_extension_range() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.extension_range) + return _internal_add_extension_range(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >& +DescriptorProto::extension_range() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.extension_range) + return extension_range_; +} + +// repeated .google.protobuf.OneofDescriptorProto oneof_decl = 8; +inline int DescriptorProto::_internal_oneof_decl_size() const { + return oneof_decl_.size(); +} +inline int DescriptorProto::oneof_decl_size() const { + return _internal_oneof_decl_size(); +} +inline void DescriptorProto::clear_oneof_decl() { + oneof_decl_.Clear(); +} +inline 
PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* DescriptorProto::mutable_oneof_decl(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.oneof_decl) + return oneof_decl_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >* +DescriptorProto::mutable_oneof_decl() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.oneof_decl) + return &oneof_decl_; +} +inline const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& DescriptorProto::_internal_oneof_decl(int index) const { + return oneof_decl_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& DescriptorProto::oneof_decl(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.oneof_decl) + return _internal_oneof_decl(index); +} +inline PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* DescriptorProto::_internal_add_oneof_decl() { + return oneof_decl_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* DescriptorProto::add_oneof_decl() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.oneof_decl) + return _internal_add_oneof_decl(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >& +DescriptorProto::oneof_decl() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.oneof_decl) + return oneof_decl_; +} + +// optional .google.protobuf.MessageOptions options = 7; +inline bool DescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool DescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void DescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000002u; +} +inline const PROTOBUF_NAMESPACE_ID::MessageOptions& 
DescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::MessageOptions* p = options_; + return p != nullptr ? *p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_MessageOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::MessageOptions& DescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.options) + return _internal_options(); +} +inline void DescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::MessageOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.DescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::MessageOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.DescriptorProto.options) + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::MessageOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000002u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.options) + return _internal_mutable_options(); +} +inline void 
DescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::MessageOptions* options) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.DescriptorProto.options) +} + +// repeated .google.protobuf.DescriptorProto.ReservedRange reserved_range = 9; +inline int DescriptorProto::_internal_reserved_range_size() const { + return reserved_range_.size(); +} +inline int DescriptorProto::reserved_range_size() const { + return _internal_reserved_range_size(); +} +inline void DescriptorProto::clear_reserved_range() { + reserved_range_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* DescriptorProto::mutable_reserved_range(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.reserved_range) + return reserved_range_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >* +DescriptorProto::mutable_reserved_range() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.reserved_range) + return &reserved_range_; +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& DescriptorProto::_internal_reserved_range(int index) const { + return reserved_range_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& DescriptorProto::reserved_range(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.reserved_range) + return 
_internal_reserved_range(index); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* DescriptorProto::_internal_add_reserved_range() { + return reserved_range_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* DescriptorProto::add_reserved_range() { + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.reserved_range) + return _internal_add_reserved_range(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >& +DescriptorProto::reserved_range() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.reserved_range) + return reserved_range_; +} + +// repeated string reserved_name = 10; +inline int DescriptorProto::_internal_reserved_name_size() const { + return reserved_name_.size(); +} +inline int DescriptorProto::reserved_name_size() const { + return _internal_reserved_name_size(); +} +inline void DescriptorProto::clear_reserved_name() { + reserved_name_.Clear(); +} +inline std::string* DescriptorProto::add_reserved_name() { + // @@protoc_insertion_point(field_add_mutable:google.protobuf.DescriptorProto.reserved_name) + return _internal_add_reserved_name(); +} +inline const std::string& DescriptorProto::_internal_reserved_name(int index) const { + return reserved_name_.Get(index); +} +inline const std::string& DescriptorProto::reserved_name(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.reserved_name) + return _internal_reserved_name(index); +} +inline std::string* DescriptorProto::mutable_reserved_name(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.reserved_name) + return reserved_name_.Mutable(index); +} +inline void DescriptorProto::set_reserved_name(int index, const std::string& value) { + // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.reserved_name) + reserved_name_.Mutable(index)->assign(value); +} +inline void 
DescriptorProto::set_reserved_name(int index, std::string&& value) { + // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.reserved_name) + reserved_name_.Mutable(index)->assign(std::move(value)); +} +inline void DescriptorProto::set_reserved_name(int index, const char* value) { + GOOGLE_DCHECK(value != nullptr); + reserved_name_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:google.protobuf.DescriptorProto.reserved_name) +} +inline void DescriptorProto::set_reserved_name(int index, const char* value, size_t size) { + reserved_name_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.DescriptorProto.reserved_name) +} +inline std::string* DescriptorProto::_internal_add_reserved_name() { + return reserved_name_.Add(); +} +inline void DescriptorProto::add_reserved_name(const std::string& value) { + reserved_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.reserved_name) +} +inline void DescriptorProto::add_reserved_name(std::string&& value) { + reserved_name_.Add(std::move(value)); + // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.reserved_name) +} +inline void DescriptorProto::add_reserved_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + reserved_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:google.protobuf.DescriptorProto.reserved_name) +} +inline void DescriptorProto::add_reserved_name(const char* value, size_t size) { + reserved_name_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:google.protobuf.DescriptorProto.reserved_name) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& +DescriptorProto::reserved_name() const { + // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.reserved_name) + return reserved_name_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* 
+DescriptorProto::mutable_reserved_name() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.reserved_name) + return &reserved_name_; +} + +// ------------------------------------------------------------------- + +// ExtensionRangeOptions + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int ExtensionRangeOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int ExtensionRangeOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void ExtensionRangeOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ExtensionRangeOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.ExtensionRangeOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +ExtensionRangeOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.ExtensionRangeOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ExtensionRangeOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ExtensionRangeOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.ExtensionRangeOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ExtensionRangeOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ExtensionRangeOptions::add_uninterpreted_option() { + // 
@@protoc_insertion_point(field_add:google.protobuf.ExtensionRangeOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +ExtensionRangeOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.ExtensionRangeOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// FieldDescriptorProto + +// optional string name = 1; +inline bool FieldDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void FieldDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& FieldDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.name) + return _internal_name(); +} +inline void FieldDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.name) +} +inline std::string* FieldDescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& FieldDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void FieldDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FieldDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + 
&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.name) +} +inline void FieldDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.name) +} +inline void FieldDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.name) +} +inline std::string* FieldDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FieldDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FieldDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.name) +} + +// optional int32 number = 3; +inline bool FieldDescriptorProto::_internal_has_number() const { + bool value = (_has_bits_[0] & 0x00000040u) != 0; + return value; +} +inline 
bool FieldDescriptorProto::has_number() const { + return _internal_has_number(); +} +inline void FieldDescriptorProto::clear_number() { + number_ = 0; + _has_bits_[0] &= ~0x00000040u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::_internal_number() const { + return number_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::number() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.number) + return _internal_number(); +} +inline void FieldDescriptorProto::_internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000040u; + number_ = value; +} +inline void FieldDescriptorProto::set_number(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_number(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.number) +} + +// optional .google.protobuf.FieldDescriptorProto.Label label = 4; +inline bool FieldDescriptorProto::_internal_has_label() const { + bool value = (_has_bits_[0] & 0x00000200u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_label() const { + return _internal_has_label(); +} +inline void FieldDescriptorProto::clear_label() { + label_ = 1; + _has_bits_[0] &= ~0x00000200u; +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label FieldDescriptorProto::_internal_label() const { + return static_cast< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label >(label_); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label FieldDescriptorProto::label() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.label) + return _internal_label(); +} +inline void FieldDescriptorProto::_internal_set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value) { + assert(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label_IsValid(value)); + _has_bits_[0] |= 0x00000200u; + label_ = value; +} +inline void FieldDescriptorProto::set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value) { + 
_internal_set_label(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.label) +} + +// optional .google.protobuf.FieldDescriptorProto.Type type = 5; +inline bool FieldDescriptorProto::_internal_has_type() const { + bool value = (_has_bits_[0] & 0x00000400u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_type() const { + return _internal_has_type(); +} +inline void FieldDescriptorProto::clear_type() { + type_ = 1; + _has_bits_[0] &= ~0x00000400u; +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type FieldDescriptorProto::_internal_type() const { + return static_cast< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type >(type_); +} +inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type FieldDescriptorProto::type() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.type) + return _internal_type(); +} +inline void FieldDescriptorProto::_internal_set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value) { + assert(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type_IsValid(value)); + _has_bits_[0] |= 0x00000400u; + type_ = value; +} +inline void FieldDescriptorProto::set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value) { + _internal_set_type(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.type) +} + +// optional string type_name = 6; +inline bool FieldDescriptorProto::_internal_has_type_name() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_type_name() const { + return _internal_has_type_name(); +} +inline void FieldDescriptorProto::clear_type_name() { + type_name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000004u; +} +inline const std::string& FieldDescriptorProto::type_name() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.type_name) + return 
_internal_type_name(); +} +inline void FieldDescriptorProto::set_type_name(const std::string& value) { + _internal_set_type_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.type_name) +} +inline std::string* FieldDescriptorProto::mutable_type_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.type_name) + return _internal_mutable_type_name(); +} +inline const std::string& FieldDescriptorProto::_internal_type_name() const { + return type_name_.Get(); +} +inline void FieldDescriptorProto::_internal_set_type_name(const std::string& value) { + _has_bits_[0] |= 0x00000004u; + type_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FieldDescriptorProto::set_type_name(std::string&& value) { + _has_bits_[0] |= 0x00000004u; + type_name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.type_name) +} +inline void FieldDescriptorProto::set_type_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000004u; + type_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.type_name) +} +inline void FieldDescriptorProto::set_type_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000004u; + type_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.type_name) +} +inline std::string* FieldDescriptorProto::_internal_mutable_type_name() { + _has_bits_[0] |= 0x00000004u; + return 
type_name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FieldDescriptorProto::release_type_name() { + // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.type_name) + if (!_internal_has_type_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000004u; + return type_name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FieldDescriptorProto::set_allocated_type_name(std::string* type_name) { + if (type_name != nullptr) { + _has_bits_[0] |= 0x00000004u; + } else { + _has_bits_[0] &= ~0x00000004u; + } + type_name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), type_name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.type_name) +} + +// optional string extendee = 2; +inline bool FieldDescriptorProto::_internal_has_extendee() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_extendee() const { + return _internal_has_extendee(); +} +inline void FieldDescriptorProto::clear_extendee() { + extendee_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000002u; +} +inline const std::string& FieldDescriptorProto::extendee() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.extendee) + return _internal_extendee(); +} +inline void FieldDescriptorProto::set_extendee(const std::string& value) { + _internal_set_extendee(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.extendee) +} +inline std::string* FieldDescriptorProto::mutable_extendee() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.extendee) + return _internal_mutable_extendee(); +} +inline const std::string& 
FieldDescriptorProto::_internal_extendee() const { + return extendee_.Get(); +} +inline void FieldDescriptorProto::_internal_set_extendee(const std::string& value) { + _has_bits_[0] |= 0x00000002u; + extendee_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FieldDescriptorProto::set_extendee(std::string&& value) { + _has_bits_[0] |= 0x00000002u; + extendee_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.extendee) +} +inline void FieldDescriptorProto::set_extendee(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000002u; + extendee_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.extendee) +} +inline void FieldDescriptorProto::set_extendee(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000002u; + extendee_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.extendee) +} +inline std::string* FieldDescriptorProto::_internal_mutable_extendee() { + _has_bits_[0] |= 0x00000002u; + return extendee_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FieldDescriptorProto::release_extendee() { + // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.extendee) + if (!_internal_has_extendee()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000002u; + return extendee_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FieldDescriptorProto::set_allocated_extendee(std::string* extendee) { + 
if (extendee != nullptr) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + extendee_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), extendee, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.extendee) +} + +// optional string default_value = 7; +inline bool FieldDescriptorProto::_internal_has_default_value() const { + bool value = (_has_bits_[0] & 0x00000008u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_default_value() const { + return _internal_has_default_value(); +} +inline void FieldDescriptorProto::clear_default_value() { + default_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000008u; +} +inline const std::string& FieldDescriptorProto::default_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.default_value) + return _internal_default_value(); +} +inline void FieldDescriptorProto::set_default_value(const std::string& value) { + _internal_set_default_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.default_value) +} +inline std::string* FieldDescriptorProto::mutable_default_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.default_value) + return _internal_mutable_default_value(); +} +inline const std::string& FieldDescriptorProto::_internal_default_value() const { + return default_value_.Get(); +} +inline void FieldDescriptorProto::_internal_set_default_value(const std::string& value) { + _has_bits_[0] |= 0x00000008u; + default_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FieldDescriptorProto::set_default_value(std::string&& value) { + _has_bits_[0] |= 0x00000008u; + default_value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), 
::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.default_value) +} +inline void FieldDescriptorProto::set_default_value(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000008u; + default_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.default_value) +} +inline void FieldDescriptorProto::set_default_value(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000008u; + default_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.default_value) +} +inline std::string* FieldDescriptorProto::_internal_mutable_default_value() { + _has_bits_[0] |= 0x00000008u; + return default_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FieldDescriptorProto::release_default_value() { + // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.default_value) + if (!_internal_has_default_value()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000008u; + return default_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FieldDescriptorProto::set_allocated_default_value(std::string* default_value) { + if (default_value != nullptr) { + _has_bits_[0] |= 0x00000008u; + } else { + _has_bits_[0] &= ~0x00000008u; + } + default_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), default_value, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.default_value) +} + +// optional int32 oneof_index = 9; +inline bool 
FieldDescriptorProto::_internal_has_oneof_index() const { + bool value = (_has_bits_[0] & 0x00000080u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_oneof_index() const { + return _internal_has_oneof_index(); +} +inline void FieldDescriptorProto::clear_oneof_index() { + oneof_index_ = 0; + _has_bits_[0] &= ~0x00000080u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::_internal_oneof_index() const { + return oneof_index_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::oneof_index() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.oneof_index) + return _internal_oneof_index(); +} +inline void FieldDescriptorProto::_internal_set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000080u; + oneof_index_ = value; +} +inline void FieldDescriptorProto::set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_oneof_index(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.oneof_index) +} + +// optional string json_name = 10; +inline bool FieldDescriptorProto::_internal_has_json_name() const { + bool value = (_has_bits_[0] & 0x00000010u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_json_name() const { + return _internal_has_json_name(); +} +inline void FieldDescriptorProto::clear_json_name() { + json_name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000010u; +} +inline const std::string& FieldDescriptorProto::json_name() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.json_name) + return _internal_json_name(); +} +inline void FieldDescriptorProto::set_json_name(const std::string& value) { + _internal_set_json_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.json_name) +} +inline std::string* FieldDescriptorProto::mutable_json_name() { + // 
@@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.json_name) + return _internal_mutable_json_name(); +} +inline const std::string& FieldDescriptorProto::_internal_json_name() const { + return json_name_.Get(); +} +inline void FieldDescriptorProto::_internal_set_json_name(const std::string& value) { + _has_bits_[0] |= 0x00000010u; + json_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FieldDescriptorProto::set_json_name(std::string&& value) { + _has_bits_[0] |= 0x00000010u; + json_name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.json_name) +} +inline void FieldDescriptorProto::set_json_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000010u; + json_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.json_name) +} +inline void FieldDescriptorProto::set_json_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000010u; + json_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.json_name) +} +inline std::string* FieldDescriptorProto::_internal_mutable_json_name() { + _has_bits_[0] |= 0x00000010u; + return json_name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FieldDescriptorProto::release_json_name() { + // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.json_name) + if (!_internal_has_json_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000010u; + return 
json_name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FieldDescriptorProto::set_allocated_json_name(std::string* json_name) { + if (json_name != nullptr) { + _has_bits_[0] |= 0x00000010u; + } else { + _has_bits_[0] &= ~0x00000010u; + } + json_name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), json_name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.json_name) +} + +// optional .google.protobuf.FieldOptions options = 8; +inline bool FieldDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000020u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool FieldDescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void FieldDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000020u; +} +inline const PROTOBUF_NAMESPACE_ID::FieldOptions& FieldDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::FieldOptions* p = options_; + return p != nullptr ? 
*p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_FieldOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::FieldOptions& FieldDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.options) + return _internal_options(); +} +inline void FieldDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::FieldOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000020u; + } else { + _has_bits_[0] &= ~0x00000020u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.FieldDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000020u; + PROTOBUF_NAMESPACE_ID::FieldOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.options) + _has_bits_[0] &= ~0x00000020u; + PROTOBUF_NAMESPACE_ID::FieldOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000020u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.options) + return _internal_mutable_options(); +} +inline void FieldDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::FieldOptions* options) { + 
::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000020u; + } else { + _has_bits_[0] &= ~0x00000020u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.options) +} + +// optional bool proto3_optional = 17; +inline bool FieldDescriptorProto::_internal_has_proto3_optional() const { + bool value = (_has_bits_[0] & 0x00000100u) != 0; + return value; +} +inline bool FieldDescriptorProto::has_proto3_optional() const { + return _internal_has_proto3_optional(); +} +inline void FieldDescriptorProto::clear_proto3_optional() { + proto3_optional_ = false; + _has_bits_[0] &= ~0x00000100u; +} +inline bool FieldDescriptorProto::_internal_proto3_optional() const { + return proto3_optional_; +} +inline bool FieldDescriptorProto::proto3_optional() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.proto3_optional) + return _internal_proto3_optional(); +} +inline void FieldDescriptorProto::_internal_set_proto3_optional(bool value) { + _has_bits_[0] |= 0x00000100u; + proto3_optional_ = value; +} +inline void FieldDescriptorProto::set_proto3_optional(bool value) { + _internal_set_proto3_optional(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.proto3_optional) +} + +// ------------------------------------------------------------------- + +// OneofDescriptorProto + +// optional string name = 1; +inline bool OneofDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool OneofDescriptorProto::has_name() const { + return 
_internal_has_name(); +} +inline void OneofDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& OneofDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.OneofDescriptorProto.name) + return _internal_name(); +} +inline void OneofDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.OneofDescriptorProto.name) +} +inline std::string* OneofDescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.OneofDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& OneofDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void OneofDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void OneofDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.OneofDescriptorProto.name) +} +inline void OneofDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.OneofDescriptorProto.name) +} +inline void OneofDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // 
@@protoc_insertion_point(field_set_pointer:google.protobuf.OneofDescriptorProto.name) +} +inline std::string* OneofDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* OneofDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.OneofDescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void OneofDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.OneofDescriptorProto.name) +} + +// optional .google.protobuf.OneofOptions options = 2; +inline bool OneofDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool OneofDescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void OneofDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000002u; +} +inline const PROTOBUF_NAMESPACE_ID::OneofOptions& OneofDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::OneofOptions* p = options_; + return p != nullptr ? 
*p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_OneofOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::OneofOptions& OneofDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.OneofDescriptorProto.options) + return _internal_options(); +} +inline void OneofDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::OneofOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.OneofDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::OneofOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.OneofDescriptorProto.options) + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::OneofOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000002u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.OneofDescriptorProto.options) + return _internal_mutable_options(); +} +inline void OneofDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::OneofOptions* options) { + 
::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.OneofDescriptorProto.options) +} + +// ------------------------------------------------------------------- + +// EnumDescriptorProto_EnumReservedRange + +// optional int32 start = 1; +inline bool EnumDescriptorProto_EnumReservedRange::_internal_has_start() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool EnumDescriptorProto_EnumReservedRange::has_start() const { + return _internal_has_start(); +} +inline void EnumDescriptorProto_EnumReservedRange::clear_start() { + start_ = 0; + _has_bits_[0] &= ~0x00000001u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::_internal_start() const { + return start_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::start() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.EnumReservedRange.start) + return _internal_start(); +} +inline void EnumDescriptorProto_EnumReservedRange::_internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000001u; + start_ = value; +} +inline void EnumDescriptorProto_EnumReservedRange::set_start(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_start(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.EnumReservedRange.start) +} + +// optional int32 end = 2; +inline bool EnumDescriptorProto_EnumReservedRange::_internal_has_end() const { + bool value = 
(_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool EnumDescriptorProto_EnumReservedRange::has_end() const { + return _internal_has_end(); +} +inline void EnumDescriptorProto_EnumReservedRange::clear_end() { + end_ = 0; + _has_bits_[0] &= ~0x00000002u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::_internal_end() const { + return end_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::end() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.EnumReservedRange.end) + return _internal_end(); +} +inline void EnumDescriptorProto_EnumReservedRange::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000002u; + end_ = value; +} +inline void EnumDescriptorProto_EnumReservedRange::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_end(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.EnumReservedRange.end) +} + +// ------------------------------------------------------------------- + +// EnumDescriptorProto + +// optional string name = 1; +inline bool EnumDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool EnumDescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void EnumDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& EnumDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.name) + return _internal_name(); +} +inline void EnumDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.name) +} +inline std::string* EnumDescriptorProto::mutable_name() { + // 
@@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& EnumDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void EnumDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void EnumDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.EnumDescriptorProto.name) +} +inline void EnumDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.EnumDescriptorProto.name) +} +inline void EnumDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.EnumDescriptorProto.name) +} +inline std::string* EnumDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* EnumDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.EnumDescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void 
EnumDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumDescriptorProto.name) +} + +// repeated .google.protobuf.EnumValueDescriptorProto value = 2; +inline int EnumDescriptorProto::_internal_value_size() const { + return value_.size(); +} +inline int EnumDescriptorProto::value_size() const { + return _internal_value_size(); +} +inline void EnumDescriptorProto::clear_value() { + value_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* EnumDescriptorProto::mutable_value(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.value) + return value_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >* +EnumDescriptorProto::mutable_value() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumDescriptorProto.value) + return &value_; +} +inline const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& EnumDescriptorProto::_internal_value(int index) const { + return value_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& EnumDescriptorProto::value(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.value) + return _internal_value(index); +} +inline PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* EnumDescriptorProto::_internal_add_value() { + return value_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* EnumDescriptorProto::add_value() { + // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.value) + return _internal_add_value(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >& 
+EnumDescriptorProto::value() const { + // @@protoc_insertion_point(field_list:google.protobuf.EnumDescriptorProto.value) + return value_; +} + +// optional .google.protobuf.EnumOptions options = 3; +inline bool EnumDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool EnumDescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void EnumDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000002u; +} +inline const PROTOBUF_NAMESPACE_ID::EnumOptions& EnumDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::EnumOptions* p = options_; + return p != nullptr ? *p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_EnumOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::EnumOptions& EnumDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.options) + return _internal_options(); +} +inline void EnumDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::EnumOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.EnumDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::EnumOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::unsafe_arena_release_options() { + // 
@@protoc_insertion_point(field_release:google.protobuf.EnumDescriptorProto.options) + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::EnumOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000002u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.options) + return _internal_mutable_options(); +} +inline void EnumDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumOptions* options) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumDescriptorProto.options) +} + +// repeated .google.protobuf.EnumDescriptorProto.EnumReservedRange reserved_range = 4; +inline int EnumDescriptorProto::_internal_reserved_range_size() const { + return reserved_range_.size(); +} +inline int EnumDescriptorProto::reserved_range_size() const { + return _internal_reserved_range_size(); +} +inline void EnumDescriptorProto::clear_reserved_range() { + reserved_range_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* EnumDescriptorProto::mutable_reserved_range(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.reserved_range) + return 
reserved_range_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >* +EnumDescriptorProto::mutable_reserved_range() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumDescriptorProto.reserved_range) + return &reserved_range_; +} +inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange& EnumDescriptorProto::_internal_reserved_range(int index) const { + return reserved_range_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange& EnumDescriptorProto::reserved_range(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.reserved_range) + return _internal_reserved_range(index); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* EnumDescriptorProto::_internal_add_reserved_range() { + return reserved_range_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* EnumDescriptorProto::add_reserved_range() { + // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.reserved_range) + return _internal_add_reserved_range(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >& +EnumDescriptorProto::reserved_range() const { + // @@protoc_insertion_point(field_list:google.protobuf.EnumDescriptorProto.reserved_range) + return reserved_range_; +} + +// repeated string reserved_name = 5; +inline int EnumDescriptorProto::_internal_reserved_name_size() const { + return reserved_name_.size(); +} +inline int EnumDescriptorProto::reserved_name_size() const { + return _internal_reserved_name_size(); +} +inline void EnumDescriptorProto::clear_reserved_name() { + reserved_name_.Clear(); +} +inline std::string* EnumDescriptorProto::add_reserved_name() { + // @@protoc_insertion_point(field_add_mutable:google.protobuf.EnumDescriptorProto.reserved_name) + return 
_internal_add_reserved_name(); +} +inline const std::string& EnumDescriptorProto::_internal_reserved_name(int index) const { + return reserved_name_.Get(index); +} +inline const std::string& EnumDescriptorProto::reserved_name(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.reserved_name) + return _internal_reserved_name(index); +} +inline std::string* EnumDescriptorProto::mutable_reserved_name(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.reserved_name) + return reserved_name_.Mutable(index); +} +inline void EnumDescriptorProto::set_reserved_name(int index, const std::string& value) { + // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.reserved_name) + reserved_name_.Mutable(index)->assign(value); +} +inline void EnumDescriptorProto::set_reserved_name(int index, std::string&& value) { + // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.reserved_name) + reserved_name_.Mutable(index)->assign(std::move(value)); +} +inline void EnumDescriptorProto::set_reserved_name(int index, const char* value) { + GOOGLE_DCHECK(value != nullptr); + reserved_name_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:google.protobuf.EnumDescriptorProto.reserved_name) +} +inline void EnumDescriptorProto::set_reserved_name(int index, const char* value, size_t size) { + reserved_name_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.EnumDescriptorProto.reserved_name) +} +inline std::string* EnumDescriptorProto::_internal_add_reserved_name() { + return reserved_name_.Add(); +} +inline void EnumDescriptorProto::add_reserved_name(const std::string& value) { + reserved_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.reserved_name) +} +inline void EnumDescriptorProto::add_reserved_name(std::string&& 
value) { + reserved_name_.Add(std::move(value)); + // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.reserved_name) +} +inline void EnumDescriptorProto::add_reserved_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + reserved_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:google.protobuf.EnumDescriptorProto.reserved_name) +} +inline void EnumDescriptorProto::add_reserved_name(const char* value, size_t size) { + reserved_name_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:google.protobuf.EnumDescriptorProto.reserved_name) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& +EnumDescriptorProto::reserved_name() const { + // @@protoc_insertion_point(field_list:google.protobuf.EnumDescriptorProto.reserved_name) + return reserved_name_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* +EnumDescriptorProto::mutable_reserved_name() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumDescriptorProto.reserved_name) + return &reserved_name_; +} + +// ------------------------------------------------------------------- + +// EnumValueDescriptorProto + +// optional string name = 1; +inline bool EnumValueDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool EnumValueDescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void EnumValueDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& EnumValueDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumValueDescriptorProto.name) + return _internal_name(); +} +inline void EnumValueDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // 
@@protoc_insertion_point(field_set:google.protobuf.EnumValueDescriptorProto.name) +} +inline std::string* EnumValueDescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumValueDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& EnumValueDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void EnumValueDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void EnumValueDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.EnumValueDescriptorProto.name) +} +inline void EnumValueDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.EnumValueDescriptorProto.name) +} +inline void EnumValueDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.EnumValueDescriptorProto.name) +} +inline std::string* EnumValueDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* EnumValueDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.EnumValueDescriptorProto.name) + if 
(!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void EnumValueDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumValueDescriptorProto.name) +} + +// optional int32 number = 2; +inline bool EnumValueDescriptorProto::_internal_has_number() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool EnumValueDescriptorProto::has_number() const { + return _internal_has_number(); +} +inline void EnumValueDescriptorProto::clear_number() { + number_ = 0; + _has_bits_[0] &= ~0x00000004u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 EnumValueDescriptorProto::_internal_number() const { + return number_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 EnumValueDescriptorProto::number() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumValueDescriptorProto.number) + return _internal_number(); +} +inline void EnumValueDescriptorProto::_internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000004u; + number_ = value; +} +inline void EnumValueDescriptorProto::set_number(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_number(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumValueDescriptorProto.number) +} + +// optional .google.protobuf.EnumValueOptions options = 3; +inline bool EnumValueDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool EnumValueDescriptorProto::has_options() const { + return _internal_has_options(); +} 
+inline void EnumValueDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000002u; +} +inline const PROTOBUF_NAMESPACE_ID::EnumValueOptions& EnumValueDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::EnumValueOptions* p = options_; + return p != nullptr ? *p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_EnumValueOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::EnumValueOptions& EnumValueDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumValueDescriptorProto.options) + return _internal_options(); +} +inline void EnumValueDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::EnumValueOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.EnumValueDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::EnumValueOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.EnumValueDescriptorProto.options) + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::EnumValueOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000002u; + if (options_ == nullptr) { + auto* p = 
CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumValueDescriptorProto.options) + return _internal_mutable_options(); +} +inline void EnumValueDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumValueOptions* options) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumValueDescriptorProto.options) +} + +// ------------------------------------------------------------------- + +// ServiceDescriptorProto + +// optional string name = 1; +inline bool ServiceDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool ServiceDescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void ServiceDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& ServiceDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.ServiceDescriptorProto.name) + return _internal_name(); +} +inline void ServiceDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.ServiceDescriptorProto.name) +} +inline std::string* 
ServiceDescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& ServiceDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void ServiceDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void ServiceDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.ServiceDescriptorProto.name) +} +inline void ServiceDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.ServiceDescriptorProto.name) +} +inline void ServiceDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.ServiceDescriptorProto.name) +} +inline std::string* ServiceDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* ServiceDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.ServiceDescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return 
name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void ServiceDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.ServiceDescriptorProto.name) +} + +// repeated .google.protobuf.MethodDescriptorProto method = 2; +inline int ServiceDescriptorProto::_internal_method_size() const { + return method_.size(); +} +inline int ServiceDescriptorProto::method_size() const { + return _internal_method_size(); +} +inline void ServiceDescriptorProto::clear_method() { + method_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* ServiceDescriptorProto::mutable_method(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceDescriptorProto.method) + return method_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >* +ServiceDescriptorProto::mutable_method() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.ServiceDescriptorProto.method) + return &method_; +} +inline const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto& ServiceDescriptorProto::_internal_method(int index) const { + return method_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto& ServiceDescriptorProto::method(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.ServiceDescriptorProto.method) + return _internal_method(index); +} +inline PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* ServiceDescriptorProto::_internal_add_method() { + return method_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* ServiceDescriptorProto::add_method() { + // 
@@protoc_insertion_point(field_add:google.protobuf.ServiceDescriptorProto.method) + return _internal_add_method(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >& +ServiceDescriptorProto::method() const { + // @@protoc_insertion_point(field_list:google.protobuf.ServiceDescriptorProto.method) + return method_; +} + +// optional .google.protobuf.ServiceOptions options = 3; +inline bool ServiceDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool ServiceDescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void ServiceDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000002u; +} +inline const PROTOBUF_NAMESPACE_ID::ServiceOptions& ServiceDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::ServiceOptions* p = options_; + return p != nullptr ? 
*p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_ServiceOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::ServiceOptions& ServiceDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.ServiceDescriptorProto.options) + return _internal_options(); +} +inline void ServiceDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::ServiceOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.ServiceDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::ServiceOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.ServiceDescriptorProto.options) + _has_bits_[0] &= ~0x00000002u; + PROTOBUF_NAMESPACE_ID::ServiceOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000002u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceDescriptorProto.options) + return _internal_mutable_options(); +} +inline void ServiceDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::ServiceOptions* 
options) { + ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.ServiceDescriptorProto.options) +} + +// ------------------------------------------------------------------- + +// MethodDescriptorProto + +// optional string name = 1; +inline bool MethodDescriptorProto::_internal_has_name() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool MethodDescriptorProto::has_name() const { + return _internal_has_name(); +} +inline void MethodDescriptorProto::clear_name() { + name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& MethodDescriptorProto::name() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.name) + return _internal_name(); +} +inline void MethodDescriptorProto::set_name(const std::string& value) { + _internal_set_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.name) +} +inline std::string* MethodDescriptorProto::mutable_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.name) + return _internal_mutable_name(); +} +inline const std::string& MethodDescriptorProto::_internal_name() const { + return name_.Get(); +} +inline void MethodDescriptorProto::_internal_set_name(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + 
name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void MethodDescriptorProto::set_name(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.MethodDescriptorProto.name) +} +inline void MethodDescriptorProto::set_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.MethodDescriptorProto.name) +} +inline void MethodDescriptorProto::set_name(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.MethodDescriptorProto.name) +} +inline std::string* MethodDescriptorProto::_internal_mutable_name() { + _has_bits_[0] |= 0x00000001u; + return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* MethodDescriptorProto::release_name() { + // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.name) + if (!_internal_has_name()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void MethodDescriptorProto::set_allocated_name(std::string* name) { + if (name != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name, + GetArena()); + // 
@@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.name) +} + +// optional string input_type = 2; +inline bool MethodDescriptorProto::_internal_has_input_type() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool MethodDescriptorProto::has_input_type() const { + return _internal_has_input_type(); +} +inline void MethodDescriptorProto::clear_input_type() { + input_type_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000002u; +} +inline const std::string& MethodDescriptorProto::input_type() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.input_type) + return _internal_input_type(); +} +inline void MethodDescriptorProto::set_input_type(const std::string& value) { + _internal_set_input_type(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.input_type) +} +inline std::string* MethodDescriptorProto::mutable_input_type() { + // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.input_type) + return _internal_mutable_input_type(); +} +inline const std::string& MethodDescriptorProto::_internal_input_type() const { + return input_type_.Get(); +} +inline void MethodDescriptorProto::_internal_set_input_type(const std::string& value) { + _has_bits_[0] |= 0x00000002u; + input_type_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void MethodDescriptorProto::set_input_type(std::string&& value) { + _has_bits_[0] |= 0x00000002u; + input_type_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.MethodDescriptorProto.input_type) +} +inline void MethodDescriptorProto::set_input_type(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000002u; + 
input_type_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.MethodDescriptorProto.input_type) +} +inline void MethodDescriptorProto::set_input_type(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000002u; + input_type_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.MethodDescriptorProto.input_type) +} +inline std::string* MethodDescriptorProto::_internal_mutable_input_type() { + _has_bits_[0] |= 0x00000002u; + return input_type_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* MethodDescriptorProto::release_input_type() { + // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.input_type) + if (!_internal_has_input_type()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000002u; + return input_type_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void MethodDescriptorProto::set_allocated_input_type(std::string* input_type) { + if (input_type != nullptr) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + input_type_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), input_type, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.input_type) +} + +// optional string output_type = 3; +inline bool MethodDescriptorProto::_internal_has_output_type() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool MethodDescriptorProto::has_output_type() const { + return _internal_has_output_type(); +} +inline void MethodDescriptorProto::clear_output_type() { + 
output_type_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000004u; +} +inline const std::string& MethodDescriptorProto::output_type() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.output_type) + return _internal_output_type(); +} +inline void MethodDescriptorProto::set_output_type(const std::string& value) { + _internal_set_output_type(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.output_type) +} +inline std::string* MethodDescriptorProto::mutable_output_type() { + // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.output_type) + return _internal_mutable_output_type(); +} +inline const std::string& MethodDescriptorProto::_internal_output_type() const { + return output_type_.Get(); +} +inline void MethodDescriptorProto::_internal_set_output_type(const std::string& value) { + _has_bits_[0] |= 0x00000004u; + output_type_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void MethodDescriptorProto::set_output_type(std::string&& value) { + _has_bits_[0] |= 0x00000004u; + output_type_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.MethodDescriptorProto.output_type) +} +inline void MethodDescriptorProto::set_output_type(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000004u; + output_type_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.MethodDescriptorProto.output_type) +} +inline void MethodDescriptorProto::set_output_type(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000004u; + 
output_type_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.MethodDescriptorProto.output_type) +} +inline std::string* MethodDescriptorProto::_internal_mutable_output_type() { + _has_bits_[0] |= 0x00000004u; + return output_type_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* MethodDescriptorProto::release_output_type() { + // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.output_type) + if (!_internal_has_output_type()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000004u; + return output_type_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void MethodDescriptorProto::set_allocated_output_type(std::string* output_type) { + if (output_type != nullptr) { + _has_bits_[0] |= 0x00000004u; + } else { + _has_bits_[0] &= ~0x00000004u; + } + output_type_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), output_type, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.output_type) +} + +// optional .google.protobuf.MethodOptions options = 4; +inline bool MethodDescriptorProto::_internal_has_options() const { + bool value = (_has_bits_[0] & 0x00000008u) != 0; + PROTOBUF_ASSUME(!value || options_ != nullptr); + return value; +} +inline bool MethodDescriptorProto::has_options() const { + return _internal_has_options(); +} +inline void MethodDescriptorProto::clear_options() { + if (options_ != nullptr) options_->Clear(); + _has_bits_[0] &= ~0x00000008u; +} +inline const PROTOBUF_NAMESPACE_ID::MethodOptions& MethodDescriptorProto::_internal_options() const { + const PROTOBUF_NAMESPACE_ID::MethodOptions* p = options_; + return p != nullptr ? 
*p : *reinterpret_cast( + &PROTOBUF_NAMESPACE_ID::_MethodOptions_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::MethodOptions& MethodDescriptorProto::options() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.options) + return _internal_options(); +} +inline void MethodDescriptorProto::unsafe_arena_set_allocated_options( + PROTOBUF_NAMESPACE_ID::MethodOptions* options) { + if (GetArena() == nullptr) { + delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_); + } + options_ = options; + if (options) { + _has_bits_[0] |= 0x00000008u; + } else { + _has_bits_[0] &= ~0x00000008u; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.MethodDescriptorProto.options) +} +inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::release_options() { + _has_bits_[0] &= ~0x00000008u; + PROTOBUF_NAMESPACE_ID::MethodOptions* temp = options_; + options_ = nullptr; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + return temp; +} +inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::unsafe_arena_release_options() { + // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.options) + _has_bits_[0] &= ~0x00000008u; + PROTOBUF_NAMESPACE_ID::MethodOptions* temp = options_; + options_ = nullptr; + return temp; +} +inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::_internal_mutable_options() { + _has_bits_[0] |= 0x00000008u; + if (options_ == nullptr) { + auto* p = CreateMaybeMessage(GetArena()); + options_ = p; + } + return options_; +} +inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::mutable_options() { + // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.options) + return _internal_mutable_options(); +} +inline void MethodDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::MethodOptions* options) { + 
::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena(); + if (message_arena == nullptr) { + delete options_; + } + if (options) { + ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = + ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options); + if (message_arena != submessage_arena) { + options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( + message_arena, options, submessage_arena); + } + _has_bits_[0] |= 0x00000008u; + } else { + _has_bits_[0] &= ~0x00000008u; + } + options_ = options; + // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.options) +} + +// optional bool client_streaming = 5 [default = false]; +inline bool MethodDescriptorProto::_internal_has_client_streaming() const { + bool value = (_has_bits_[0] & 0x00000010u) != 0; + return value; +} +inline bool MethodDescriptorProto::has_client_streaming() const { + return _internal_has_client_streaming(); +} +inline void MethodDescriptorProto::clear_client_streaming() { + client_streaming_ = false; + _has_bits_[0] &= ~0x00000010u; +} +inline bool MethodDescriptorProto::_internal_client_streaming() const { + return client_streaming_; +} +inline bool MethodDescriptorProto::client_streaming() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.client_streaming) + return _internal_client_streaming(); +} +inline void MethodDescriptorProto::_internal_set_client_streaming(bool value) { + _has_bits_[0] |= 0x00000010u; + client_streaming_ = value; +} +inline void MethodDescriptorProto::set_client_streaming(bool value) { + _internal_set_client_streaming(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.client_streaming) +} + +// optional bool server_streaming = 6 [default = false]; +inline bool MethodDescriptorProto::_internal_has_server_streaming() const { + bool value = (_has_bits_[0] & 0x00000020u) != 0; + return value; +} +inline bool MethodDescriptorProto::has_server_streaming() const { + return 
_internal_has_server_streaming(); +} +inline void MethodDescriptorProto::clear_server_streaming() { + server_streaming_ = false; + _has_bits_[0] &= ~0x00000020u; +} +inline bool MethodDescriptorProto::_internal_server_streaming() const { + return server_streaming_; +} +inline bool MethodDescriptorProto::server_streaming() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.server_streaming) + return _internal_server_streaming(); +} +inline void MethodDescriptorProto::_internal_set_server_streaming(bool value) { + _has_bits_[0] |= 0x00000020u; + server_streaming_ = value; +} +inline void MethodDescriptorProto::set_server_streaming(bool value) { + _internal_set_server_streaming(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.server_streaming) +} + +// ------------------------------------------------------------------- + +// FileOptions + +// optional string java_package = 1; +inline bool FileOptions::_internal_has_java_package() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool FileOptions::has_java_package() const { + return _internal_has_java_package(); +} +inline void FileOptions::clear_java_package() { + java_package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& FileOptions::java_package() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_package) + return _internal_java_package(); +} +inline void FileOptions::set_java_package(const std::string& value) { + _internal_set_java_package(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_package) +} +inline std::string* FileOptions::mutable_java_package() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.java_package) + return _internal_mutable_java_package(); +} +inline const std::string& 
FileOptions::_internal_java_package() const { + return java_package_.Get(); +} +inline void FileOptions::_internal_set_java_package(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + java_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_java_package(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + java_package_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.java_package) +} +inline void FileOptions::set_java_package(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + java_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.java_package) +} +inline void FileOptions::set_java_package(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + java_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.java_package) +} +inline std::string* FileOptions::_internal_mutable_java_package() { + _has_bits_[0] |= 0x00000001u; + return java_package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_java_package() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.java_package) + if (!_internal_has_java_package()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return java_package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_java_package(std::string* java_package) { + if (java_package != 
nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + java_package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), java_package, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.java_package) +} + +// optional string java_outer_classname = 8; +inline bool FileOptions::_internal_has_java_outer_classname() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool FileOptions::has_java_outer_classname() const { + return _internal_has_java_outer_classname(); +} +inline void FileOptions::clear_java_outer_classname() { + java_outer_classname_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000002u; +} +inline const std::string& FileOptions::java_outer_classname() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_outer_classname) + return _internal_java_outer_classname(); +} +inline void FileOptions::set_java_outer_classname(const std::string& value) { + _internal_set_java_outer_classname(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_outer_classname) +} +inline std::string* FileOptions::mutable_java_outer_classname() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.java_outer_classname) + return _internal_mutable_java_outer_classname(); +} +inline const std::string& FileOptions::_internal_java_outer_classname() const { + return java_outer_classname_.Get(); +} +inline void FileOptions::_internal_set_java_outer_classname(const std::string& value) { + _has_bits_[0] |= 0x00000002u; + java_outer_classname_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_java_outer_classname(std::string&& value) { + _has_bits_[0] |= 0x00000002u; + java_outer_classname_.Set( + 
&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.java_outer_classname) +} +inline void FileOptions::set_java_outer_classname(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000002u; + java_outer_classname_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.java_outer_classname) +} +inline void FileOptions::set_java_outer_classname(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000002u; + java_outer_classname_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.java_outer_classname) +} +inline std::string* FileOptions::_internal_mutable_java_outer_classname() { + _has_bits_[0] |= 0x00000002u; + return java_outer_classname_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_java_outer_classname() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.java_outer_classname) + if (!_internal_has_java_outer_classname()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000002u; + return java_outer_classname_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_java_outer_classname(std::string* java_outer_classname) { + if (java_outer_classname != nullptr) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + java_outer_classname_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), java_outer_classname, + GetArena()); + // 
@@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.java_outer_classname) +} + +// optional bool java_multiple_files = 10 [default = false]; +inline bool FileOptions::_internal_has_java_multiple_files() const { + bool value = (_has_bits_[0] & 0x00000400u) != 0; + return value; +} +inline bool FileOptions::has_java_multiple_files() const { + return _internal_has_java_multiple_files(); +} +inline void FileOptions::clear_java_multiple_files() { + java_multiple_files_ = false; + _has_bits_[0] &= ~0x00000400u; +} +inline bool FileOptions::_internal_java_multiple_files() const { + return java_multiple_files_; +} +inline bool FileOptions::java_multiple_files() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_multiple_files) + return _internal_java_multiple_files(); +} +inline void FileOptions::_internal_set_java_multiple_files(bool value) { + _has_bits_[0] |= 0x00000400u; + java_multiple_files_ = value; +} +inline void FileOptions::set_java_multiple_files(bool value) { + _internal_set_java_multiple_files(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_multiple_files) +} + +// optional bool java_generate_equals_and_hash = 20 [deprecated = true]; +inline bool FileOptions::_internal_has_java_generate_equals_and_hash() const { + bool value = (_has_bits_[0] & 0x00000800u) != 0; + return value; +} +inline bool FileOptions::has_java_generate_equals_and_hash() const { + return _internal_has_java_generate_equals_and_hash(); +} +inline void FileOptions::clear_java_generate_equals_and_hash() { + java_generate_equals_and_hash_ = false; + _has_bits_[0] &= ~0x00000800u; +} +inline bool FileOptions::_internal_java_generate_equals_and_hash() const { + return java_generate_equals_and_hash_; +} +inline bool FileOptions::java_generate_equals_and_hash() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_generate_equals_and_hash) + return 
_internal_java_generate_equals_and_hash(); +} +inline void FileOptions::_internal_set_java_generate_equals_and_hash(bool value) { + _has_bits_[0] |= 0x00000800u; + java_generate_equals_and_hash_ = value; +} +inline void FileOptions::set_java_generate_equals_and_hash(bool value) { + _internal_set_java_generate_equals_and_hash(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_generate_equals_and_hash) +} + +// optional bool java_string_check_utf8 = 27 [default = false]; +inline bool FileOptions::_internal_has_java_string_check_utf8() const { + bool value = (_has_bits_[0] & 0x00001000u) != 0; + return value; +} +inline bool FileOptions::has_java_string_check_utf8() const { + return _internal_has_java_string_check_utf8(); +} +inline void FileOptions::clear_java_string_check_utf8() { + java_string_check_utf8_ = false; + _has_bits_[0] &= ~0x00001000u; +} +inline bool FileOptions::_internal_java_string_check_utf8() const { + return java_string_check_utf8_; +} +inline bool FileOptions::java_string_check_utf8() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_string_check_utf8) + return _internal_java_string_check_utf8(); +} +inline void FileOptions::_internal_set_java_string_check_utf8(bool value) { + _has_bits_[0] |= 0x00001000u; + java_string_check_utf8_ = value; +} +inline void FileOptions::set_java_string_check_utf8(bool value) { + _internal_set_java_string_check_utf8(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_string_check_utf8) +} + +// optional .google.protobuf.FileOptions.OptimizeMode optimize_for = 9 [default = SPEED]; +inline bool FileOptions::_internal_has_optimize_for() const { + bool value = (_has_bits_[0] & 0x00040000u) != 0; + return value; +} +inline bool FileOptions::has_optimize_for() const { + return _internal_has_optimize_for(); +} +inline void FileOptions::clear_optimize_for() { + optimize_for_ = 1; + _has_bits_[0] &= ~0x00040000u; +} +inline 
PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode FileOptions::_internal_optimize_for() const { + return static_cast< PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode >(optimize_for_); +} +inline PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode FileOptions::optimize_for() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.optimize_for) + return _internal_optimize_for(); +} +inline void FileOptions::_internal_set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value) { + assert(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode_IsValid(value)); + _has_bits_[0] |= 0x00040000u; + optimize_for_ = value; +} +inline void FileOptions::set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value) { + _internal_set_optimize_for(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.optimize_for) +} + +// optional string go_package = 11; +inline bool FileOptions::_internal_has_go_package() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool FileOptions::has_go_package() const { + return _internal_has_go_package(); +} +inline void FileOptions::clear_go_package() { + go_package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000004u; +} +inline const std::string& FileOptions::go_package() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.go_package) + return _internal_go_package(); +} +inline void FileOptions::set_go_package(const std::string& value) { + _internal_set_go_package(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.go_package) +} +inline std::string* FileOptions::mutable_go_package() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.go_package) + return _internal_mutable_go_package(); +} +inline const std::string& FileOptions::_internal_go_package() const { + return go_package_.Get(); +} +inline void 
FileOptions::_internal_set_go_package(const std::string& value) { + _has_bits_[0] |= 0x00000004u; + go_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_go_package(std::string&& value) { + _has_bits_[0] |= 0x00000004u; + go_package_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.go_package) +} +inline void FileOptions::set_go_package(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000004u; + go_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.go_package) +} +inline void FileOptions::set_go_package(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000004u; + go_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.go_package) +} +inline std::string* FileOptions::_internal_mutable_go_package() { + _has_bits_[0] |= 0x00000004u; + return go_package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_go_package() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.go_package) + if (!_internal_has_go_package()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000004u; + return go_package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_go_package(std::string* go_package) { + if (go_package != nullptr) { + _has_bits_[0] |= 0x00000004u; + } else { + _has_bits_[0] &= ~0x00000004u; + } + 
go_package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), go_package, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.go_package) +} + +// optional bool cc_generic_services = 16 [default = false]; +inline bool FileOptions::_internal_has_cc_generic_services() const { + bool value = (_has_bits_[0] & 0x00002000u) != 0; + return value; +} +inline bool FileOptions::has_cc_generic_services() const { + return _internal_has_cc_generic_services(); +} +inline void FileOptions::clear_cc_generic_services() { + cc_generic_services_ = false; + _has_bits_[0] &= ~0x00002000u; +} +inline bool FileOptions::_internal_cc_generic_services() const { + return cc_generic_services_; +} +inline bool FileOptions::cc_generic_services() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.cc_generic_services) + return _internal_cc_generic_services(); +} +inline void FileOptions::_internal_set_cc_generic_services(bool value) { + _has_bits_[0] |= 0x00002000u; + cc_generic_services_ = value; +} +inline void FileOptions::set_cc_generic_services(bool value) { + _internal_set_cc_generic_services(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.cc_generic_services) +} + +// optional bool java_generic_services = 17 [default = false]; +inline bool FileOptions::_internal_has_java_generic_services() const { + bool value = (_has_bits_[0] & 0x00004000u) != 0; + return value; +} +inline bool FileOptions::has_java_generic_services() const { + return _internal_has_java_generic_services(); +} +inline void FileOptions::clear_java_generic_services() { + java_generic_services_ = false; + _has_bits_[0] &= ~0x00004000u; +} +inline bool FileOptions::_internal_java_generic_services() const { + return java_generic_services_; +} +inline bool FileOptions::java_generic_services() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_generic_services) + return 
_internal_java_generic_services(); +} +inline void FileOptions::_internal_set_java_generic_services(bool value) { + _has_bits_[0] |= 0x00004000u; + java_generic_services_ = value; +} +inline void FileOptions::set_java_generic_services(bool value) { + _internal_set_java_generic_services(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_generic_services) +} + +// optional bool py_generic_services = 18 [default = false]; +inline bool FileOptions::_internal_has_py_generic_services() const { + bool value = (_has_bits_[0] & 0x00008000u) != 0; + return value; +} +inline bool FileOptions::has_py_generic_services() const { + return _internal_has_py_generic_services(); +} +inline void FileOptions::clear_py_generic_services() { + py_generic_services_ = false; + _has_bits_[0] &= ~0x00008000u; +} +inline bool FileOptions::_internal_py_generic_services() const { + return py_generic_services_; +} +inline bool FileOptions::py_generic_services() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.py_generic_services) + return _internal_py_generic_services(); +} +inline void FileOptions::_internal_set_py_generic_services(bool value) { + _has_bits_[0] |= 0x00008000u; + py_generic_services_ = value; +} +inline void FileOptions::set_py_generic_services(bool value) { + _internal_set_py_generic_services(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.py_generic_services) +} + +// optional bool php_generic_services = 42 [default = false]; +inline bool FileOptions::_internal_has_php_generic_services() const { + bool value = (_has_bits_[0] & 0x00010000u) != 0; + return value; +} +inline bool FileOptions::has_php_generic_services() const { + return _internal_has_php_generic_services(); +} +inline void FileOptions::clear_php_generic_services() { + php_generic_services_ = false; + _has_bits_[0] &= ~0x00010000u; +} +inline bool FileOptions::_internal_php_generic_services() const { + return php_generic_services_; 
+} +inline bool FileOptions::php_generic_services() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_generic_services) + return _internal_php_generic_services(); +} +inline void FileOptions::_internal_set_php_generic_services(bool value) { + _has_bits_[0] |= 0x00010000u; + php_generic_services_ = value; +} +inline void FileOptions::set_php_generic_services(bool value) { + _internal_set_php_generic_services(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_generic_services) +} + +// optional bool deprecated = 23 [default = false]; +inline bool FileOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 0x00020000u) != 0; + return value; +} +inline bool FileOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void FileOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00020000u; +} +inline bool FileOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool FileOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.deprecated) + return _internal_deprecated(); +} +inline void FileOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00020000u; + deprecated_ = value; +} +inline void FileOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.deprecated) +} + +// optional bool cc_enable_arenas = 31 [default = true]; +inline bool FileOptions::_internal_has_cc_enable_arenas() const { + bool value = (_has_bits_[0] & 0x00080000u) != 0; + return value; +} +inline bool FileOptions::has_cc_enable_arenas() const { + return _internal_has_cc_enable_arenas(); +} +inline void FileOptions::clear_cc_enable_arenas() { + cc_enable_arenas_ = true; + _has_bits_[0] &= ~0x00080000u; +} +inline bool FileOptions::_internal_cc_enable_arenas() const { + return cc_enable_arenas_; +} +inline bool 
FileOptions::cc_enable_arenas() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.cc_enable_arenas) + return _internal_cc_enable_arenas(); +} +inline void FileOptions::_internal_set_cc_enable_arenas(bool value) { + _has_bits_[0] |= 0x00080000u; + cc_enable_arenas_ = value; +} +inline void FileOptions::set_cc_enable_arenas(bool value) { + _internal_set_cc_enable_arenas(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.cc_enable_arenas) +} + +// optional string objc_class_prefix = 36; +inline bool FileOptions::_internal_has_objc_class_prefix() const { + bool value = (_has_bits_[0] & 0x00000008u) != 0; + return value; +} +inline bool FileOptions::has_objc_class_prefix() const { + return _internal_has_objc_class_prefix(); +} +inline void FileOptions::clear_objc_class_prefix() { + objc_class_prefix_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000008u; +} +inline const std::string& FileOptions::objc_class_prefix() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.objc_class_prefix) + return _internal_objc_class_prefix(); +} +inline void FileOptions::set_objc_class_prefix(const std::string& value) { + _internal_set_objc_class_prefix(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.objc_class_prefix) +} +inline std::string* FileOptions::mutable_objc_class_prefix() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.objc_class_prefix) + return _internal_mutable_objc_class_prefix(); +} +inline const std::string& FileOptions::_internal_objc_class_prefix() const { + return objc_class_prefix_.Get(); +} +inline void FileOptions::_internal_set_objc_class_prefix(const std::string& value) { + _has_bits_[0] |= 0x00000008u; + objc_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void 
FileOptions::set_objc_class_prefix(std::string&& value) { + _has_bits_[0] |= 0x00000008u; + objc_class_prefix_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.objc_class_prefix) +} +inline void FileOptions::set_objc_class_prefix(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000008u; + objc_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.objc_class_prefix) +} +inline void FileOptions::set_objc_class_prefix(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000008u; + objc_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.objc_class_prefix) +} +inline std::string* FileOptions::_internal_mutable_objc_class_prefix() { + _has_bits_[0] |= 0x00000008u; + return objc_class_prefix_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_objc_class_prefix() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.objc_class_prefix) + if (!_internal_has_objc_class_prefix()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000008u; + return objc_class_prefix_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_objc_class_prefix(std::string* objc_class_prefix) { + if (objc_class_prefix != nullptr) { + _has_bits_[0] |= 0x00000008u; + } else { + _has_bits_[0] &= ~0x00000008u; + } + objc_class_prefix_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), objc_class_prefix, + GetArena()); + 
// @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.objc_class_prefix) +} + +// optional string csharp_namespace = 37; +inline bool FileOptions::_internal_has_csharp_namespace() const { + bool value = (_has_bits_[0] & 0x00000010u) != 0; + return value; +} +inline bool FileOptions::has_csharp_namespace() const { + return _internal_has_csharp_namespace(); +} +inline void FileOptions::clear_csharp_namespace() { + csharp_namespace_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000010u; +} +inline const std::string& FileOptions::csharp_namespace() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.csharp_namespace) + return _internal_csharp_namespace(); +} +inline void FileOptions::set_csharp_namespace(const std::string& value) { + _internal_set_csharp_namespace(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.csharp_namespace) +} +inline std::string* FileOptions::mutable_csharp_namespace() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.csharp_namespace) + return _internal_mutable_csharp_namespace(); +} +inline const std::string& FileOptions::_internal_csharp_namespace() const { + return csharp_namespace_.Get(); +} +inline void FileOptions::_internal_set_csharp_namespace(const std::string& value) { + _has_bits_[0] |= 0x00000010u; + csharp_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_csharp_namespace(std::string&& value) { + _has_bits_[0] |= 0x00000010u; + csharp_namespace_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.csharp_namespace) +} +inline void FileOptions::set_csharp_namespace(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000010u; + 
csharp_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.csharp_namespace) +} +inline void FileOptions::set_csharp_namespace(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000010u; + csharp_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.csharp_namespace) +} +inline std::string* FileOptions::_internal_mutable_csharp_namespace() { + _has_bits_[0] |= 0x00000010u; + return csharp_namespace_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_csharp_namespace() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.csharp_namespace) + if (!_internal_has_csharp_namespace()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000010u; + return csharp_namespace_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_csharp_namespace(std::string* csharp_namespace) { + if (csharp_namespace != nullptr) { + _has_bits_[0] |= 0x00000010u; + } else { + _has_bits_[0] &= ~0x00000010u; + } + csharp_namespace_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), csharp_namespace, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.csharp_namespace) +} + +// optional string swift_prefix = 39; +inline bool FileOptions::_internal_has_swift_prefix() const { + bool value = (_has_bits_[0] & 0x00000020u) != 0; + return value; +} +inline bool FileOptions::has_swift_prefix() const { + return _internal_has_swift_prefix(); +} +inline void FileOptions::clear_swift_prefix() { + 
swift_prefix_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000020u; +} +inline const std::string& FileOptions::swift_prefix() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.swift_prefix) + return _internal_swift_prefix(); +} +inline void FileOptions::set_swift_prefix(const std::string& value) { + _internal_set_swift_prefix(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.swift_prefix) +} +inline std::string* FileOptions::mutable_swift_prefix() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.swift_prefix) + return _internal_mutable_swift_prefix(); +} +inline const std::string& FileOptions::_internal_swift_prefix() const { + return swift_prefix_.Get(); +} +inline void FileOptions::_internal_set_swift_prefix(const std::string& value) { + _has_bits_[0] |= 0x00000020u; + swift_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_swift_prefix(std::string&& value) { + _has_bits_[0] |= 0x00000020u; + swift_prefix_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.swift_prefix) +} +inline void FileOptions::set_swift_prefix(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000020u; + swift_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.swift_prefix) +} +inline void FileOptions::set_swift_prefix(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000020u; + swift_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // 
@@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.swift_prefix) +} +inline std::string* FileOptions::_internal_mutable_swift_prefix() { + _has_bits_[0] |= 0x00000020u; + return swift_prefix_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_swift_prefix() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.swift_prefix) + if (!_internal_has_swift_prefix()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000020u; + return swift_prefix_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_swift_prefix(std::string* swift_prefix) { + if (swift_prefix != nullptr) { + _has_bits_[0] |= 0x00000020u; + } else { + _has_bits_[0] &= ~0x00000020u; + } + swift_prefix_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), swift_prefix, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.swift_prefix) +} + +// optional string php_class_prefix = 40; +inline bool FileOptions::_internal_has_php_class_prefix() const { + bool value = (_has_bits_[0] & 0x00000040u) != 0; + return value; +} +inline bool FileOptions::has_php_class_prefix() const { + return _internal_has_php_class_prefix(); +} +inline void FileOptions::clear_php_class_prefix() { + php_class_prefix_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000040u; +} +inline const std::string& FileOptions::php_class_prefix() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_class_prefix) + return _internal_php_class_prefix(); +} +inline void FileOptions::set_php_class_prefix(const std::string& value) { + _internal_set_php_class_prefix(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_class_prefix) +} +inline std::string* 
FileOptions::mutable_php_class_prefix() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.php_class_prefix) + return _internal_mutable_php_class_prefix(); +} +inline const std::string& FileOptions::_internal_php_class_prefix() const { + return php_class_prefix_.Get(); +} +inline void FileOptions::_internal_set_php_class_prefix(const std::string& value) { + _has_bits_[0] |= 0x00000040u; + php_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_php_class_prefix(std::string&& value) { + _has_bits_[0] |= 0x00000040u; + php_class_prefix_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.php_class_prefix) +} +inline void FileOptions::set_php_class_prefix(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000040u; + php_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.php_class_prefix) +} +inline void FileOptions::set_php_class_prefix(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000040u; + php_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.php_class_prefix) +} +inline std::string* FileOptions::_internal_mutable_php_class_prefix() { + _has_bits_[0] |= 0x00000040u; + return php_class_prefix_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_php_class_prefix() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.php_class_prefix) + if (!_internal_has_php_class_prefix()) { + 
return nullptr; + } + _has_bits_[0] &= ~0x00000040u; + return php_class_prefix_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_php_class_prefix(std::string* php_class_prefix) { + if (php_class_prefix != nullptr) { + _has_bits_[0] |= 0x00000040u; + } else { + _has_bits_[0] &= ~0x00000040u; + } + php_class_prefix_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), php_class_prefix, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.php_class_prefix) +} + +// optional string php_namespace = 41; +inline bool FileOptions::_internal_has_php_namespace() const { + bool value = (_has_bits_[0] & 0x00000080u) != 0; + return value; +} +inline bool FileOptions::has_php_namespace() const { + return _internal_has_php_namespace(); +} +inline void FileOptions::clear_php_namespace() { + php_namespace_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000080u; +} +inline const std::string& FileOptions::php_namespace() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_namespace) + return _internal_php_namespace(); +} +inline void FileOptions::set_php_namespace(const std::string& value) { + _internal_set_php_namespace(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_namespace) +} +inline std::string* FileOptions::mutable_php_namespace() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.php_namespace) + return _internal_mutable_php_namespace(); +} +inline const std::string& FileOptions::_internal_php_namespace() const { + return php_namespace_.Get(); +} +inline void FileOptions::_internal_set_php_namespace(const std::string& value) { + _has_bits_[0] |= 0x00000080u; + php_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} 
+inline void FileOptions::set_php_namespace(std::string&& value) { + _has_bits_[0] |= 0x00000080u; + php_namespace_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.php_namespace) +} +inline void FileOptions::set_php_namespace(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000080u; + php_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.php_namespace) +} +inline void FileOptions::set_php_namespace(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000080u; + php_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.php_namespace) +} +inline std::string* FileOptions::_internal_mutable_php_namespace() { + _has_bits_[0] |= 0x00000080u; + return php_namespace_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_php_namespace() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.php_namespace) + if (!_internal_has_php_namespace()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000080u; + return php_namespace_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_php_namespace(std::string* php_namespace) { + if (php_namespace != nullptr) { + _has_bits_[0] |= 0x00000080u; + } else { + _has_bits_[0] &= ~0x00000080u; + } + php_namespace_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), php_namespace, + GetArena()); + // 
@@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.php_namespace) +} + +// optional string php_metadata_namespace = 44; +inline bool FileOptions::_internal_has_php_metadata_namespace() const { + bool value = (_has_bits_[0] & 0x00000100u) != 0; + return value; +} +inline bool FileOptions::has_php_metadata_namespace() const { + return _internal_has_php_metadata_namespace(); +} +inline void FileOptions::clear_php_metadata_namespace() { + php_metadata_namespace_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000100u; +} +inline const std::string& FileOptions::php_metadata_namespace() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_metadata_namespace) + return _internal_php_metadata_namespace(); +} +inline void FileOptions::set_php_metadata_namespace(const std::string& value) { + _internal_set_php_metadata_namespace(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_metadata_namespace) +} +inline std::string* FileOptions::mutable_php_metadata_namespace() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.php_metadata_namespace) + return _internal_mutable_php_metadata_namespace(); +} +inline const std::string& FileOptions::_internal_php_metadata_namespace() const { + return php_metadata_namespace_.Get(); +} +inline void FileOptions::_internal_set_php_metadata_namespace(const std::string& value) { + _has_bits_[0] |= 0x00000100u; + php_metadata_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_php_metadata_namespace(std::string&& value) { + _has_bits_[0] |= 0x00000100u; + php_metadata_namespace_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.php_metadata_namespace) +} +inline void 
FileOptions::set_php_metadata_namespace(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000100u; + php_metadata_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.php_metadata_namespace) +} +inline void FileOptions::set_php_metadata_namespace(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000100u; + php_metadata_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.php_metadata_namespace) +} +inline std::string* FileOptions::_internal_mutable_php_metadata_namespace() { + _has_bits_[0] |= 0x00000100u; + return php_metadata_namespace_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_php_metadata_namespace() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.php_metadata_namespace) + if (!_internal_has_php_metadata_namespace()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000100u; + return php_metadata_namespace_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_php_metadata_namespace(std::string* php_metadata_namespace) { + if (php_metadata_namespace != nullptr) { + _has_bits_[0] |= 0x00000100u; + } else { + _has_bits_[0] &= ~0x00000100u; + } + php_metadata_namespace_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), php_metadata_namespace, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.php_metadata_namespace) +} + +// optional string ruby_package = 45; +inline bool FileOptions::_internal_has_ruby_package() const { + bool value = (_has_bits_[0] & 
0x00000200u) != 0; + return value; +} +inline bool FileOptions::has_ruby_package() const { + return _internal_has_ruby_package(); +} +inline void FileOptions::clear_ruby_package() { + ruby_package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000200u; +} +inline const std::string& FileOptions::ruby_package() const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.ruby_package) + return _internal_ruby_package(); +} +inline void FileOptions::set_ruby_package(const std::string& value) { + _internal_set_ruby_package(value); + // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.ruby_package) +} +inline std::string* FileOptions::mutable_ruby_package() { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.ruby_package) + return _internal_mutable_ruby_package(); +} +inline const std::string& FileOptions::_internal_ruby_package() const { + return ruby_package_.Get(); +} +inline void FileOptions::_internal_set_ruby_package(const std::string& value) { + _has_bits_[0] |= 0x00000200u; + ruby_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void FileOptions::set_ruby_package(std::string&& value) { + _has_bits_[0] |= 0x00000200u; + ruby_package_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.ruby_package) +} +inline void FileOptions::set_ruby_package(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000200u; + ruby_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.ruby_package) +} +inline void FileOptions::set_ruby_package(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000200u; + 
ruby_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.ruby_package) +} +inline std::string* FileOptions::_internal_mutable_ruby_package() { + _has_bits_[0] |= 0x00000200u; + return ruby_package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* FileOptions::release_ruby_package() { + // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.ruby_package) + if (!_internal_has_ruby_package()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000200u; + return ruby_package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void FileOptions::set_allocated_ruby_package(std::string* ruby_package) { + if (ruby_package != nullptr) { + _has_bits_[0] |= 0x00000200u; + } else { + _has_bits_[0] &= ~0x00000200u; + } + ruby_package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ruby_package, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.ruby_package) +} + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int FileOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int FileOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void FileOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FileOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* 
+FileOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FileOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FileOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FileOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FileOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.FileOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +FileOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.FileOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// MessageOptions + +// optional bool message_set_wire_format = 1 [default = false]; +inline bool MessageOptions::_internal_has_message_set_wire_format() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool MessageOptions::has_message_set_wire_format() const { + return _internal_has_message_set_wire_format(); +} +inline void MessageOptions::clear_message_set_wire_format() { + message_set_wire_format_ = false; + _has_bits_[0] &= ~0x00000001u; +} +inline bool MessageOptions::_internal_message_set_wire_format() const { + return message_set_wire_format_; +} +inline bool MessageOptions::message_set_wire_format() const { + // 
@@protoc_insertion_point(field_get:google.protobuf.MessageOptions.message_set_wire_format) + return _internal_message_set_wire_format(); +} +inline void MessageOptions::_internal_set_message_set_wire_format(bool value) { + _has_bits_[0] |= 0x00000001u; + message_set_wire_format_ = value; +} +inline void MessageOptions::set_message_set_wire_format(bool value) { + _internal_set_message_set_wire_format(value); + // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.message_set_wire_format) +} + +// optional bool no_standard_descriptor_accessor = 2 [default = false]; +inline bool MessageOptions::_internal_has_no_standard_descriptor_accessor() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool MessageOptions::has_no_standard_descriptor_accessor() const { + return _internal_has_no_standard_descriptor_accessor(); +} +inline void MessageOptions::clear_no_standard_descriptor_accessor() { + no_standard_descriptor_accessor_ = false; + _has_bits_[0] &= ~0x00000002u; +} +inline bool MessageOptions::_internal_no_standard_descriptor_accessor() const { + return no_standard_descriptor_accessor_; +} +inline bool MessageOptions::no_standard_descriptor_accessor() const { + // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.no_standard_descriptor_accessor) + return _internal_no_standard_descriptor_accessor(); +} +inline void MessageOptions::_internal_set_no_standard_descriptor_accessor(bool value) { + _has_bits_[0] |= 0x00000002u; + no_standard_descriptor_accessor_ = value; +} +inline void MessageOptions::set_no_standard_descriptor_accessor(bool value) { + _internal_set_no_standard_descriptor_accessor(value); + // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.no_standard_descriptor_accessor) +} + +// optional bool deprecated = 3 [default = false]; +inline bool MessageOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool 
MessageOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void MessageOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00000004u; +} +inline bool MessageOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool MessageOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.deprecated) + return _internal_deprecated(); +} +inline void MessageOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00000004u; + deprecated_ = value; +} +inline void MessageOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.deprecated) +} + +// optional bool map_entry = 7; +inline bool MessageOptions::_internal_has_map_entry() const { + bool value = (_has_bits_[0] & 0x00000008u) != 0; + return value; +} +inline bool MessageOptions::has_map_entry() const { + return _internal_has_map_entry(); +} +inline void MessageOptions::clear_map_entry() { + map_entry_ = false; + _has_bits_[0] &= ~0x00000008u; +} +inline bool MessageOptions::_internal_map_entry() const { + return map_entry_; +} +inline bool MessageOptions::map_entry() const { + // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.map_entry) + return _internal_map_entry(); +} +inline void MessageOptions::_internal_set_map_entry(bool value) { + _has_bits_[0] |= 0x00000008u; + map_entry_ = value; +} +inline void MessageOptions::set_map_entry(bool value) { + _internal_set_map_entry(value); + // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.map_entry) +} + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int MessageOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int MessageOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void 
MessageOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MessageOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.MessageOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +MessageOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.MessageOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MessageOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MessageOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MessageOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MessageOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.MessageOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +MessageOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.MessageOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// FieldOptions + +// optional .google.protobuf.FieldOptions.CType ctype = 1 [default = STRING]; +inline bool FieldOptions::_internal_has_ctype() const { + bool value = (_has_bits_[0] & 0x00000001u) != 
0; + return value; +} +inline bool FieldOptions::has_ctype() const { + return _internal_has_ctype(); +} +inline void FieldOptions::clear_ctype() { + ctype_ = 0; + _has_bits_[0] &= ~0x00000001u; +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions_CType FieldOptions::_internal_ctype() const { + return static_cast< PROTOBUF_NAMESPACE_ID::FieldOptions_CType >(ctype_); +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions_CType FieldOptions::ctype() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.ctype) + return _internal_ctype(); +} +inline void FieldOptions::_internal_set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value) { + assert(PROTOBUF_NAMESPACE_ID::FieldOptions_CType_IsValid(value)); + _has_bits_[0] |= 0x00000001u; + ctype_ = value; +} +inline void FieldOptions::set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value) { + _internal_set_ctype(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.ctype) +} + +// optional bool packed = 2; +inline bool FieldOptions::_internal_has_packed() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool FieldOptions::has_packed() const { + return _internal_has_packed(); +} +inline void FieldOptions::clear_packed() { + packed_ = false; + _has_bits_[0] &= ~0x00000002u; +} +inline bool FieldOptions::_internal_packed() const { + return packed_; +} +inline bool FieldOptions::packed() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.packed) + return _internal_packed(); +} +inline void FieldOptions::_internal_set_packed(bool value) { + _has_bits_[0] |= 0x00000002u; + packed_ = value; +} +inline void FieldOptions::set_packed(bool value) { + _internal_set_packed(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.packed) +} + +// optional .google.protobuf.FieldOptions.JSType jstype = 6 [default = JS_NORMAL]; +inline bool FieldOptions::_internal_has_jstype() const { + bool value = (_has_bits_[0] & 
0x00000020u) != 0; + return value; +} +inline bool FieldOptions::has_jstype() const { + return _internal_has_jstype(); +} +inline void FieldOptions::clear_jstype() { + jstype_ = 0; + _has_bits_[0] &= ~0x00000020u; +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions_JSType FieldOptions::_internal_jstype() const { + return static_cast< PROTOBUF_NAMESPACE_ID::FieldOptions_JSType >(jstype_); +} +inline PROTOBUF_NAMESPACE_ID::FieldOptions_JSType FieldOptions::jstype() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.jstype) + return _internal_jstype(); +} +inline void FieldOptions::_internal_set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value) { + assert(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType_IsValid(value)); + _has_bits_[0] |= 0x00000020u; + jstype_ = value; +} +inline void FieldOptions::set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value) { + _internal_set_jstype(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.jstype) +} + +// optional bool lazy = 5 [default = false]; +inline bool FieldOptions::_internal_has_lazy() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool FieldOptions::has_lazy() const { + return _internal_has_lazy(); +} +inline void FieldOptions::clear_lazy() { + lazy_ = false; + _has_bits_[0] &= ~0x00000004u; +} +inline bool FieldOptions::_internal_lazy() const { + return lazy_; +} +inline bool FieldOptions::lazy() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.lazy) + return _internal_lazy(); +} +inline void FieldOptions::_internal_set_lazy(bool value) { + _has_bits_[0] |= 0x00000004u; + lazy_ = value; +} +inline void FieldOptions::set_lazy(bool value) { + _internal_set_lazy(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.lazy) +} + +// optional bool deprecated = 3 [default = false]; +inline bool FieldOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 
0x00000008u) != 0; + return value; +} +inline bool FieldOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void FieldOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00000008u; +} +inline bool FieldOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool FieldOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.deprecated) + return _internal_deprecated(); +} +inline void FieldOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00000008u; + deprecated_ = value; +} +inline void FieldOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.deprecated) +} + +// optional bool weak = 10 [default = false]; +inline bool FieldOptions::_internal_has_weak() const { + bool value = (_has_bits_[0] & 0x00000010u) != 0; + return value; +} +inline bool FieldOptions::has_weak() const { + return _internal_has_weak(); +} +inline void FieldOptions::clear_weak() { + weak_ = false; + _has_bits_[0] &= ~0x00000010u; +} +inline bool FieldOptions::_internal_weak() const { + return weak_; +} +inline bool FieldOptions::weak() const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.weak) + return _internal_weak(); +} +inline void FieldOptions::_internal_set_weak(bool value) { + _has_bits_[0] |= 0x00000010u; + weak_ = value; +} +inline void FieldOptions::set_weak(bool value) { + _internal_set_weak(value); + // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.weak) +} + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int FieldOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int FieldOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void FieldOptions::clear_uninterpreted_option() { + 
uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FieldOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +FieldOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FieldOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FieldOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FieldOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FieldOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FieldOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.FieldOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +FieldOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.FieldOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// OneofOptions + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int OneofOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int OneofOptions::uninterpreted_option_size() const { + 
return _internal_uninterpreted_option_size(); +} +inline void OneofOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* OneofOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.OneofOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +OneofOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.OneofOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& OneofOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& OneofOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.OneofOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* OneofOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* OneofOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.OneofOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +OneofOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.OneofOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// EnumOptions + +// optional bool allow_alias = 2; +inline bool EnumOptions::_internal_has_allow_alias() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; 
+ return value; +} +inline bool EnumOptions::has_allow_alias() const { + return _internal_has_allow_alias(); +} +inline void EnumOptions::clear_allow_alias() { + allow_alias_ = false; + _has_bits_[0] &= ~0x00000001u; +} +inline bool EnumOptions::_internal_allow_alias() const { + return allow_alias_; +} +inline bool EnumOptions::allow_alias() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumOptions.allow_alias) + return _internal_allow_alias(); +} +inline void EnumOptions::_internal_set_allow_alias(bool value) { + _has_bits_[0] |= 0x00000001u; + allow_alias_ = value; +} +inline void EnumOptions::set_allow_alias(bool value) { + _internal_set_allow_alias(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumOptions.allow_alias) +} + +// optional bool deprecated = 3 [default = false]; +inline bool EnumOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool EnumOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void EnumOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00000002u; +} +inline bool EnumOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool EnumOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumOptions.deprecated) + return _internal_deprecated(); +} +inline void EnumOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00000002u; + deprecated_ = value; +} +inline void EnumOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumOptions.deprecated) +} + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int EnumOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int EnumOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} 
+inline void EnumOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +EnumOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.EnumOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +EnumOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.EnumOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// EnumValueOptions + +// optional bool deprecated = 1 [default = false]; +inline bool EnumValueOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool 
EnumValueOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void EnumValueOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00000001u; +} +inline bool EnumValueOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool EnumValueOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumValueOptions.deprecated) + return _internal_deprecated(); +} +inline void EnumValueOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00000001u; + deprecated_ = value; +} +inline void EnumValueOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.EnumValueOptions.deprecated) +} + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int EnumValueOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int EnumValueOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void EnumValueOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumValueOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.EnumValueOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +EnumValueOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumValueOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumValueOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& 
EnumValueOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.EnumValueOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumValueOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumValueOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.EnumValueOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +EnumValueOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.EnumValueOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// ServiceOptions + +// optional bool deprecated = 33 [default = false]; +inline bool ServiceOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool ServiceOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void ServiceOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00000001u; +} +inline bool ServiceOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool ServiceOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.ServiceOptions.deprecated) + return _internal_deprecated(); +} +inline void ServiceOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00000001u; + deprecated_ = value; +} +inline void ServiceOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.ServiceOptions.deprecated) +} + +// repeated .google.protobuf.UninterpretedOption 
uninterpreted_option = 999; +inline int ServiceOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int ServiceOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void ServiceOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ServiceOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +ServiceOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.ServiceOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ServiceOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ServiceOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.ServiceOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ServiceOptions::_internal_add_uninterpreted_option() { + return uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ServiceOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.ServiceOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +ServiceOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.ServiceOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// 
------------------------------------------------------------------- + +// MethodOptions + +// optional bool deprecated = 33 [default = false]; +inline bool MethodOptions::_internal_has_deprecated() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool MethodOptions::has_deprecated() const { + return _internal_has_deprecated(); +} +inline void MethodOptions::clear_deprecated() { + deprecated_ = false; + _has_bits_[0] &= ~0x00000001u; +} +inline bool MethodOptions::_internal_deprecated() const { + return deprecated_; +} +inline bool MethodOptions::deprecated() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodOptions.deprecated) + return _internal_deprecated(); +} +inline void MethodOptions::_internal_set_deprecated(bool value) { + _has_bits_[0] |= 0x00000001u; + deprecated_ = value; +} +inline void MethodOptions::set_deprecated(bool value) { + _internal_set_deprecated(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodOptions.deprecated) +} + +// optional .google.protobuf.MethodOptions.IdempotencyLevel idempotency_level = 34 [default = IDEMPOTENCY_UNKNOWN]; +inline bool MethodOptions::_internal_has_idempotency_level() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool MethodOptions::has_idempotency_level() const { + return _internal_has_idempotency_level(); +} +inline void MethodOptions::clear_idempotency_level() { + idempotency_level_ = 0; + _has_bits_[0] &= ~0x00000002u; +} +inline PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel MethodOptions::_internal_idempotency_level() const { + return static_cast< PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel >(idempotency_level_); +} +inline PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel MethodOptions::idempotency_level() const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodOptions.idempotency_level) + return _internal_idempotency_level(); +} +inline void 
MethodOptions::_internal_set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value) { + assert(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel_IsValid(value)); + _has_bits_[0] |= 0x00000002u; + idempotency_level_ = value; +} +inline void MethodOptions::set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value) { + _internal_set_idempotency_level(value); + // @@protoc_insertion_point(field_set:google.protobuf.MethodOptions.idempotency_level) +} + +// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; +inline int MethodOptions::_internal_uninterpreted_option_size() const { + return uninterpreted_option_.size(); +} +inline int MethodOptions::uninterpreted_option_size() const { + return _internal_uninterpreted_option_size(); +} +inline void MethodOptions::clear_uninterpreted_option() { + uninterpreted_option_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MethodOptions::mutable_uninterpreted_option(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.MethodOptions.uninterpreted_option) + return uninterpreted_option_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >* +MethodOptions::mutable_uninterpreted_option() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.MethodOptions.uninterpreted_option) + return &uninterpreted_option_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MethodOptions::_internal_uninterpreted_option(int index) const { + return uninterpreted_option_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MethodOptions::uninterpreted_option(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.MethodOptions.uninterpreted_option) + return _internal_uninterpreted_option(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MethodOptions::_internal_add_uninterpreted_option() { + return 
uninterpreted_option_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MethodOptions::add_uninterpreted_option() { + // @@protoc_insertion_point(field_add:google.protobuf.MethodOptions.uninterpreted_option) + return _internal_add_uninterpreted_option(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >& +MethodOptions::uninterpreted_option() const { + // @@protoc_insertion_point(field_list:google.protobuf.MethodOptions.uninterpreted_option) + return uninterpreted_option_; +} + +// ------------------------------------------------------------------- + +// UninterpretedOption_NamePart + +// required string name_part = 1; +inline bool UninterpretedOption_NamePart::_internal_has_name_part() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool UninterpretedOption_NamePart::has_name_part() const { + return _internal_has_name_part(); +} +inline void UninterpretedOption_NamePart::clear_name_part() { + name_part_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& UninterpretedOption_NamePart::name_part() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.NamePart.name_part) + return _internal_name_part(); +} +inline void UninterpretedOption_NamePart::set_name_part(const std::string& value) { + _internal_set_name_part(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.NamePart.name_part) +} +inline std::string* UninterpretedOption_NamePart::mutable_name_part() { + // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.NamePart.name_part) + return _internal_mutable_name_part(); +} +inline const std::string& UninterpretedOption_NamePart::_internal_name_part() const { + return name_part_.Get(); +} +inline void UninterpretedOption_NamePart::_internal_set_name_part(const std::string& 
value) { + _has_bits_[0] |= 0x00000001u; + name_part_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void UninterpretedOption_NamePart::set_name_part(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + name_part_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.NamePart.name_part) +} +inline void UninterpretedOption_NamePart::set_name_part(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + name_part_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.NamePart.name_part) +} +inline void UninterpretedOption_NamePart::set_name_part(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + name_part_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.NamePart.name_part) +} +inline std::string* UninterpretedOption_NamePart::_internal_mutable_name_part() { + _has_bits_[0] |= 0x00000001u; + return name_part_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* UninterpretedOption_NamePart::release_name_part() { + // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.NamePart.name_part) + if (!_internal_has_name_part()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return name_part_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void UninterpretedOption_NamePart::set_allocated_name_part(std::string* name_part) { + if (name_part != nullptr) { + _has_bits_[0] |= 0x00000001u; 
+ } else { + _has_bits_[0] &= ~0x00000001u; + } + name_part_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name_part, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.NamePart.name_part) +} + +// required bool is_extension = 2; +inline bool UninterpretedOption_NamePart::_internal_has_is_extension() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool UninterpretedOption_NamePart::has_is_extension() const { + return _internal_has_is_extension(); +} +inline void UninterpretedOption_NamePart::clear_is_extension() { + is_extension_ = false; + _has_bits_[0] &= ~0x00000002u; +} +inline bool UninterpretedOption_NamePart::_internal_is_extension() const { + return is_extension_; +} +inline bool UninterpretedOption_NamePart::is_extension() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.NamePart.is_extension) + return _internal_is_extension(); +} +inline void UninterpretedOption_NamePart::_internal_set_is_extension(bool value) { + _has_bits_[0] |= 0x00000002u; + is_extension_ = value; +} +inline void UninterpretedOption_NamePart::set_is_extension(bool value) { + _internal_set_is_extension(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.NamePart.is_extension) +} + +// ------------------------------------------------------------------- + +// UninterpretedOption + +// repeated .google.protobuf.UninterpretedOption.NamePart name = 2; +inline int UninterpretedOption::_internal_name_size() const { + return name_.size(); +} +inline int UninterpretedOption::name_size() const { + return _internal_name_size(); +} +inline void UninterpretedOption::clear_name() { + name_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* UninterpretedOption::mutable_name(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.name) + return 
name_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >* +UninterpretedOption::mutable_name() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.UninterpretedOption.name) + return &name_; +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& UninterpretedOption::_internal_name(int index) const { + return name_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& UninterpretedOption::name(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.name) + return _internal_name(index); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* UninterpretedOption::_internal_add_name() { + return name_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* UninterpretedOption::add_name() { + // @@protoc_insertion_point(field_add:google.protobuf.UninterpretedOption.name) + return _internal_add_name(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >& +UninterpretedOption::name() const { + // @@protoc_insertion_point(field_list:google.protobuf.UninterpretedOption.name) + return name_; +} + +// optional string identifier_value = 3; +inline bool UninterpretedOption::_internal_has_identifier_value() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool UninterpretedOption::has_identifier_value() const { + return _internal_has_identifier_value(); +} +inline void UninterpretedOption::clear_identifier_value() { + identifier_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& UninterpretedOption::identifier_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.identifier_value) + return _internal_identifier_value(); +} +inline void 
UninterpretedOption::set_identifier_value(const std::string& value) { + _internal_set_identifier_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.identifier_value) +} +inline std::string* UninterpretedOption::mutable_identifier_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.identifier_value) + return _internal_mutable_identifier_value(); +} +inline const std::string& UninterpretedOption::_internal_identifier_value() const { + return identifier_value_.Get(); +} +inline void UninterpretedOption::_internal_set_identifier_value(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + identifier_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void UninterpretedOption::set_identifier_value(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + identifier_value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.identifier_value) +} +inline void UninterpretedOption::set_identifier_value(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + identifier_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.identifier_value) +} +inline void UninterpretedOption::set_identifier_value(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + identifier_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.identifier_value) +} +inline std::string* UninterpretedOption::_internal_mutable_identifier_value() { + _has_bits_[0] |= 0x00000001u; + 
return identifier_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* UninterpretedOption::release_identifier_value() { + // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.identifier_value) + if (!_internal_has_identifier_value()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return identifier_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void UninterpretedOption::set_allocated_identifier_value(std::string* identifier_value) { + if (identifier_value != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + identifier_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), identifier_value, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.identifier_value) +} + +// optional uint64 positive_int_value = 4; +inline bool UninterpretedOption::_internal_has_positive_int_value() const { + bool value = (_has_bits_[0] & 0x00000008u) != 0; + return value; +} +inline bool UninterpretedOption::has_positive_int_value() const { + return _internal_has_positive_int_value(); +} +inline void UninterpretedOption::clear_positive_int_value() { + positive_int_value_ = PROTOBUF_ULONGLONG(0); + _has_bits_[0] &= ~0x00000008u; +} +inline ::PROTOBUF_NAMESPACE_ID::uint64 UninterpretedOption::_internal_positive_int_value() const { + return positive_int_value_; +} +inline ::PROTOBUF_NAMESPACE_ID::uint64 UninterpretedOption::positive_int_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.positive_int_value) + return _internal_positive_int_value(); +} +inline void UninterpretedOption::_internal_set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value) { + _has_bits_[0] |= 0x00000008u; + positive_int_value_ = value; +} +inline void 
UninterpretedOption::set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value) { + _internal_set_positive_int_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.positive_int_value) +} + +// optional int64 negative_int_value = 5; +inline bool UninterpretedOption::_internal_has_negative_int_value() const { + bool value = (_has_bits_[0] & 0x00000010u) != 0; + return value; +} +inline bool UninterpretedOption::has_negative_int_value() const { + return _internal_has_negative_int_value(); +} +inline void UninterpretedOption::clear_negative_int_value() { + negative_int_value_ = PROTOBUF_LONGLONG(0); + _has_bits_[0] &= ~0x00000010u; +} +inline ::PROTOBUF_NAMESPACE_ID::int64 UninterpretedOption::_internal_negative_int_value() const { + return negative_int_value_; +} +inline ::PROTOBUF_NAMESPACE_ID::int64 UninterpretedOption::negative_int_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.negative_int_value) + return _internal_negative_int_value(); +} +inline void UninterpretedOption::_internal_set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value) { + _has_bits_[0] |= 0x00000010u; + negative_int_value_ = value; +} +inline void UninterpretedOption::set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value) { + _internal_set_negative_int_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.negative_int_value) +} + +// optional double double_value = 6; +inline bool UninterpretedOption::_internal_has_double_value() const { + bool value = (_has_bits_[0] & 0x00000020u) != 0; + return value; +} +inline bool UninterpretedOption::has_double_value() const { + return _internal_has_double_value(); +} +inline void UninterpretedOption::clear_double_value() { + double_value_ = 0; + _has_bits_[0] &= ~0x00000020u; +} +inline double UninterpretedOption::_internal_double_value() const { + return double_value_; +} +inline double UninterpretedOption::double_value() 
const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.double_value) + return _internal_double_value(); +} +inline void UninterpretedOption::_internal_set_double_value(double value) { + _has_bits_[0] |= 0x00000020u; + double_value_ = value; +} +inline void UninterpretedOption::set_double_value(double value) { + _internal_set_double_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.double_value) +} + +// optional bytes string_value = 7; +inline bool UninterpretedOption::_internal_has_string_value() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool UninterpretedOption::has_string_value() const { + return _internal_has_string_value(); +} +inline void UninterpretedOption::clear_string_value() { + string_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000002u; +} +inline const std::string& UninterpretedOption::string_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.string_value) + return _internal_string_value(); +} +inline void UninterpretedOption::set_string_value(const std::string& value) { + _internal_set_string_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.string_value) +} +inline std::string* UninterpretedOption::mutable_string_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.string_value) + return _internal_mutable_string_value(); +} +inline const std::string& UninterpretedOption::_internal_string_value() const { + return string_value_.Get(); +} +inline void UninterpretedOption::_internal_set_string_value(const std::string& value) { + _has_bits_[0] |= 0x00000002u; + string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void UninterpretedOption::set_string_value(std::string&& value) { + 
_has_bits_[0] |= 0x00000002u; + string_value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.string_value) +} +inline void UninterpretedOption::set_string_value(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000002u; + string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.string_value) +} +inline void UninterpretedOption::set_string_value(const void* value, + size_t size) { + _has_bits_[0] |= 0x00000002u; + string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.string_value) +} +inline std::string* UninterpretedOption::_internal_mutable_string_value() { + _has_bits_[0] |= 0x00000002u; + return string_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* UninterpretedOption::release_string_value() { + // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.string_value) + if (!_internal_has_string_value()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000002u; + return string_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void UninterpretedOption::set_allocated_string_value(std::string* string_value) { + if (string_value != nullptr) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + string_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), string_value, + GetArena()); + // 
@@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.string_value) +} + +// optional string aggregate_value = 8; +inline bool UninterpretedOption::_internal_has_aggregate_value() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool UninterpretedOption::has_aggregate_value() const { + return _internal_has_aggregate_value(); +} +inline void UninterpretedOption::clear_aggregate_value() { + aggregate_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000004u; +} +inline const std::string& UninterpretedOption::aggregate_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.aggregate_value) + return _internal_aggregate_value(); +} +inline void UninterpretedOption::set_aggregate_value(const std::string& value) { + _internal_set_aggregate_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.aggregate_value) +} +inline std::string* UninterpretedOption::mutable_aggregate_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.aggregate_value) + return _internal_mutable_aggregate_value(); +} +inline const std::string& UninterpretedOption::_internal_aggregate_value() const { + return aggregate_value_.Get(); +} +inline void UninterpretedOption::_internal_set_aggregate_value(const std::string& value) { + _has_bits_[0] |= 0x00000004u; + aggregate_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void UninterpretedOption::set_aggregate_value(std::string&& value) { + _has_bits_[0] |= 0x00000004u; + aggregate_value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.aggregate_value) +} +inline void UninterpretedOption::set_aggregate_value(const 
char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000004u; + aggregate_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.aggregate_value) +} +inline void UninterpretedOption::set_aggregate_value(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000004u; + aggregate_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.aggregate_value) +} +inline std::string* UninterpretedOption::_internal_mutable_aggregate_value() { + _has_bits_[0] |= 0x00000004u; + return aggregate_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* UninterpretedOption::release_aggregate_value() { + // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.aggregate_value) + if (!_internal_has_aggregate_value()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000004u; + return aggregate_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void UninterpretedOption::set_allocated_aggregate_value(std::string* aggregate_value) { + if (aggregate_value != nullptr) { + _has_bits_[0] |= 0x00000004u; + } else { + _has_bits_[0] &= ~0x00000004u; + } + aggregate_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), aggregate_value, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.aggregate_value) +} + +// ------------------------------------------------------------------- + +// SourceCodeInfo_Location + +// repeated int32 path = 1 [packed = true]; +inline int SourceCodeInfo_Location::_internal_path_size() const { + return path_.size(); 
+} +inline int SourceCodeInfo_Location::path_size() const { + return _internal_path_size(); +} +inline void SourceCodeInfo_Location::clear_path() { + path_.Clear(); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::_internal_path(int index) const { + return path_.Get(index); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::path(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.path) + return _internal_path(index); +} +inline void SourceCodeInfo_Location::set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) { + path_.Set(index, value); + // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.path) +} +inline void SourceCodeInfo_Location::_internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value) { + path_.Add(value); +} +inline void SourceCodeInfo_Location::add_path(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_add_path(value); + // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.path) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +SourceCodeInfo_Location::_internal_path() const { + return path_; +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +SourceCodeInfo_Location::path() const { + // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.Location.path) + return _internal_path(); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +SourceCodeInfo_Location::_internal_mutable_path() { + return &path_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +SourceCodeInfo_Location::mutable_path() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.Location.path) + return _internal_mutable_path(); +} + +// repeated int32 span = 2 [packed = true]; +inline int SourceCodeInfo_Location::_internal_span_size() const { + return span_.size(); 
+} +inline int SourceCodeInfo_Location::span_size() const { + return _internal_span_size(); +} +inline void SourceCodeInfo_Location::clear_span() { + span_.Clear(); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::_internal_span(int index) const { + return span_.Get(index); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::span(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.span) + return _internal_span(index); +} +inline void SourceCodeInfo_Location::set_span(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) { + span_.Set(index, value); + // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.span) +} +inline void SourceCodeInfo_Location::_internal_add_span(::PROTOBUF_NAMESPACE_ID::int32 value) { + span_.Add(value); +} +inline void SourceCodeInfo_Location::add_span(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_add_span(value); + // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.span) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +SourceCodeInfo_Location::_internal_span() const { + return span_; +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +SourceCodeInfo_Location::span() const { + // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.Location.span) + return _internal_span(); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +SourceCodeInfo_Location::_internal_mutable_span() { + return &span_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +SourceCodeInfo_Location::mutable_span() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.Location.span) + return _internal_mutable_span(); +} + +// optional string leading_comments = 3; +inline bool SourceCodeInfo_Location::_internal_has_leading_comments() const { + bool value = 
(_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool SourceCodeInfo_Location::has_leading_comments() const { + return _internal_has_leading_comments(); +} +inline void SourceCodeInfo_Location::clear_leading_comments() { + leading_comments_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& SourceCodeInfo_Location::leading_comments() const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.leading_comments) + return _internal_leading_comments(); +} +inline void SourceCodeInfo_Location::set_leading_comments(const std::string& value) { + _internal_set_leading_comments(value); + // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.leading_comments) +} +inline std::string* SourceCodeInfo_Location::mutable_leading_comments() { + // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.Location.leading_comments) + return _internal_mutable_leading_comments(); +} +inline const std::string& SourceCodeInfo_Location::_internal_leading_comments() const { + return leading_comments_.Get(); +} +inline void SourceCodeInfo_Location::_internal_set_leading_comments(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + leading_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void SourceCodeInfo_Location::set_leading_comments(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + leading_comments_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.SourceCodeInfo.Location.leading_comments) +} +inline void SourceCodeInfo_Location::set_leading_comments(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + 
leading_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.SourceCodeInfo.Location.leading_comments) +} +inline void SourceCodeInfo_Location::set_leading_comments(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + leading_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceCodeInfo.Location.leading_comments) +} +inline std::string* SourceCodeInfo_Location::_internal_mutable_leading_comments() { + _has_bits_[0] |= 0x00000001u; + return leading_comments_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* SourceCodeInfo_Location::release_leading_comments() { + // @@protoc_insertion_point(field_release:google.protobuf.SourceCodeInfo.Location.leading_comments) + if (!_internal_has_leading_comments()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return leading_comments_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void SourceCodeInfo_Location::set_allocated_leading_comments(std::string* leading_comments) { + if (leading_comments != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + leading_comments_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), leading_comments, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.SourceCodeInfo.Location.leading_comments) +} + +// optional string trailing_comments = 4; +inline bool SourceCodeInfo_Location::_internal_has_trailing_comments() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return value; +} +inline bool SourceCodeInfo_Location::has_trailing_comments() const { + return 
_internal_has_trailing_comments(); +} +inline void SourceCodeInfo_Location::clear_trailing_comments() { + trailing_comments_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000002u; +} +inline const std::string& SourceCodeInfo_Location::trailing_comments() const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.trailing_comments) + return _internal_trailing_comments(); +} +inline void SourceCodeInfo_Location::set_trailing_comments(const std::string& value) { + _internal_set_trailing_comments(value); + // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.trailing_comments) +} +inline std::string* SourceCodeInfo_Location::mutable_trailing_comments() { + // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.Location.trailing_comments) + return _internal_mutable_trailing_comments(); +} +inline const std::string& SourceCodeInfo_Location::_internal_trailing_comments() const { + return trailing_comments_.Get(); +} +inline void SourceCodeInfo_Location::_internal_set_trailing_comments(const std::string& value) { + _has_bits_[0] |= 0x00000002u; + trailing_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void SourceCodeInfo_Location::set_trailing_comments(std::string&& value) { + _has_bits_[0] |= 0x00000002u; + trailing_comments_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.SourceCodeInfo.Location.trailing_comments) +} +inline void SourceCodeInfo_Location::set_trailing_comments(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000002u; + trailing_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // 
@@protoc_insertion_point(field_set_char:google.protobuf.SourceCodeInfo.Location.trailing_comments) +} +inline void SourceCodeInfo_Location::set_trailing_comments(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000002u; + trailing_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceCodeInfo.Location.trailing_comments) +} +inline std::string* SourceCodeInfo_Location::_internal_mutable_trailing_comments() { + _has_bits_[0] |= 0x00000002u; + return trailing_comments_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* SourceCodeInfo_Location::release_trailing_comments() { + // @@protoc_insertion_point(field_release:google.protobuf.SourceCodeInfo.Location.trailing_comments) + if (!_internal_has_trailing_comments()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000002u; + return trailing_comments_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void SourceCodeInfo_Location::set_allocated_trailing_comments(std::string* trailing_comments) { + if (trailing_comments != nullptr) { + _has_bits_[0] |= 0x00000002u; + } else { + _has_bits_[0] &= ~0x00000002u; + } + trailing_comments_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), trailing_comments, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.SourceCodeInfo.Location.trailing_comments) +} + +// repeated string leading_detached_comments = 6; +inline int SourceCodeInfo_Location::_internal_leading_detached_comments_size() const { + return leading_detached_comments_.size(); +} +inline int SourceCodeInfo_Location::leading_detached_comments_size() const { + return _internal_leading_detached_comments_size(); +} +inline void 
SourceCodeInfo_Location::clear_leading_detached_comments() { + leading_detached_comments_.Clear(); +} +inline std::string* SourceCodeInfo_Location::add_leading_detached_comments() { + // @@protoc_insertion_point(field_add_mutable:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + return _internal_add_leading_detached_comments(); +} +inline const std::string& SourceCodeInfo_Location::_internal_leading_detached_comments(int index) const { + return leading_detached_comments_.Get(index); +} +inline const std::string& SourceCodeInfo_Location::leading_detached_comments(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + return _internal_leading_detached_comments(index); +} +inline std::string* SourceCodeInfo_Location::mutable_leading_detached_comments(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + return leading_detached_comments_.Mutable(index); +} +inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, const std::string& value) { + // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + leading_detached_comments_.Mutable(index)->assign(value); +} +inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, std::string&& value) { + // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + leading_detached_comments_.Mutable(index)->assign(std::move(value)); +} +inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, const char* value) { + GOOGLE_DCHECK(value != nullptr); + leading_detached_comments_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) +} +inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, const char* value, 
size_t size) { + leading_detached_comments_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) +} +inline std::string* SourceCodeInfo_Location::_internal_add_leading_detached_comments() { + return leading_detached_comments_.Add(); +} +inline void SourceCodeInfo_Location::add_leading_detached_comments(const std::string& value) { + leading_detached_comments_.Add()->assign(value); + // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) +} +inline void SourceCodeInfo_Location::add_leading_detached_comments(std::string&& value) { + leading_detached_comments_.Add(std::move(value)); + // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) +} +inline void SourceCodeInfo_Location::add_leading_detached_comments(const char* value) { + GOOGLE_DCHECK(value != nullptr); + leading_detached_comments_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) +} +inline void SourceCodeInfo_Location::add_leading_detached_comments(const char* value, size_t size) { + leading_detached_comments_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& +SourceCodeInfo_Location::leading_detached_comments() const { + // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + return leading_detached_comments_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* +SourceCodeInfo_Location::mutable_leading_detached_comments() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.Location.leading_detached_comments) + return &leading_detached_comments_; +} + +// 
------------------------------------------------------------------- + +// SourceCodeInfo + +// repeated .google.protobuf.SourceCodeInfo.Location location = 1; +inline int SourceCodeInfo::_internal_location_size() const { + return location_.size(); +} +inline int SourceCodeInfo::location_size() const { + return _internal_location_size(); +} +inline void SourceCodeInfo::clear_location() { + location_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* SourceCodeInfo::mutable_location(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.location) + return location_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >* +SourceCodeInfo::mutable_location() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.location) + return &location_; +} +inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& SourceCodeInfo::_internal_location(int index) const { + return location_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& SourceCodeInfo::location(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.location) + return _internal_location(index); +} +inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* SourceCodeInfo::_internal_add_location() { + return location_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* SourceCodeInfo::add_location() { + // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.location) + return _internal_add_location(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >& +SourceCodeInfo::location() const { + // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.location) + return location_; +} + +// ------------------------------------------------------------------- + +// GeneratedCodeInfo_Annotation + +// repeated int32 path = 1 [packed 
= true]; +inline int GeneratedCodeInfo_Annotation::_internal_path_size() const { + return path_.size(); +} +inline int GeneratedCodeInfo_Annotation::path_size() const { + return _internal_path_size(); +} +inline void GeneratedCodeInfo_Annotation::clear_path() { + path_.Clear(); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::_internal_path(int index) const { + return path_.Get(index); +} +inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::path(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.path) + return _internal_path(index); +} +inline void GeneratedCodeInfo_Annotation::set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) { + path_.Set(index, value); + // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.path) +} +inline void GeneratedCodeInfo_Annotation::_internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value) { + path_.Add(value); +} +inline void GeneratedCodeInfo_Annotation::add_path(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_add_path(value); + // @@protoc_insertion_point(field_add:google.protobuf.GeneratedCodeInfo.Annotation.path) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +GeneratedCodeInfo_Annotation::_internal_path() const { + return path_; +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& +GeneratedCodeInfo_Annotation::path() const { + // @@protoc_insertion_point(field_list:google.protobuf.GeneratedCodeInfo.Annotation.path) + return _internal_path(); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +GeneratedCodeInfo_Annotation::_internal_mutable_path() { + return &path_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* +GeneratedCodeInfo_Annotation::mutable_path() { + // 
@@protoc_insertion_point(field_mutable_list:google.protobuf.GeneratedCodeInfo.Annotation.path) + return _internal_mutable_path(); +} + +// optional string source_file = 2; +inline bool GeneratedCodeInfo_Annotation::_internal_has_source_file() const { + bool value = (_has_bits_[0] & 0x00000001u) != 0; + return value; +} +inline bool GeneratedCodeInfo_Annotation::has_source_file() const { + return _internal_has_source_file(); +} +inline void GeneratedCodeInfo_Annotation::clear_source_file() { + source_file_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + _has_bits_[0] &= ~0x00000001u; +} +inline const std::string& GeneratedCodeInfo_Annotation::source_file() const { + // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.source_file) + return _internal_source_file(); +} +inline void GeneratedCodeInfo_Annotation::set_source_file(const std::string& value) { + _internal_set_source_file(value); + // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.source_file) +} +inline std::string* GeneratedCodeInfo_Annotation::mutable_source_file() { + // @@protoc_insertion_point(field_mutable:google.protobuf.GeneratedCodeInfo.Annotation.source_file) + return _internal_mutable_source_file(); +} +inline const std::string& GeneratedCodeInfo_Annotation::_internal_source_file() const { + return source_file_.Get(); +} +inline void GeneratedCodeInfo_Annotation::_internal_set_source_file(const std::string& value) { + _has_bits_[0] |= 0x00000001u; + source_file_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void GeneratedCodeInfo_Annotation::set_source_file(std::string&& value) { + _has_bits_[0] |= 0x00000001u; + source_file_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.GeneratedCodeInfo.Annotation.source_file) 
+} +inline void GeneratedCodeInfo_Annotation::set_source_file(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000001u; + source_file_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.GeneratedCodeInfo.Annotation.source_file) +} +inline void GeneratedCodeInfo_Annotation::set_source_file(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000001u; + source_file_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.GeneratedCodeInfo.Annotation.source_file) +} +inline std::string* GeneratedCodeInfo_Annotation::_internal_mutable_source_file() { + _has_bits_[0] |= 0x00000001u; + return source_file_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* GeneratedCodeInfo_Annotation::release_source_file() { + // @@protoc_insertion_point(field_release:google.protobuf.GeneratedCodeInfo.Annotation.source_file) + if (!_internal_has_source_file()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000001u; + return source_file_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void GeneratedCodeInfo_Annotation::set_allocated_source_file(std::string* source_file) { + if (source_file != nullptr) { + _has_bits_[0] |= 0x00000001u; + } else { + _has_bits_[0] &= ~0x00000001u; + } + source_file_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), source_file, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.GeneratedCodeInfo.Annotation.source_file) +} + +// optional int32 begin = 3; +inline bool GeneratedCodeInfo_Annotation::_internal_has_begin() const { + bool value = (_has_bits_[0] & 0x00000002u) != 0; + return 
value; +} +inline bool GeneratedCodeInfo_Annotation::has_begin() const { + return _internal_has_begin(); +} +inline void GeneratedCodeInfo_Annotation::clear_begin() { + begin_ = 0; + _has_bits_[0] &= ~0x00000002u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::_internal_begin() const { + return begin_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::begin() const { + // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.begin) + return _internal_begin(); +} +inline void GeneratedCodeInfo_Annotation::_internal_set_begin(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000002u; + begin_ = value; +} +inline void GeneratedCodeInfo_Annotation::set_begin(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_begin(value); + // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.begin) +} + +// optional int32 end = 4; +inline bool GeneratedCodeInfo_Annotation::_internal_has_end() const { + bool value = (_has_bits_[0] & 0x00000004u) != 0; + return value; +} +inline bool GeneratedCodeInfo_Annotation::has_end() const { + return _internal_has_end(); +} +inline void GeneratedCodeInfo_Annotation::clear_end() { + end_ = 0; + _has_bits_[0] &= ~0x00000004u; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::_internal_end() const { + return end_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::end() const { + // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.end) + return _internal_end(); +} +inline void GeneratedCodeInfo_Annotation::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _has_bits_[0] |= 0x00000004u; + end_ = value; +} +inline void GeneratedCodeInfo_Annotation::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_end(value); + // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.end) +} + +// 
------------------------------------------------------------------- + +// GeneratedCodeInfo + +// repeated .google.protobuf.GeneratedCodeInfo.Annotation annotation = 1; +inline int GeneratedCodeInfo::_internal_annotation_size() const { + return annotation_.size(); +} +inline int GeneratedCodeInfo::annotation_size() const { + return _internal_annotation_size(); +} +inline void GeneratedCodeInfo::clear_annotation() { + annotation_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* GeneratedCodeInfo::mutable_annotation(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.GeneratedCodeInfo.annotation) + return annotation_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >* +GeneratedCodeInfo::mutable_annotation() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.GeneratedCodeInfo.annotation) + return &annotation_; +} +inline const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& GeneratedCodeInfo::_internal_annotation(int index) const { + return annotation_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& GeneratedCodeInfo::annotation(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.annotation) + return _internal_annotation(index); +} +inline PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* GeneratedCodeInfo::_internal_add_annotation() { + return annotation_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* GeneratedCodeInfo::add_annotation() { + // @@protoc_insertion_point(field_add:google.protobuf.GeneratedCodeInfo.annotation) + return _internal_add_annotation(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >& +GeneratedCodeInfo::annotation() const { + // @@protoc_insertion_point(field_list:google.protobuf.GeneratedCodeInfo.annotation) + return annotation_; +} + 
+#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + + +// 
@@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +PROTOBUF_NAMESPACE_OPEN + +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type>() { + return PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type_descriptor(); +} +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label>() { + return PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label_descriptor(); +} +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode>() { + return PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode_descriptor(); +} +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldOptions_CType> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldOptions_CType>() { + return PROTOBUF_NAMESPACE_ID::FieldOptions_CType_descriptor(); +} +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldOptions_JSType> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldOptions_JSType>() { + return PROTOBUF_NAMESPACE_ID::FieldOptions_JSType_descriptor(); +} +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel>() { + return PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel_descriptor(); +} + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // 
GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fdescriptor_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/duration.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/duration.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..769fd2ba6f068eb8d94e18fdbc7f97452d9a9e06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/duration.pb.h @@ -0,0 +1,282 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: google/protobuf/duration.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fduration_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fduration_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fduration_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. +struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fduration_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fduration_2eproto; +PROTOBUF_NAMESPACE_OPEN +class Duration; +class DurationDefaultTypeInternal; +PROTOBUF_EXPORT extern DurationDefaultTypeInternal _Duration_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Duration* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT Duration PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Duration) */ { + public: + inline Duration() : Duration(nullptr) {} + virtual ~Duration(); + + 
Duration(const Duration& from); + Duration(Duration&& from) noexcept + : Duration() { + *this = ::std::move(from); + } + + inline Duration& operator=(const Duration& from) { + CopyFrom(from); + return *this; + } + inline Duration& operator=(Duration&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Duration& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Duration* internal_default_instance() { + return reinterpret_cast( + &_Duration_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(Duration& a, Duration& b) { + a.Swap(&b); + } + inline void Swap(Duration* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Duration* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Duration* New() const final { + return CreateMaybeMessage(nullptr); + } + + Duration* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Duration& from); + void MergeFrom(const Duration& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool 
IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Duration* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Duration"; + } + protected: + explicit Duration(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fduration_2eproto); + return ::descriptor_table_google_2fprotobuf_2fduration_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kSecondsFieldNumber = 1, + kNanosFieldNumber = 2, + }; + // int64 seconds = 1; + void clear_seconds(); + ::PROTOBUF_NAMESPACE_ID::int64 seconds() const; + void set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value); + private: + ::PROTOBUF_NAMESPACE_ID::int64 _internal_seconds() const; + void _internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value); + public: + + // int32 nanos = 2; + void clear_nanos(); + ::PROTOBUF_NAMESPACE_ID::int32 nanos() const; + void set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value); + private: 
+ ::PROTOBUF_NAMESPACE_ID::int32 _internal_nanos() const; + void _internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.Duration) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::int64 seconds_; + ::PROTOBUF_NAMESPACE_ID::int32 nanos_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fduration_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// Duration + +// int64 seconds = 1; +inline void Duration::clear_seconds() { + seconds_ = PROTOBUF_LONGLONG(0); +} +inline ::PROTOBUF_NAMESPACE_ID::int64 Duration::_internal_seconds() const { + return seconds_; +} +inline ::PROTOBUF_NAMESPACE_ID::int64 Duration::seconds() const { + // @@protoc_insertion_point(field_get:google.protobuf.Duration.seconds) + return _internal_seconds(); +} +inline void Duration::_internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) { + + seconds_ = value; +} +inline void Duration::set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) { + _internal_set_seconds(value); + // @@protoc_insertion_point(field_set:google.protobuf.Duration.seconds) +} + +// int32 nanos = 2; +inline void Duration::clear_nanos() { + nanos_ = 0; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 Duration::_internal_nanos() const { + return nanos_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 Duration::nanos() const { + // @@protoc_insertion_point(field_get:google.protobuf.Duration.nanos) + return _internal_nanos(); +} +inline void Duration::_internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) { + + 
nanos_ = value; +} +inline void Duration::set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_nanos(value); + // @@protoc_insertion_point(field_set:google.protobuf.Duration.nanos) +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fduration_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/empty.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/empty.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..9ccde906f78b2181cd9aeaa3733bdb7fc3b3dc67 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/empty.pb.h @@ -0,0 +1,218 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: google/protobuf/empty.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fempty_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fempty_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fempty_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. +struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fempty_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fempty_2eproto; +PROTOBUF_NAMESPACE_OPEN +class Empty; +class EmptyDefaultTypeInternal; +PROTOBUF_EXPORT extern EmptyDefaultTypeInternal _Empty_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Empty* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT Empty PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Empty) */ { + public: + inline Empty() : Empty(nullptr) {} + virtual ~Empty(); + + Empty(const Empty& from); + Empty(Empty&& 
from) noexcept + : Empty() { + *this = ::std::move(from); + } + + inline Empty& operator=(const Empty& from) { + CopyFrom(from); + return *this; + } + inline Empty& operator=(Empty&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Empty& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Empty* internal_default_instance() { + return reinterpret_cast( + &_Empty_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(Empty& a, Empty& b) { + a.Swap(&b); + } + inline void Swap(Empty* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Empty* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Empty* New() const final { + return CreateMaybeMessage(nullptr); + } + + Empty* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Empty& from); + void MergeFrom(const Empty& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const 
char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Empty* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Empty"; + } + protected: + explicit Empty(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fempty_2eproto); + return ::descriptor_table_google_2fprotobuf_2fempty_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // @@protoc_insertion_point(class_scope:google.protobuf.Empty) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fempty_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // 
__GNUC__ +// Empty + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fempty_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_reflection.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_reflection.h new file mode 100644 index 0000000000000000000000000000000000000000..64257d58ffef9d1094a797b0ec9ce315315ee42f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_reflection.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: jasonh@google.com (Jason Hsueh) +// +// This header is logically internal, but is made public because it is used +// from protocol-compiler-generated code, which may reside in other components. +// It provides reflection support for generated enums, and is included in +// generated .pb.h files and should have minimal dependencies. The methods are +// implemented in generated_message_reflection.cc. + +#ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ +#define GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ + +#include + +#include +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +namespace google { +namespace protobuf { +class EnumDescriptor; +} // namespace protobuf +} // namespace google + +namespace google { +namespace protobuf { + +// Returns the EnumDescriptor for enum type E, which must be a +// proto-declared enum type. Code generated by the protocol compiler +// will include specializations of this template for each enum type declared. 
+template +const EnumDescriptor* GetEnumDescriptor(); + +namespace internal { + +// Helper for EnumType_Parse functions: try to parse the string 'name' as +// an enum name of the given type, returning true and filling in value on +// success, or returning false and leaving value unchanged on failure. +PROTOBUF_EXPORT bool ParseNamedEnum(const EnumDescriptor* descriptor, + ConstStringParam name, int* value); + +template +bool ParseNamedEnum(const EnumDescriptor* descriptor, ConstStringParam name, + EnumType* value) { + int tmp; + if (!ParseNamedEnum(descriptor, name, &tmp)) return false; + *value = static_cast(tmp); + return true; +} + +// Just a wrapper around printing the name of a value. The main point of this +// function is not to be inlined, so that you can do this without including +// descriptor.h. +PROTOBUF_EXPORT const std::string& NameOfEnum(const EnumDescriptor* descriptor, + int value); + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_util.h new file mode 100644 index 0000000000000000000000000000000000000000..45f5083336bebfda4e5dd65dcb1e68c9e6196daf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_util.h @@ -0,0 +1,88 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ +#define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ + +#include + +#include +#include + +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { + +// This type trait can be used to cause templates to only match proto2 enum +// types. 
+template +struct is_proto_enum : ::std::false_type {}; + +namespace internal { + +// The table entry format for storing enum name-to-value mapping used with lite +// protos. This struct and the following related functions should only be used +// by protobuf generated code. +struct EnumEntry { + StringPiece name; + int value; +}; + +// Looks up a numeric enum value given the string name. +PROTOBUF_EXPORT bool LookUpEnumValue(const EnumEntry* enums, size_t size, + StringPiece name, int* value); + +// Looks up an enum name given the numeric value. +PROTOBUF_EXPORT int LookUpEnumName(const EnumEntry* enums, + const int* sorted_indices, size_t size, + int value); + +// Initializes the list of enum names in std::string form. +PROTOBUF_EXPORT bool InitializeEnumStrings( + const EnumEntry* enums, const int* sorted_indices, size_t size, + internal::ExplicitlyConstructed* enum_strings); + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_util.h new file mode 100644 index 0000000000000000000000000000000000000000..4b68e93b9b782a74eefcb0cddf844ec0f6a4da8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_util.h @@ -0,0 +1,265 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This file contains miscellaneous helper code used by generated code -- +// including lite types -- but which should not be used directly by users. 
+ +#ifndef GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__ +#define GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include // Add direct dep on port for pb.cc +#include +#include +#include +#include +#include + +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { + +class Arena; +class Message; + +namespace io { +class CodedInputStream; +} + +namespace internal { + +template +inline To DownCast(From* f) { + return PROTOBUF_NAMESPACE_ID::internal::down_cast(f); +} +template +inline To DownCast(From& f) { + return PROTOBUF_NAMESPACE_ID::internal::down_cast(f); +} + + +PROTOBUF_EXPORT void InitProtobufDefaults(); + +// This used by proto1 +PROTOBUF_EXPORT inline const std::string& GetEmptyString() { + InitProtobufDefaults(); + return GetEmptyStringAlreadyInited(); +} + + +// True if IsInitialized() is true for all elements of t. Type is expected +// to be a RepeatedPtrField. It's useful to have this +// helper here to keep the protobuf compiler from ever having to emit loops in +// IsInitialized() methods. We want the C++ compiler to inline this or not +// as it sees fit. +template +bool AllAreInitialized(const RepeatedPtrField& t) { + for (int i = t.size(); --i >= 0;) { + if (!t.Get(i).IsInitialized()) return false; + } + return true; +} + +// "Weak" variant of AllAreInitialized, used to implement implicit weak fields. +// This version operates on MessageLite to avoid introducing a dependency on the +// concrete message type. 
+template +bool AllAreInitializedWeak(const RepeatedPtrField& t) { + for (int i = t.size(); --i >= 0;) { + if (!reinterpret_cast(t) + .Get >(i) + .IsInitialized()) { + return false; + } + } + return true; +} + +inline bool IsPresent(const void* base, uint32 hasbit) { + const uint32* has_bits_array = static_cast(base); + return (has_bits_array[hasbit / 32] & (1u << (hasbit & 31))) != 0; +} + +inline bool IsOneofPresent(const void* base, uint32 offset, uint32 tag) { + const uint32* oneof = + reinterpret_cast(static_cast(base) + offset); + return *oneof == tag >> 3; +} + +typedef void (*SpecialSerializer)(const uint8* base, uint32 offset, uint32 tag, + uint32 has_offset, + io::CodedOutputStream* output); + +PROTOBUF_EXPORT void ExtensionSerializer(const uint8* base, uint32 offset, + uint32 tag, uint32 has_offset, + io::CodedOutputStream* output); +PROTOBUF_EXPORT void UnknownFieldSerializerLite(const uint8* base, + uint32 offset, uint32 tag, + uint32 has_offset, + io::CodedOutputStream* output); + +PROTOBUF_EXPORT MessageLite* DuplicateIfNonNullInternal(MessageLite* message); +PROTOBUF_EXPORT MessageLite* GetOwnedMessageInternal(Arena* message_arena, + MessageLite* submessage, + Arena* submessage_arena); +PROTOBUF_EXPORT void GenericSwap(MessageLite* m1, MessageLite* m2); +// We specialize GenericSwap for non-lite messages to benefit from reflection. +PROTOBUF_EXPORT void GenericSwap(Message* m1, Message* m2); + +template +T* DuplicateIfNonNull(T* message) { + // The casts must be reinterpret_cast<> because T might be a forward-declared + // type that the compiler doesn't know is related to MessageLite. + return reinterpret_cast( + DuplicateIfNonNullInternal(reinterpret_cast(message))); +} + +template +T* GetOwnedMessage(Arena* message_arena, T* submessage, + Arena* submessage_arena) { + // The casts must be reinterpret_cast<> because T might be a forward-declared + // type that the compiler doesn't know is related to MessageLite. 
+ return reinterpret_cast(GetOwnedMessageInternal( + message_arena, reinterpret_cast(submessage), + submessage_arena)); +} + +// Hide atomic from the public header and allow easy change to regular int +// on platforms where the atomic might have a perf impact. +class PROTOBUF_EXPORT CachedSize { + public: + int Get() const { return size_.load(std::memory_order_relaxed); } + void Set(int size) { size_.store(size, std::memory_order_relaxed); } + + private: + std::atomic size_{0}; +}; + +// SCCInfo represents information of a strongly connected component of +// mutual dependent messages. +struct PROTOBUF_EXPORT SCCInfoBase { + // We use 0 for the Initialized state, because test eax,eax, jnz is smaller + // and is subject to macro fusion. + enum { + kInitialized = 0, // final state + kRunning = 1, + kUninitialized = -1, // initial state + }; +#if defined(_MSC_VER) && !defined(__clang__) + // MSVC doesn't make std::atomic constant initialized. This union trick + // makes it so. + union { + int visit_status_to_make_linker_init; + std::atomic visit_status; + }; +#else + std::atomic visit_status; +#endif + int num_deps; + int num_implicit_weak_deps; + void (*init_func)(); + // This is followed by an array of num_deps + // const SCCInfoBase* deps[]; +}; + +// Zero-length arrays are a language extension available in GCC and Clang but +// not MSVC. +#ifdef __GNUC__ +#define PROTOBUF_ARRAY_SIZE(n) (n) +#else +#define PROTOBUF_ARRAY_SIZE(n) ((n) ? (n) : 1) +#endif + +template +struct SCCInfo { + SCCInfoBase base; + // Semantically this is const SCCInfo* which is is a templated type. + // The obvious inheriting from SCCInfoBase mucks with struct initialization. + // Attempts showed the compiler was generating dynamic initialization code. + // This deps array consists of base.num_deps pointers to SCCInfoBase followed + // by base.num_implicit_weak_deps pointers to SCCInfoBase*. We need the extra + // pointer indirection for implicit weak fields. 
We cannot use a union type + // here, since that would prevent the array from being linker-initialized. + void* deps[PROTOBUF_ARRAY_SIZE(N)]; +}; + +#undef PROTOBUF_ARRAY_SIZE + +PROTOBUF_EXPORT void InitSCCImpl(SCCInfoBase* scc); + +inline void InitSCC(SCCInfoBase* scc) { + auto status = scc->visit_status.load(std::memory_order_acquire); + if (PROTOBUF_PREDICT_FALSE(status != SCCInfoBase::kInitialized)) + InitSCCImpl(scc); +} + +PROTOBUF_EXPORT void DestroyMessage(const void* message); +PROTOBUF_EXPORT void DestroyString(const void* s); +// Destroy (not delete) the message +inline void OnShutdownDestroyMessage(const void* ptr) { + OnShutdownRun(DestroyMessage, ptr); +} +// Destroy the string (call std::string destructor) +inline void OnShutdownDestroyString(const std::string* ptr) { + OnShutdownRun(DestroyString, ptr); +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/inlined_string_field.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/inlined_string_field.h new file mode 100644 index 0000000000000000000000000000000000000000..14337107a154f25afb83039a633a3cfe3c8367e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/inlined_string_field.h @@ -0,0 +1,265 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_INLINED_STRING_FIELD_H__ +#define GOOGLE_PROTOBUF_INLINED_STRING_FIELD_H__ + +#include +#include + +#include +#include + +// Must be included last. 
+#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { + +class Arena; + +namespace internal { + +// InlinedStringField wraps a std::string instance and exposes an API similar to +// ArenaStringPtr's wrapping of a std::string* instance. As std::string is +// never allocated on the Arena, we expose only the *NoArena methods of +// ArenaStringPtr. +// +// default_value parameters are taken for consistency with ArenaStringPtr, but +// are not used for most methods. With inlining, these should be removed from +// the generated binary. +class PROTOBUF_EXPORT InlinedStringField { + public: + InlinedStringField() PROTOBUF_ALWAYS_INLINE; + explicit InlinedStringField(const std::string& default_value); + + void AssignWithDefault(const std::string* default_value, + const InlinedStringField& from) PROTOBUF_ALWAYS_INLINE; + + void ClearToEmpty(const std::string* default_value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + ClearToEmptyNoArena(default_value); + } + void ClearNonDefaultToEmpty() PROTOBUF_ALWAYS_INLINE { + ClearNonDefaultToEmptyNoArena(); + } + void ClearToEmptyNoArena(const std::string* /*default_value*/) + PROTOBUF_ALWAYS_INLINE { + ClearNonDefaultToEmptyNoArena(); + } + void ClearNonDefaultToEmptyNoArena() PROTOBUF_ALWAYS_INLINE; + + void ClearToDefault(const std::string* default_value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + ClearToDefaultNoArena(default_value); + } + void ClearToDefaultNoArena(const std::string* default_value) + PROTOBUF_ALWAYS_INLINE; + + void Destroy(const std::string* default_value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + DestroyNoArena(default_value); + } + void DestroyNoArena(const std::string* default_value) PROTOBUF_ALWAYS_INLINE; + + const std::string& Get() const PROTOBUF_ALWAYS_INLINE { return GetNoArena(); } + const std::string& GetNoArena() const PROTOBUF_ALWAYS_INLINE; + + std::string* Mutable(const std::string* default_value, + Arena* /*arena*/) 
PROTOBUF_ALWAYS_INLINE { + return MutableNoArena(default_value); + } + std::string* MutableNoArena(const std::string* default_value) + PROTOBUF_ALWAYS_INLINE; + + std::string* Release(const std::string* default_value, Arena* /*arena*/) { + return ReleaseNoArena(default_value); + } + std::string* ReleaseNonDefault(const std::string* default_value, + Arena* /*arena*/) { + return ReleaseNonDefaultNoArena(default_value); + } + std::string* ReleaseNoArena(const std::string* default_value) { + return ReleaseNonDefaultNoArena(default_value); + } + std::string* ReleaseNonDefaultNoArena(const std::string* default_value); + + void Set(const std::string* default_value, StringPiece value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + SetNoArena(default_value, value); + } + void SetLite(const std::string* default_value, StringPiece value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + SetNoArena(default_value, value); + } + void SetNoArena(const std::string* default_value, + StringPiece value) PROTOBUF_ALWAYS_INLINE; + + void Set(const std::string* default_value, const std::string& value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + SetNoArena(default_value, value); + } + void SetLite(const std::string* default_value, const std::string& value, + Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE { + SetNoArena(default_value, value); + } + void SetNoArena(const std::string* default_value, + const std::string& value) PROTOBUF_ALWAYS_INLINE; + + void SetNoArena(const std::string* default_value, + std::string&& value) PROTOBUF_ALWAYS_INLINE; + void SetAllocated(const std::string* default_value, std::string* value, + Arena* /*arena*/) { + SetAllocatedNoArena(default_value, value); + } + void SetAllocatedNoArena(const std::string* default_value, + std::string* value); + void Swap(InlinedStringField* from) PROTOBUF_ALWAYS_INLINE; + std::string* UnsafeMutablePointer(); + void UnsafeSetDefault(const std::string* default_value); + std::string* UnsafeArenaRelease(const std::string* 
default_value, + Arena* arena); + void UnsafeArenaSetAllocated(const std::string* default_value, + std::string* value, Arena* arena); + + bool IsDefault(const std::string* /*default_value*/) { return false; } + + private: + std::string value_; +}; + +inline InlinedStringField::InlinedStringField() {} + +inline InlinedStringField::InlinedStringField(const std::string& default_value) + : value_(default_value) {} + +inline void InlinedStringField::AssignWithDefault( + const std::string* /*default_value*/, const InlinedStringField& from) { + value_ = from.value_; +} + +inline const std::string& InlinedStringField::GetNoArena() const { + return value_; +} + +inline std::string* InlinedStringField::MutableNoArena(const std::string*) { + return &value_; +} + +inline void InlinedStringField::SetAllocatedNoArena( + const std::string* default_value, std::string* value) { + if (value == NULL) { + value_.assign(*default_value); + } else { + value_.assign(std::move(*value)); + delete value; + } +} + +inline void InlinedStringField::DestroyNoArena(const std::string*) { + // This is invoked from the generated message's ArenaDtor, which is used to + // clean up objects not allocated on the Arena. 
+ this->~InlinedStringField(); +} + +inline void InlinedStringField::ClearNonDefaultToEmptyNoArena() { + value_.clear(); +} + +inline void InlinedStringField::ClearToDefaultNoArena( + const std::string* default_value) { + value_.assign(*default_value); +} + +inline std::string* InlinedStringField::ReleaseNonDefaultNoArena( + const std::string* default_value) { + std::string* released = new std::string(*default_value); + value_.swap(*released); + return released; +} + +inline void InlinedStringField::SetNoArena(const std::string* /*default_value*/, + StringPiece value) { + value_.assign(value.data(), value.length()); +} + +inline void InlinedStringField::SetNoArena(const std::string* /*default_value*/, + const std::string& value) { + value_.assign(value); +} + +inline void InlinedStringField::SetNoArena(const std::string* /*default_value*/, + std::string&& value) { + value_.assign(std::move(value)); +} + +inline void InlinedStringField::Swap(InlinedStringField* from) { + value_.swap(from->value_); +} + +inline std::string* InlinedStringField::UnsafeMutablePointer() { + return &value_; +} + +inline void InlinedStringField::UnsafeSetDefault( + const std::string* default_value) { + value_.assign(*default_value); +} + +inline std::string* InlinedStringField::UnsafeArenaRelease( + const std::string* default_value, Arena* /*arena*/) { + return ReleaseNoArena(default_value); +} + +inline void InlinedStringField::UnsafeArenaSetAllocated( + const std::string* default_value, std::string* value, Arena* /*arena*/) { + if (value == NULL) { + value_.assign(*default_value); + } else { + value_.assign(*value); + } +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_INLINED_STRING_FIELD_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map.h new file mode 100644 index 0000000000000000000000000000000000000000..540c914b1d675aab8ba3a843500c098f4959863f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map.h @@ -0,0 +1,1280 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file defines the map container and its helpers to support protobuf maps. +// +// The Map and MapIterator types are provided by this header file. +// Please avoid using other types defined here, unless they are public +// types within Map or MapIterator, such as Map::value_type. + +#ifndef GOOGLE_PROTOBUF_MAP_H__ +#define GOOGLE_PROTOBUF_MAP_H__ + +#include +#include +#include +#include // To support Visual Studio 2008 +#include +#include +#include +#include + +#if defined(__cpp_lib_string_view) +#include +#endif // defined(__cpp_lib_string_view) + +#include +#include +#include +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +namespace google { +namespace protobuf { + +template +class Map; + +class MapIterator; + +template +struct is_proto_enum; + +namespace internal { +template +class MapFieldLite; + +template +class MapField; + +template +class TypeDefinedMapFieldBase; + +class DynamicMapField; + +class GeneratedMessageReflection; + +// re-implement std::allocator to use arena allocator for memory allocation. +// Used for Map implementation. Users should not use this class +// directly. 
+template +class MapAllocator { + public: + using value_type = U; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = size_t; + using difference_type = ptrdiff_t; + + MapAllocator() : arena_(nullptr) {} + explicit MapAllocator(Arena* arena) : arena_(arena) {} + template + MapAllocator(const MapAllocator& allocator) // NOLINT(runtime/explicit) + : arena_(allocator.arena()) {} + + pointer allocate(size_type n, const void* /* hint */ = nullptr) { + // If arena is not given, malloc needs to be called which doesn't + // construct element object. + if (arena_ == nullptr) { + return static_cast(::operator new(n * sizeof(value_type))); + } else { + return reinterpret_cast( + Arena::CreateArray(arena_, n * sizeof(value_type))); + } + } + + void deallocate(pointer p, size_type n) { + if (arena_ == nullptr) { +#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation) + ::operator delete(p, n * sizeof(value_type)); +#else + (void)n; + ::operator delete(p); +#endif + } + } + +#if __cplusplus >= 201103L && !defined(GOOGLE_PROTOBUF_OS_APPLE) && \ + !defined(GOOGLE_PROTOBUF_OS_NACL) && \ + !defined(GOOGLE_PROTOBUF_OS_EMSCRIPTEN) + template + void construct(NodeType* p, Args&&... args) { + // Clang 3.6 doesn't compile static casting to void* directly. (Issue + // #1266) According C++ standard 5.2.9/1: "The static_cast operator shall + // not cast away constness". So first the maybe const pointer is casted to + // const void* and after the const void* is const casted. 
+ new (const_cast(static_cast(p))) + NodeType(std::forward(args)...); + } + + template + void destroy(NodeType* p) { + p->~NodeType(); + } +#else + void construct(pointer p, const_reference t) { new (p) value_type(t); } + + void destroy(pointer p) { p->~value_type(); } +#endif + + template + struct rebind { + using other = MapAllocator; + }; + + template + bool operator==(const MapAllocator& other) const { + return arena_ == other.arena_; + } + + template + bool operator!=(const MapAllocator& other) const { + return arena_ != other.arena_; + } + + // To support Visual Studio 2008 + size_type max_size() const { + // parentheses around (std::...:max) prevents macro warning of max() + return (std::numeric_limits::max)(); + } + + // To support gcc-4.4, which does not properly + // support templated friend classes + Arena* arena() const { return arena_; } + + private: + using DestructorSkippable_ = void; + Arena* const arena_; +}; + +template +using KeyForTree = + typename std::conditional::value, T, + std::reference_wrapper>::type; + +// Default case: Not transparent. +// We use std::hash/std::less and all the lookup functions +// only accept `key_type`. +template +struct TransparentSupport { + using hash = std::hash; + using less = std::less; + + static bool Equals(const key_type& a, const key_type& b) { return a == b; } + + template + using key_arg = key_type; +}; + +#if defined(__cpp_lib_string_view) +// If std::string_view is available, we add transparent support for std::string +// keys. We use std::hash as it supports the input types we +// care about. The lookup functions accept arbitrary `K`. This will include any +// key type that is convertible to std::string_view. +template <> +struct TransparentSupport { + static std::string_view ImplicitConvert(std::string_view str) { return str; } + // If the element is not convertible to std::string_view, try to convert to + // std::string first. 
+ // The template makes this overload lose resolution when both have the same + // rank otherwise. + template + static std::string_view ImplicitConvert(const std::string& str) { + return str; + } + + struct hash : private std::hash { + using is_transparent = void; + + template + size_t operator()(const T& str) const { + return base()(ImplicitConvert(str)); + } + + private: + const std::hash& base() const { return *this; } + }; + struct less { + using is_transparent = void; + + template + bool operator()(const T& t, const U& u) const { + return ImplicitConvert(t) < ImplicitConvert(u); + } + }; + + template + static bool Equals(const T& t, const U& u) { + return ImplicitConvert(t) == ImplicitConvert(u); + } + + template + using key_arg = K; +}; +#endif // defined(__cpp_lib_string_view) + +} // namespace internal + +// This is the class for Map's internal value_type. Instead of using +// std::pair as value_type, we use this class which provides us more control of +// its process of construction and destruction. +template +struct MapPair { + using first_type = const Key; + using second_type = T; + + MapPair(const Key& other_first, const T& other_second) + : first(other_first), second(other_second) {} + explicit MapPair(const Key& other_first) : first(other_first), second() {} + MapPair(const MapPair& other) : first(other.first), second(other.second) {} + + ~MapPair() {} + + // Implicitly convertible to std::pair of compatible types. + template + operator std::pair() const { // NOLINT(runtime/explicit) + return std::pair(first, second); + } + + const Key first; + T second; + + private: + friend class Arena; + friend class Map; +}; + +// Map is an associative container type used to store protobuf map +// fields. Each Map instance may or may not use a different hash function, a +// different iteration order, and so on. 
E.g., please don't examine +// implementation details to decide if the following would work: +// Map m0, m1; +// m0[0] = m1[0] = m0[1] = m1[1] = 0; +// assert(m0.begin()->first == m1.begin()->first); // Bug! +// +// Map's interface is similar to std::unordered_map, except that Map is not +// designed to play well with exceptions. +template +class Map { + public: + using key_type = Key; + using mapped_type = T; + using value_type = MapPair; + + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + + using size_type = size_t; + using hasher = typename internal::TransparentSupport::hash; + + Map() : arena_(nullptr), default_enum_value_(0) { Init(); } + explicit Map(Arena* arena) : arena_(arena), default_enum_value_(0) { Init(); } + + Map(const Map& other) + : arena_(nullptr), default_enum_value_(other.default_enum_value_) { + Init(); + insert(other.begin(), other.end()); + } + + Map(Map&& other) noexcept : Map() { + if (other.arena_) { + *this = other; + } else { + swap(other); + } + } + Map& operator=(Map&& other) noexcept { + if (this != &other) { + if (arena_ != other.arena_) { + *this = other; + } else { + swap(other); + } + } + return *this; + } + + template + Map(const InputIt& first, const InputIt& last) + : arena_(nullptr), default_enum_value_(0) { + Init(); + insert(first, last); + } + + ~Map() { + if (arena_ == nullptr) { + clear(); + delete elements_; + } + } + + private: + void Init() { elements_ = Arena::CreateMessage(arena_, 0); } + + using Allocator = internal::MapAllocator; + + // InnerMap is a generic hash-based map. It doesn't contain any + // protocol-buffer-specific logic. It is a chaining hash map with the + // additional feature that some buckets can be converted to use an ordered + // container. This ensures O(lg n) bounds on find, insert, and erase, while + // avoiding the overheads of ordered containers most of the time. 
+ // + // The implementation doesn't need the full generality of unordered_map, + // and it doesn't have it. More bells and whistles can be added as needed. + // Some implementation details: + // 1. The hash function has type hasher and the equality function + // equal_to. We inherit from hasher to save space + // (empty-base-class optimization). + // 2. The number of buckets is a power of two. + // 3. Buckets are converted to trees in pairs: if we convert bucket b then + // buckets b and b^1 will share a tree. Invariant: buckets b and b^1 have + // the same non-null value iff they are sharing a tree. (An alternative + // implementation strategy would be to have a tag bit per bucket.) + // 4. As is typical for hash_map and such, the Keys and Values are always + // stored in linked list nodes. Pointers to elements are never invalidated + // until the element is deleted. + // 5. The trees' payload type is pointer to linked-list node. Tree-converting + // a bucket doesn't copy Key-Value pairs. + // 6. Once we've tree-converted a bucket, it is never converted back. However, + // the items a tree contains may wind up assigned to trees or lists upon a + // rehash. + // 7. The code requires no C++ features from C++14 or later. + // 8. Mutations to a map do not invalidate the map's iterators, pointers to + // elements, or references to elements. + // 9. Except for erase(iterator), any non-const method can reorder iterators. + // 10. InnerMap uses KeyForTree when using the Tree representation, which + // is either `Key`, if Key is a scalar, or `reference_wrapper` + // otherwise. This avoids unncessary copies of string keys, for example. 
+ class InnerMap : private hasher { + public: + explicit InnerMap(size_type n) : InnerMap(nullptr, n) {} + InnerMap(Arena* arena, size_type n) + : hasher(), + num_elements_(0), + seed_(Seed()), + table_(nullptr), + alloc_(arena) { + n = TableSize(n); + table_ = CreateEmptyTable(n); + num_buckets_ = index_of_first_non_null_ = n; + } + + ~InnerMap() { + if (table_ != nullptr) { + clear(); + Dealloc(table_, num_buckets_); + } + } + + private: + enum { kMinTableSize = 8 }; + + // Linked-list nodes, as one would expect for a chaining hash table. + struct Node { + value_type kv; + Node* next; + }; + + // Trees. The payload type is a copy of Key, so that we can query the tree + // with Keys that are not in any particular data structure. + // The value is a void* pointing to Node. We use void* instead of Node* to + // avoid code bloat. That way there is only one instantiation of the tree + // class per key type. + using TreeAllocator = typename Allocator::template rebind< + std::pair, void*>>::other; + using Tree = std::map, void*, + typename internal::TransparentSupport::less, + TreeAllocator>; + using TreeIterator = typename Tree::iterator; + + static Node* NodeFromTreeIterator(TreeIterator it) { + return static_cast(it->second); + } + + // iterator and const_iterator are instantiations of iterator_base. + template + class iterator_base { + public: + using reference = KeyValueType&; + using pointer = KeyValueType*; + + // Invariants: + // node_ is always correct. This is handy because the most common + // operations are operator* and operator-> and they only use node_. + // When node_ is set to a non-null value, all the other non-const fields + // are updated to be correct also, but those fields can become stale + // if the underlying map is modified. When those fields are needed they + // are rechecked, and updated if necessary. 
+ iterator_base() : node_(nullptr), m_(nullptr), bucket_index_(0) {} + + explicit iterator_base(const InnerMap* m) : m_(m) { + SearchFrom(m->index_of_first_non_null_); + } + + // Any iterator_base can convert to any other. This is overkill, and we + // rely on the enclosing class to use it wisely. The standard "iterator + // can convert to const_iterator" is OK but the reverse direction is not. + template + explicit iterator_base(const iterator_base& it) + : node_(it.node_), m_(it.m_), bucket_index_(it.bucket_index_) {} + + iterator_base(Node* n, const InnerMap* m, size_type index) + : node_(n), m_(m), bucket_index_(index) {} + + iterator_base(TreeIterator tree_it, const InnerMap* m, size_type index) + : node_(NodeFromTreeIterator(tree_it)), m_(m), bucket_index_(index) { + // Invariant: iterators that use buckets with trees have an even + // bucket_index_. + GOOGLE_DCHECK_EQ(bucket_index_ % 2, 0u); + } + + // Advance through buckets, looking for the first that isn't empty. + // If nothing non-empty is found then leave node_ == nullptr. 
+ void SearchFrom(size_type start_bucket) { + GOOGLE_DCHECK(m_->index_of_first_non_null_ == m_->num_buckets_ || + m_->table_[m_->index_of_first_non_null_] != nullptr); + node_ = nullptr; + for (bucket_index_ = start_bucket; bucket_index_ < m_->num_buckets_; + bucket_index_++) { + if (m_->TableEntryIsNonEmptyList(bucket_index_)) { + node_ = static_cast(m_->table_[bucket_index_]); + break; + } else if (m_->TableEntryIsTree(bucket_index_)) { + Tree* tree = static_cast(m_->table_[bucket_index_]); + GOOGLE_DCHECK(!tree->empty()); + node_ = NodeFromTreeIterator(tree->begin()); + break; + } + } + } + + reference operator*() const { return node_->kv; } + pointer operator->() const { return &(operator*()); } + + friend bool operator==(const iterator_base& a, const iterator_base& b) { + return a.node_ == b.node_; + } + friend bool operator!=(const iterator_base& a, const iterator_base& b) { + return a.node_ != b.node_; + } + + iterator_base& operator++() { + if (node_->next == nullptr) { + TreeIterator tree_it; + const bool is_list = revalidate_if_necessary(&tree_it); + if (is_list) { + SearchFrom(bucket_index_ + 1); + } else { + GOOGLE_DCHECK_EQ(bucket_index_ & 1, 0u); + Tree* tree = static_cast(m_->table_[bucket_index_]); + if (++tree_it == tree->end()) { + SearchFrom(bucket_index_ + 2); + } else { + node_ = NodeFromTreeIterator(tree_it); + } + } + } else { + node_ = node_->next; + } + return *this; + } + + iterator_base operator++(int /* unused */) { + iterator_base tmp = *this; + ++*this; + return tmp; + } + + // Assumes node_ and m_ are correct and non-null, but other fields may be + // stale. Fix them as needed. Then return true iff node_ points to a + // Node in a list. If false is returned then *it is modified to be + // a valid iterator for node_. + bool revalidate_if_necessary(TreeIterator* it) { + GOOGLE_DCHECK(node_ != nullptr && m_ != nullptr); + // Force bucket_index_ to be in range. 
+ bucket_index_ &= (m_->num_buckets_ - 1); + // Common case: the bucket we think is relevant points to node_. + if (m_->table_[bucket_index_] == static_cast(node_)) return true; + // Less common: the bucket is a linked list with node_ somewhere in it, + // but not at the head. + if (m_->TableEntryIsNonEmptyList(bucket_index_)) { + Node* l = static_cast(m_->table_[bucket_index_]); + while ((l = l->next) != nullptr) { + if (l == node_) { + return true; + } + } + } + // Well, bucket_index_ still might be correct, but probably + // not. Revalidate just to be sure. This case is rare enough that we + // don't worry about potential optimizations, such as having a custom + // find-like method that compares Node* instead of the key. + iterator_base i(m_->find(node_->kv.first, it)); + bucket_index_ = i.bucket_index_; + return m_->TableEntryIsList(bucket_index_); + } + + Node* node_; + const InnerMap* m_; + size_type bucket_index_; + }; + + public: + using iterator = iterator_base; + using const_iterator = iterator_base; + + iterator begin() { return iterator(this); } + iterator end() { return iterator(); } + const_iterator begin() const { return const_iterator(this); } + const_iterator end() const { return const_iterator(); } + + void clear() { + for (size_type b = 0; b < num_buckets_; b++) { + if (TableEntryIsNonEmptyList(b)) { + Node* node = static_cast(table_[b]); + table_[b] = nullptr; + do { + Node* next = node->next; + DestroyNode(node); + node = next; + } while (node != nullptr); + } else if (TableEntryIsTree(b)) { + Tree* tree = static_cast(table_[b]); + GOOGLE_DCHECK(table_[b] == table_[b + 1] && (b & 1) == 0); + table_[b] = table_[b + 1] = nullptr; + typename Tree::iterator tree_it = tree->begin(); + do { + Node* node = NodeFromTreeIterator(tree_it); + typename Tree::iterator next = tree_it; + ++next; + tree->erase(tree_it); + DestroyNode(node); + tree_it = next; + } while (tree_it != tree->end()); + DestroyTree(tree); + b++; + } + } + num_elements_ = 0; + 
index_of_first_non_null_ = num_buckets_; + } + + const hasher& hash_function() const { return *this; } + + static size_type max_size() { + return static_cast(1) << (sizeof(void**) >= 8 ? 60 : 28); + } + size_type size() const { return num_elements_; } + bool empty() const { return size() == 0; } + + template + iterator find(const K& k) { + return iterator(FindHelper(k).first); + } + + // Insert the key into the map, if not present. In that case, the value will + // be value initialized. + std::pair insert(const Key& k) { + std::pair p = FindHelper(k); + // Case 1: key was already present. + if (p.first.node_ != nullptr) + return std::make_pair(iterator(p.first), false); + // Case 2: insert. + if (ResizeIfLoadIsOutOfRange(num_elements_ + 1)) { + p = FindHelper(k); + } + const size_type b = p.second; // bucket number + Node* node; + if (alloc_.arena() == nullptr) { + node = new Node{value_type(k), nullptr}; + } else { + node = Alloc(1); + Arena::CreateInArenaStorage(const_cast(&node->kv.first), + alloc_.arena(), k); + Arena::CreateInArenaStorage(&node->kv.second, alloc_.arena()); + } + + iterator result = InsertUnique(b, node); + ++num_elements_; + return std::make_pair(result, true); + } + + value_type& operator[](const Key& k) { return *insert(k).first; } + + void erase(iterator it) { + GOOGLE_DCHECK_EQ(it.m_, this); + typename Tree::iterator tree_it; + const bool is_list = it.revalidate_if_necessary(&tree_it); + size_type b = it.bucket_index_; + Node* const item = it.node_; + if (is_list) { + GOOGLE_DCHECK(TableEntryIsNonEmptyList(b)); + Node* head = static_cast(table_[b]); + head = EraseFromLinkedList(item, head); + table_[b] = static_cast(head); + } else { + GOOGLE_DCHECK(TableEntryIsTree(b)); + Tree* tree = static_cast(table_[b]); + tree->erase(tree_it); + if (tree->empty()) { + // Force b to be the minimum of b and b ^ 1. This is important + // only because we want index_of_first_non_null_ to be correct. 
+ b &= ~static_cast(1); + DestroyTree(tree); + table_[b] = table_[b + 1] = nullptr; + } + } + DestroyNode(item); + --num_elements_; + if (PROTOBUF_PREDICT_FALSE(b == index_of_first_non_null_)) { + while (index_of_first_non_null_ < num_buckets_ && + table_[index_of_first_non_null_] == nullptr) { + ++index_of_first_non_null_; + } + } + } + + private: + const_iterator find(const Key& k, TreeIterator* it) const { + return FindHelper(k, it).first; + } + template + std::pair FindHelper(const K& k) const { + return FindHelper(k, nullptr); + } + template + std::pair FindHelper(const K& k, + TreeIterator* it) const { + size_type b = BucketNumber(k); + if (TableEntryIsNonEmptyList(b)) { + Node* node = static_cast(table_[b]); + do { + if (internal::TransparentSupport::Equals(node->kv.first, k)) { + return std::make_pair(const_iterator(node, this, b), b); + } else { + node = node->next; + } + } while (node != nullptr); + } else if (TableEntryIsTree(b)) { + GOOGLE_DCHECK_EQ(table_[b], table_[b ^ 1]); + b &= ~static_cast(1); + Tree* tree = static_cast(table_[b]); + auto tree_it = tree->find(k); + if (tree_it != tree->end()) { + if (it != nullptr) *it = tree_it; + return std::make_pair(const_iterator(tree_it, this, b), b); + } + } + return std::make_pair(end(), b); + } + + // Insert the given Node in bucket b. If that would make bucket b too big, + // and bucket b is not a tree, create a tree for buckets b and b^1 to share. + // Requires count(*KeyPtrFromNodePtr(node)) == 0 and that b is the correct + // bucket. num_elements_ is not modified. + iterator InsertUnique(size_type b, Node* node) { + GOOGLE_DCHECK(index_of_first_non_null_ == num_buckets_ || + table_[index_of_first_non_null_] != nullptr); + // In practice, the code that led to this point may have already + // determined whether we are inserting into an empty list, a short list, + // or whatever. But it's probably cheap enough to recompute that here; + // it's likely that we're inserting into an empty or short list. 
+ iterator result; + GOOGLE_DCHECK(find(node->kv.first) == end()); + if (TableEntryIsEmpty(b)) { + result = InsertUniqueInList(b, node); + } else if (TableEntryIsNonEmptyList(b)) { + if (PROTOBUF_PREDICT_FALSE(TableEntryIsTooLong(b))) { + TreeConvert(b); + result = InsertUniqueInTree(b, node); + GOOGLE_DCHECK_EQ(result.bucket_index_, b & ~static_cast(1)); + } else { + // Insert into a pre-existing list. This case cannot modify + // index_of_first_non_null_, so we skip the code to update it. + return InsertUniqueInList(b, node); + } + } else { + // Insert into a pre-existing tree. This case cannot modify + // index_of_first_non_null_, so we skip the code to update it. + return InsertUniqueInTree(b, node); + } + // parentheses around (std::min) prevents macro expansion of min(...) + index_of_first_non_null_ = + (std::min)(index_of_first_non_null_, result.bucket_index_); + return result; + } + + // Returns whether we should insert after the head of the list. For + // non-optimized builds, we randomly decide whether to insert right at the + // head of the list or just after the head. This helps add a little bit of + // non-determinism to the map ordering. + bool ShouldInsertAfterHead(void* node) { +#ifdef NDEBUG + return false; +#else + // Doing modulo with a prime mixes the bits more. + return (reinterpret_cast(node) ^ seed_) % 13 > 6; +#endif + } + + // Helper for InsertUnique. Handles the case where bucket b is a + // not-too-long linked list. + iterator InsertUniqueInList(size_type b, Node* node) { + if (table_[b] != nullptr && ShouldInsertAfterHead(node)) { + Node* first = static_cast(table_[b]); + node->next = first->next; + first->next = node; + return iterator(node, this, b); + } + + node->next = static_cast(table_[b]); + table_[b] = static_cast(node); + return iterator(node, this, b); + } + + // Helper for InsertUnique. Handles the case where bucket b points to a + // Tree. 
+ iterator InsertUniqueInTree(size_type b, Node* node) { + GOOGLE_DCHECK_EQ(table_[b], table_[b ^ 1]); + // Maintain the invariant that node->next is null for all Nodes in Trees. + node->next = nullptr; + return iterator( + static_cast(table_[b])->insert({node->kv.first, node}).first, + this, b & ~static_cast(1)); + } + + // Returns whether it did resize. Currently this is only used when + // num_elements_ increases, though it could be used in other situations. + // It checks for load too low as well as load too high: because any number + // of erases can occur between inserts, the load could be as low as 0 here. + // Resizing to a lower size is not always helpful, but failing to do so can + // destroy the expected big-O bounds for some operations. By having the + // policy that sometimes we resize down as well as up, clients can easily + // keep O(size()) = O(number of buckets) if they want that. + bool ResizeIfLoadIsOutOfRange(size_type new_size) { + const size_type kMaxMapLoadTimes16 = 12; // controls RAM vs CPU tradeoff + const size_type hi_cutoff = num_buckets_ * kMaxMapLoadTimes16 / 16; + const size_type lo_cutoff = hi_cutoff / 4; + // We don't care how many elements are in trees. If a lot are, + // we may resize even though there are many empty buckets. In + // practice, this seems fine. + if (PROTOBUF_PREDICT_FALSE(new_size >= hi_cutoff)) { + if (num_buckets_ <= max_size() / 2) { + Resize(num_buckets_ * 2); + return true; + } + } else if (PROTOBUF_PREDICT_FALSE(new_size <= lo_cutoff && + num_buckets_ > kMinTableSize)) { + size_type lg2_of_size_reduction_factor = 1; + // It's possible we want to shrink a lot here... size() could even be 0. + // So, estimate how much to shrink by making sure we don't shrink so + // much that we would need to grow the table after a few inserts. 
+ const size_type hypothetical_size = new_size * 5 / 4 + 1; + while ((hypothetical_size << lg2_of_size_reduction_factor) < + hi_cutoff) { + ++lg2_of_size_reduction_factor; + } + size_type new_num_buckets = std::max( + kMinTableSize, num_buckets_ >> lg2_of_size_reduction_factor); + if (new_num_buckets != num_buckets_) { + Resize(new_num_buckets); + return true; + } + } + return false; + } + + // Resize to the given number of buckets. + void Resize(size_t new_num_buckets) { + GOOGLE_DCHECK_GE(new_num_buckets, kMinTableSize); + void** const old_table = table_; + const size_type old_table_size = num_buckets_; + num_buckets_ = new_num_buckets; + table_ = CreateEmptyTable(num_buckets_); + const size_type start = index_of_first_non_null_; + index_of_first_non_null_ = num_buckets_; + for (size_type i = start; i < old_table_size; i++) { + if (TableEntryIsNonEmptyList(old_table, i)) { + TransferList(old_table, i); + } else if (TableEntryIsTree(old_table, i)) { + TransferTree(old_table, i++); + } + } + Dealloc(old_table, old_table_size); + } + + void TransferList(void* const* table, size_type index) { + Node* node = static_cast(table[index]); + do { + Node* next = node->next; + InsertUnique(BucketNumber(node->kv.first), node); + node = next; + } while (node != nullptr); + } + + void TransferTree(void* const* table, size_type index) { + Tree* tree = static_cast(table[index]); + typename Tree::iterator tree_it = tree->begin(); + do { + InsertUnique(BucketNumber(std::cref(tree_it->first).get()), + NodeFromTreeIterator(tree_it)); + } while (++tree_it != tree->end()); + DestroyTree(tree); + } + + Node* EraseFromLinkedList(Node* item, Node* head) { + if (head == item) { + return head->next; + } else { + head->next = EraseFromLinkedList(item, head->next); + return head; + } + } + + bool TableEntryIsEmpty(size_type b) const { + return TableEntryIsEmpty(table_, b); + } + bool TableEntryIsNonEmptyList(size_type b) const { + return TableEntryIsNonEmptyList(table_, b); + } + bool 
TableEntryIsTree(size_type b) const { + return TableEntryIsTree(table_, b); + } + bool TableEntryIsList(size_type b) const { + return TableEntryIsList(table_, b); + } + static bool TableEntryIsEmpty(void* const* table, size_type b) { + return table[b] == nullptr; + } + static bool TableEntryIsNonEmptyList(void* const* table, size_type b) { + return table[b] != nullptr && table[b] != table[b ^ 1]; + } + static bool TableEntryIsTree(void* const* table, size_type b) { + return !TableEntryIsEmpty(table, b) && + !TableEntryIsNonEmptyList(table, b); + } + static bool TableEntryIsList(void* const* table, size_type b) { + return !TableEntryIsTree(table, b); + } + + void TreeConvert(size_type b) { + GOOGLE_DCHECK(!TableEntryIsTree(b) && !TableEntryIsTree(b ^ 1)); + Tree* tree = + Arena::Create(alloc_.arena(), typename Tree::key_compare(), + typename Tree::allocator_type(alloc_)); + size_type count = CopyListToTree(b, tree) + CopyListToTree(b ^ 1, tree); + GOOGLE_DCHECK_EQ(count, tree->size()); + table_[b] = table_[b ^ 1] = static_cast(tree); + } + + // Copy a linked list in the given bucket to a tree. + // Returns the number of things it copied. + size_type CopyListToTree(size_type b, Tree* tree) { + size_type count = 0; + Node* node = static_cast(table_[b]); + while (node != nullptr) { + tree->insert({node->kv.first, node}); + ++count; + Node* next = node->next; + node->next = nullptr; + node = next; + } + return count; + } + + // Return whether table_[b] is a linked list that seems awfully long. + // Requires table_[b] to point to a non-empty linked list. + bool TableEntryIsTooLong(size_type b) { + const size_type kMaxLength = 8; + size_type count = 0; + Node* node = static_cast(table_[b]); + do { + ++count; + node = node->next; + } while (node != nullptr); + // Invariant: no linked list ever is more than kMaxLength in length. 
+ GOOGLE_DCHECK_LE(count, kMaxLength); + return count >= kMaxLength; + } + + template + size_type BucketNumber(const K& k) const { + // We xor the hash value against the random seed so that we effectively + // have a random hash function. + uint64 h = hash_function()(k) ^ seed_; + + // We use the multiplication method to determine the bucket number from + // the hash value. The constant kPhi (suggested by Knuth) is roughly + // (sqrt(5) - 1) / 2 * 2^64. + constexpr uint64 kPhi = uint64{0x9e3779b97f4a7c15}; + return ((kPhi * h) >> 32) & (num_buckets_ - 1); + } + + // Return a power of two no less than max(kMinTableSize, n). + // Assumes either n < kMinTableSize or n is a power of two. + size_type TableSize(size_type n) { + return n < static_cast(kMinTableSize) + ? static_cast(kMinTableSize) + : n; + } + + // Use alloc_ to allocate an array of n objects of type U. + template + U* Alloc(size_type n) { + using alloc_type = typename Allocator::template rebind::other; + return alloc_type(alloc_).allocate(n); + } + + // Use alloc_ to deallocate an array of n objects of type U. + template + void Dealloc(U* t, size_type n) { + using alloc_type = typename Allocator::template rebind::other; + alloc_type(alloc_).deallocate(t, n); + } + + void DestroyNode(Node* node) { + if (alloc_.arena() == nullptr) { + delete node; + } + } + + void DestroyTree(Tree* tree) { + if (alloc_.arena() == nullptr) { + delete tree; + } + } + + void** CreateEmptyTable(size_type n) { + GOOGLE_DCHECK(n >= kMinTableSize); + GOOGLE_DCHECK_EQ(n & (n - 1), 0); + void** result = Alloc(n); + memset(result, 0, n * sizeof(result[0])); + return result; + } + + // Return a randomish value. + size_type Seed() const { + // We get a little bit of randomness from the address of the map. The + // lower bits are not very random, due to alignment, so we discard them + // and shift the higher bits into their place. 
+ size_type s = reinterpret_cast(this) >> 12; +#if defined(__x86_64__) && defined(__GNUC__) && \ + !defined(GOOGLE_PROTOBUF_NO_RDTSC) + uint32 hi, lo; + asm("rdtsc" : "=a"(lo), "=d"(hi)); + s += ((static_cast(hi) << 32) | lo); +#endif + return s; + } + + friend class Arena; + using InternalArenaConstructable_ = void; + using DestructorSkippable_ = void; + + size_type num_elements_; + size_type num_buckets_; + size_type seed_; + size_type index_of_first_non_null_; + void** table_; // an array with num_buckets_ entries + Allocator alloc_; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(InnerMap); + }; // end of class InnerMap + + template + using key_arg = typename internal::TransparentSupport< + key_type>::template key_arg; + + public: + // Iterators + class const_iterator { + using InnerIt = typename InnerMap::const_iterator; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = typename Map::value_type; + using difference_type = ptrdiff_t; + using pointer = const value_type*; + using reference = const value_type&; + + const_iterator() {} + explicit const_iterator(const InnerIt& it) : it_(it) {} + + const_reference operator*() const { return *it_; } + const_pointer operator->() const { return &(operator*()); } + + const_iterator& operator++() { + ++it_; + return *this; + } + const_iterator operator++(int) { return const_iterator(it_++); } + + friend bool operator==(const const_iterator& a, const const_iterator& b) { + return a.it_ == b.it_; + } + friend bool operator!=(const const_iterator& a, const const_iterator& b) { + return !(a == b); + } + + private: + InnerIt it_; + }; + + class iterator { + using InnerIt = typename InnerMap::iterator; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = typename Map::value_type; + using difference_type = ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + iterator() {} + explicit iterator(const InnerIt& it) : it_(it) {} + + reference 
operator*() const { return *it_; } + pointer operator->() const { return &(operator*()); } + + iterator& operator++() { + ++it_; + return *this; + } + iterator operator++(int) { return iterator(it_++); } + + // Allow implicit conversion to const_iterator. + operator const_iterator() const { // NOLINT(runtime/explicit) + return const_iterator(typename InnerMap::const_iterator(it_)); + } + + friend bool operator==(const iterator& a, const iterator& b) { + return a.it_ == b.it_; + } + friend bool operator!=(const iterator& a, const iterator& b) { + return !(a == b); + } + + private: + friend class Map; + + InnerIt it_; + }; + + iterator begin() { return iterator(elements_->begin()); } + iterator end() { return iterator(elements_->end()); } + const_iterator begin() const { + return const_iterator(iterator(elements_->begin())); + } + const_iterator end() const { + return const_iterator(iterator(elements_->end())); + } + const_iterator cbegin() const { return begin(); } + const_iterator cend() const { return end(); } + + // Capacity + size_type size() const { return elements_->size(); } + bool empty() const { return size() == 0; } + + // Element access + T& operator[](const key_type& key) { return (*elements_)[key].second; } + + template + const T& at(const key_arg& key) const { + const_iterator it = find(key); + GOOGLE_CHECK(it != end()) << "key not found: " << static_cast(key); + return it->second; + } + + template + T& at(const key_arg& key) { + iterator it = find(key); + GOOGLE_CHECK(it != end()) << "key not found: " << static_cast(key); + return it->second; + } + + // Lookup + template + size_type count(const key_arg& key) const { + return find(key) == end() ? 
0 : 1; + } + + template + const_iterator find(const key_arg& key) const { + return const_iterator(iterator(elements_->find(key))); + } + template + iterator find(const key_arg& key) { + return iterator(elements_->find(key)); + } + + template + bool contains(const key_arg& key) const { + return find(key) != end(); + } + + template + std::pair equal_range( + const key_arg& key) const { + const_iterator it = find(key); + if (it == end()) { + return std::pair(it, it); + } else { + const_iterator begin = it++; + return std::pair(begin, it); + } + } + + template + std::pair equal_range(const key_arg& key) { + iterator it = find(key); + if (it == end()) { + return std::pair(it, it); + } else { + iterator begin = it++; + return std::pair(begin, it); + } + } + + // insert + std::pair insert(const value_type& value) { + std::pair p = + elements_->insert(value.first); + if (p.second) { + p.first->second = value.second; + } + return std::pair(iterator(p.first), p.second); + } + template + void insert(InputIt first, InputIt last) { + for (InputIt it = first; it != last; ++it) { + iterator exist_it = find(it->first); + if (exist_it == end()) { + operator[](it->first) = it->second; + } + } + } + void insert(std::initializer_list values) { + insert(values.begin(), values.end()); + } + + // Erase and clear + template + size_type erase(const key_arg& key) { + iterator it = find(key); + if (it == end()) { + return 0; + } else { + erase(it); + return 1; + } + } + iterator erase(iterator pos) { + iterator i = pos++; + elements_->erase(i.it_); + return pos; + } + void erase(iterator first, iterator last) { + while (first != last) { + first = erase(first); + } + } + void clear() { elements_->clear(); } + + // Assign + Map& operator=(const Map& other) { + if (this != &other) { + clear(); + insert(other.begin(), other.end()); + } + return *this; + } + + void swap(Map& other) { + if (arena_ == other.arena_) { + std::swap(default_enum_value_, other.default_enum_value_); + 
std::swap(elements_, other.elements_); + } else { + // TODO(zuguang): optimize this. The temporary copy can be allocated + // in the same arena as the other message, and the "other = copy" can + // be replaced with the fast-path swap above. + Map copy = *this; + *this = other; + other = copy; + } + } + + // Access to hasher. Currently this returns a copy, but it may + // be modified to return a const reference in the future. + hasher hash_function() const { return elements_->hash_function(); } + + private: + // Set default enum value only for proto2 map field whose value is enum type. + void SetDefaultEnumValue(int default_enum_value) { + default_enum_value_ = default_enum_value; + } + + Arena* arena_; + int default_enum_value_; + InnerMap* elements_; + + friend class Arena; + using InternalArenaConstructable_ = void; + using DestructorSkippable_ = void; + template + friend class internal::MapFieldLite; +}; + +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_MAP_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_inl.h new file mode 100644 index 0000000000000000000000000000000000000000..bc4a6cc718cd19e17c4d00398147b0846d9bfbe6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_inl.h @@ -0,0 +1,362 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef GOOGLE_PROTOBUF_MAP_FIELD_INL_H__ +#define GOOGLE_PROTOBUF_MAP_FIELD_INL_H__ + +#include + +#include +#include +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { +namespace internal { +// UnwrapMapKey template +template +T UnwrapMapKey(const MapKey& map_key); +template <> +inline int32 UnwrapMapKey(const MapKey& map_key) { + return map_key.GetInt32Value(); +} +template <> +inline uint32 UnwrapMapKey(const MapKey& map_key) { + return map_key.GetUInt32Value(); +} +template <> +inline int64 UnwrapMapKey(const MapKey& map_key) { + return map_key.GetInt64Value(); +} +template <> +inline uint64 UnwrapMapKey(const MapKey& map_key) { + return map_key.GetUInt64Value(); +} +template <> +inline bool UnwrapMapKey(const MapKey& map_key) { + return map_key.GetBoolValue(); +} +template <> +inline std::string UnwrapMapKey(const MapKey& map_key) { + return map_key.GetStringValue(); +} + +// SetMapKey template +template +inline void SetMapKey(MapKey* map_key, const T& value); +template <> +inline void SetMapKey(MapKey* map_key, const int32& value) { + map_key->SetInt32Value(value); +} +template <> +inline void SetMapKey(MapKey* map_key, const uint32& value) { + map_key->SetUInt32Value(value); +} +template <> +inline void SetMapKey(MapKey* map_key, const int64& value) { + map_key->SetInt64Value(value); +} +template <> +inline void SetMapKey(MapKey* map_key, const uint64& value) { + map_key->SetUInt64Value(value); +} +template <> +inline void SetMapKey(MapKey* map_key, const bool& value) { + map_key->SetBoolValue(value); +} +template <> +inline void SetMapKey(MapKey* map_key, const std::string& value) { + map_key->SetStringValue(value); +} + +// ------------------------TypeDefinedMapFieldBase--------------- +template +typename Map::const_iterator& +TypeDefinedMapFieldBase::InternalGetIterator( + const MapIterator* map_iter) const { + return *reinterpret_cast::const_iterator*>( + map_iter->iter_); +} 
+ +template +void TypeDefinedMapFieldBase::MapBegin(MapIterator* map_iter) const { + InternalGetIterator(map_iter) = GetMap().begin(); + SetMapIteratorValue(map_iter); +} + +template +void TypeDefinedMapFieldBase::MapEnd(MapIterator* map_iter) const { + InternalGetIterator(map_iter) = GetMap().end(); +} + +template +bool TypeDefinedMapFieldBase::EqualIterator( + const MapIterator& a, const MapIterator& b) const { + return InternalGetIterator(&a) == InternalGetIterator(&b); +} + +template +void TypeDefinedMapFieldBase::IncreaseIterator( + MapIterator* map_iter) const { + ++InternalGetIterator(map_iter); + SetMapIteratorValue(map_iter); +} + +template +void TypeDefinedMapFieldBase::InitializeIterator( + MapIterator* map_iter) const { + map_iter->iter_ = new typename Map::const_iterator; + GOOGLE_CHECK(map_iter->iter_ != NULL); +} + +template +void TypeDefinedMapFieldBase::DeleteIterator( + MapIterator* map_iter) const { + delete reinterpret_cast::const_iterator*>( + map_iter->iter_); +} + +template +void TypeDefinedMapFieldBase::CopyIterator( + MapIterator* this_iter, const MapIterator& that_iter) const { + InternalGetIterator(this_iter) = InternalGetIterator(&that_iter); + this_iter->key_.SetType(that_iter.key_.type()); + // MapValueRef::type() fails when containing data is null. However, if + // this_iter points to MapEnd, data can be null. 
+ this_iter->value_.SetType( + static_cast(that_iter.value_.type_)); + SetMapIteratorValue(this_iter); +} + +// ---------------------------------------------------------------------- + +template +int MapField::size() const { + MapFieldBase::SyncMapWithRepeatedField(); + return static_cast(impl_.GetMap().size()); +} + +template +void MapField::Clear() { + if (this->MapFieldBase::repeated_field_ != nullptr) { + RepeatedPtrField* repeated_field = + reinterpret_cast*>( + this->MapFieldBase::repeated_field_); + repeated_field->Clear(); + } + + impl_.MutableMap()->clear(); + // Data in map and repeated field are both empty, but we can't set status + // CLEAN. Because clear is a generated API, we cannot invalidate previous + // reference to map. + MapFieldBase::SetMapDirty(); +} + +template +void MapField::SetMapIteratorValue(MapIterator* map_iter) + const { + const Map& map = impl_.GetMap(); + typename Map::const_iterator iter = + TypeDefinedMapFieldBase::InternalGetIterator(map_iter); + if (iter == map.end()) return; + SetMapKey(&map_iter->key_, iter->first); + map_iter->value_.SetValue(&iter->second); +} + +template +bool MapField::ContainsMapKey(const MapKey& map_key) const { + const Map& map = impl_.GetMap(); + const Key& key = UnwrapMapKey(map_key); + typename Map::const_iterator iter = map.find(key); + return iter != map.end(); +} + +template +bool MapField::InsertOrLookupMapValue(const MapKey& map_key, + MapValueRef* val) { + // Always use mutable map because users may change the map value by + // MapValueRef. + Map* map = MutableMap(); + const Key& key = UnwrapMapKey(map_key); + typename Map::iterator iter = map->find(key); + if (map->end() == iter) { + val->SetValue(&((*map)[key])); + return true; + } + // Key is already in the map. Make sure (*map)[key] is not called. + // [] may reorder the map and iterators. 
+ val->SetValue(&(iter->second)); + return false; +} + +template +bool MapField::DeleteMapValue(const MapKey& map_key) { + const Key& key = UnwrapMapKey(map_key); + return MutableMap()->erase(key); +} + +template +void MapField::MergeFrom(const MapFieldBase& other) { + MapFieldBase::SyncMapWithRepeatedField(); + const MapField& other_field = static_cast(other); + other_field.SyncMapWithRepeatedField(); + impl_.MergeFrom(other_field.impl_); + MapFieldBase::SetMapDirty(); +} + +template +void MapField::Swap(MapFieldBase* other) { + MapField* other_field = down_cast(other); + std::swap(this->MapFieldBase::repeated_field_, other_field->repeated_field_); + impl_.Swap(&other_field->impl_); + // a relaxed swap of the atomic + auto other_state = other_field->state_.load(std::memory_order_relaxed); + auto this_state = this->MapFieldBase::state_.load(std::memory_order_relaxed); + other_field->state_.store(this_state, std::memory_order_relaxed); + this->MapFieldBase::state_.store(other_state, std::memory_order_relaxed); +} + +template +void MapField::SyncRepeatedFieldWithMapNoLock() const { + if (this->MapFieldBase::repeated_field_ == NULL) { + if (this->MapFieldBase::arena_ == NULL) { + this->MapFieldBase::repeated_field_ = new RepeatedPtrField(); + } else { + this->MapFieldBase::repeated_field_ = + Arena::CreateMessage >( + this->MapFieldBase::arena_); + } + } + const Map& map = impl_.GetMap(); + RepeatedPtrField* repeated_field = + reinterpret_cast*>( + this->MapFieldBase::repeated_field_); + + repeated_field->Clear(); + + // The only way we can get at this point is through reflection and the + // only way we can get the reflection object is by having called GetReflection + // on the encompassing field. So that type must have existed and hence we + // know that this MapEntry default_type has also already been constructed. + // So it's safe to just call internal_default_instance(). 
+ const Message* default_entry = Derived::internal_default_instance(); + for (typename Map::const_iterator it = map.begin(); it != map.end(); + ++it) { + EntryType* new_entry = + down_cast(default_entry->New(this->MapFieldBase::arena_)); + repeated_field->AddAllocated(new_entry); + (*new_entry->mutable_key()) = it->first; + (*new_entry->mutable_value()) = it->second; + } +} + +template +void MapField::SyncMapWithRepeatedFieldNoLock() const { + Map* map = const_cast(this)->impl_.MutableMap(); + RepeatedPtrField* repeated_field = + reinterpret_cast*>( + this->MapFieldBase::repeated_field_); + GOOGLE_CHECK(this->MapFieldBase::repeated_field_ != NULL); + map->clear(); + for (typename RepeatedPtrField::iterator it = + repeated_field->begin(); + it != repeated_field->end(); ++it) { + // Cast is needed because Map's api and internal storage is different when + // value is enum. For enum, we cannot cast an int to enum. Thus, we have to + // copy value. For other types, they have same exposed api type and internal + // stored type. We should not introduce value copy for them. We achieve this + // by casting to value for enum while casting to reference for other types. + (*map)[it->key()] = static_cast(it->value()); + } +} + +template +size_t MapField::SpaceUsedExcludingSelfNoLock() const { + size_t size = 0; + if (this->MapFieldBase::repeated_field_ != NULL) { + size += this->MapFieldBase::repeated_field_->SpaceUsedExcludingSelfLong(); + } + Map* map = const_cast(this)->impl_.MutableMap(); + size += sizeof(*map); + for (typename Map::iterator it = map->begin(); it != map->end(); + ++it) { + size += KeyTypeHandler::SpaceUsedInMapLong(it->first); + size += ValueTypeHandler::SpaceUsedInMapLong(it->second); + } + return size; +} +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_MAP_FIELD_INL_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_lite.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_lite.h new file mode 100644 index 0000000000000000000000000000000000000000..a8e04ca67aa1cfbe4f980ad5292140dcd04e022a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_lite.h @@ -0,0 +1,195 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_MAP_FIELD_LITE_H__ +#define GOOGLE_PROTOBUF_MAP_FIELD_LITE_H__ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { +namespace internal { + +// This class provides access to map field using generated api. It is used for +// internal generated message implentation only. Users should never use this +// directly. +template +class MapFieldLite { + // Define message type for internal repeated field. + typedef Derived EntryType; + + public: + typedef Map MapType; + typedef EntryType EntryTypeTrait; + + MapFieldLite() { SetDefaultEnumValue(); } + + explicit MapFieldLite(Arena* arena) : map_(arena) { SetDefaultEnumValue(); } + + // Accessors + const Map& GetMap() const { return map_; } + Map* MutableMap() { return &map_; } + + // Convenient methods for generated message implementation. + int size() const { return static_cast(map_.size()); } + void Clear() { return map_.clear(); } + void MergeFrom(const MapFieldLite& other) { + for (typename Map::const_iterator it = other.map_.begin(); + it != other.map_.end(); ++it) { + map_[it->first] = it->second; + } + } + void Swap(MapFieldLite* other) { map_.swap(other->map_); } + + // Set default enum value only for proto2 map field whose value is enum type. 
+ void SetDefaultEnumValue() { + MutableMap()->SetDefaultEnumValue(default_enum_value); + } + + // Used in the implementation of parsing. Caller should take the ownership iff + // arena_ is NULL. + EntryType* NewEntry() const { + return Arena::CreateMessage(map_.arena_); + } + // Used in the implementation of serializing enum value type. Caller should + // take the ownership iff arena_ is NULL. + EntryType* NewEnumEntryWrapper(const Key& key, const T t) const { + return EntryType::EnumWrap(key, t, map_.arena_); + } + // Used in the implementation of serializing other value types. Caller should + // take the ownership iff arena_ is NULL. + EntryType* NewEntryWrapper(const Key& key, const T& t) const { + return EntryType::Wrap(key, t, map_.arena_); + } + + const char* _InternalParse(const char* ptr, ParseContext* ctx) { + typename Derived::template Parser> parser(this); + return parser._InternalParse(ptr, ctx); + } + + template + const char* ParseWithEnumValidation(const char* ptr, ParseContext* ctx, + bool (*is_valid)(int), uint32 field_num, + InternalMetadata* metadata) { + typename Derived::template Parser> parser(this); + return parser.template ParseWithEnumValidation( + ptr, ctx, is_valid, field_num, metadata); + } + + private: + typedef void DestructorSkippable_; + + Map map_; + + friend class ::PROTOBUF_NAMESPACE_ID::Arena; +}; + +template +struct EnumParseWrapper { + const char* _InternalParse(const char* ptr, ParseContext* ctx) { + return map_field->template ParseWithEnumValidation( + ptr, ctx, is_valid, field_num, metadata); + } + T* map_field; + bool (*is_valid)(int); + uint32 field_num; + InternalMetadata* metadata; +}; + +// Helper function because the typenames of maps are horrendous to print. 
This +// leverages compiler type deduction, to keep all type data out of the +// generated code +template +EnumParseWrapper InitEnumParseWrapper( + T* map_field, bool (*is_valid)(int), uint32 field_num, + InternalMetadata* metadata) { + return EnumParseWrapper{map_field, is_valid, field_num, + metadata}; +} + +// True if IsInitialized() is true for value field in all elements of t. T is +// expected to be message. It's useful to have this helper here to keep the +// protobuf compiler from ever having to emit loops in IsInitialized() methods. +// We want the C++ compiler to inline this or not as it sees fit. +template +bool AllAreInitialized( + const MapFieldLite& field) { + const auto& t = field.GetMap(); + for (typename Map::const_iterator it = t.begin(); it != t.end(); + ++it) { + if (!it->second.IsInitialized()) return false; + } + return true; +} + +template +struct MapEntryToMapField : MapEntryToMapField {}; + +template +struct MapEntryToMapField> { + typedef MapFieldLite, + Key, Value, kKeyFieldType, kValueFieldType, + default_enum_value> + MapFieldType; +}; + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_MAP_FIELD_LITE_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_type_handler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_type_handler.h new file mode 100644 index 0000000000000000000000000000000000000000..d0169bef30b5f7a3a41bfe301750b8ca3aa93a08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_type_handler.h @@ -0,0 +1,812 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_TYPE_HANDLER_H__ +#define GOOGLE_PROTOBUF_TYPE_HANDLER_H__ + +#include +#include +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { +namespace internal { + +// Used for compile time type selection. MapIf::type will be TrueType if Flag is +// true and FalseType otherwise. +template +struct MapIf; + +template +struct MapIf { + typedef TrueType type; +}; + +template +struct MapIf { + typedef FalseType type; +}; + +// In proto2 Map, enum needs to be initialized to given default value, while +// other types' default value can be inferred from the type. +template +class MapValueInitializer { + public: + static inline void Initialize(Type& type, int default_enum_value); +}; + +template +class MapValueInitializer { + public: + static inline void Initialize(Type& value, int default_enum_value) { + value = static_cast(default_enum_value); + } +}; + +template +class MapValueInitializer { + public: + static inline void Initialize(Type& /* value */, + int /* default_enum_value */) {} +}; + +template +class MapArenaMessageCreator { + public: + // Use arena to create message if Type is arena constructable. Otherwise, + // create the message on heap. 
+ static inline Type* CreateMessage(Arena* arena); +}; +template +class MapArenaMessageCreator { + public: + static inline Type* CreateMessage(Arena* arena) { + return Arena::CreateMessage(arena); + } +}; +template +class MapArenaMessageCreator { + public: + static inline Type* CreateMessage(Arena* arena) { + return Arena::Create(arena); + } +}; + +// Define constants for given wire field type +template +class MapWireFieldTypeTraits {}; + +#define TYPE_TRAITS(FieldType, CType, WireFormatType, IsMessage, IsEnum) \ + template \ + class MapWireFieldTypeTraits { \ + public: \ + static const bool kIsMessage = IsMessage; \ + static const bool kIsEnum = IsEnum; \ + typedef typename MapIf::type TypeOnMemory; \ + typedef typename MapIf::type MapEntryAccessorType; \ + static const WireFormatLite::WireType kWireType = \ + WireFormatLite::WIRETYPE_##WireFormatType; \ + }; + +TYPE_TRAITS(MESSAGE, Type, LENGTH_DELIMITED, true, false) +TYPE_TRAITS(STRING, ArenaStringPtr, LENGTH_DELIMITED, false, false) +TYPE_TRAITS(BYTES, ArenaStringPtr, LENGTH_DELIMITED, false, false) +TYPE_TRAITS(INT64, int64, VARINT, false, false) +TYPE_TRAITS(UINT64, uint64, VARINT, false, false) +TYPE_TRAITS(INT32, int32, VARINT, false, false) +TYPE_TRAITS(UINT32, uint32, VARINT, false, false) +TYPE_TRAITS(SINT64, int64, VARINT, false, false) +TYPE_TRAITS(SINT32, int32, VARINT, false, false) +TYPE_TRAITS(ENUM, int, VARINT, false, true) +TYPE_TRAITS(DOUBLE, double, FIXED64, false, false) +TYPE_TRAITS(FLOAT, float, FIXED32, false, false) +TYPE_TRAITS(FIXED64, uint64, FIXED64, false, false) +TYPE_TRAITS(FIXED32, uint32, FIXED32, false, false) +TYPE_TRAITS(SFIXED64, int64, FIXED64, false, false) +TYPE_TRAITS(SFIXED32, int32, FIXED32, false, false) +TYPE_TRAITS(BOOL, bool, VARINT, false, false) + +#undef TYPE_TRAITS + +template +class MapTypeHandler {}; + +template +class MapTypeHandler { + public: + // Enum type cannot be used for MapTypeHandler::Read. Define a type which will + // replace Enum with int. 
+ typedef typename MapWireFieldTypeTraits::MapEntryAccessorType + MapEntryAccessorType; + // Internal stored type in MapEntryLite for given wire field type. + typedef typename MapWireFieldTypeTraits::TypeOnMemory TypeOnMemory; + // Corresponding wire type for field type. + static constexpr WireFormatLite::WireType kWireType = + MapWireFieldTypeTraits::kWireType; + // Whether wire type is for message. + static constexpr bool kIsMessage = + MapWireFieldTypeTraits::kIsMessage; + // Whether wire type is for enum. + static constexpr bool kIsEnum = + MapWireFieldTypeTraits::kIsEnum; + + // Functions used in parsing and serialization. =================== + static inline size_t ByteSize(const MapEntryAccessorType& value); + static inline int GetCachedSize(const MapEntryAccessorType& value); + static inline bool Read(io::CodedInputStream* input, + MapEntryAccessorType* value); + static inline const char* Read(const char* ptr, ParseContext* ctx, + MapEntryAccessorType* value); + + static inline uint8* Write(int field, const MapEntryAccessorType& value, + uint8* ptr, io::EpsCopyOutputStream* stream); + + // Functions to manipulate data on memory. ======================== + static inline const Type& GetExternalReference(const Type* value); + static inline void DeleteNoArena(const Type* x); + static inline void Merge(const Type& from, Type** to, Arena* arena); + static inline void Clear(Type** value, Arena* arena); + static inline void ClearMaybeByDefaultEnum(Type** value, Arena* arena, + int default_enum_value); + static inline void Initialize(Type** x, Arena* arena); + + static inline void InitializeMaybeByDefaultEnum(Type** x, + int default_enum_value, + Arena* arena); + static inline Type* EnsureMutable(Type** value, Arena* arena); + // SpaceUsedInMapEntry: Return bytes used by value in MapEntry, excluding + // those already calculate in sizeof(MapField). + static inline size_t SpaceUsedInMapEntryLong(const Type* value); + // Return bytes used by value in Map. 
+ static inline size_t SpaceUsedInMapLong(const Type& value); + // Assign default value to given instance. + static inline void AssignDefaultValue(Type** value); + // Return default instance if value is not initialized when calling const + // reference accessor. + static inline const Type& DefaultIfNotInitialized(const Type* value, + const Type* default_value); + // Check if all required fields have values set. + static inline bool IsInitialized(Type* value); +}; + +#define MAP_HANDLER(FieldType) \ + template \ + class MapTypeHandler { \ + public: \ + typedef typename MapWireFieldTypeTraits::MapEntryAccessorType \ + MapEntryAccessorType; \ + typedef typename MapWireFieldTypeTraits::TypeOnMemory TypeOnMemory; \ + static const WireFormatLite::WireType kWireType = \ + MapWireFieldTypeTraits::kWireType; \ + static const bool kIsMessage = \ + MapWireFieldTypeTraits::kIsMessage; \ + static const bool kIsEnum = \ + MapWireFieldTypeTraits::kIsEnum; \ + static inline int ByteSize(const MapEntryAccessorType& value); \ + static inline int GetCachedSize(const MapEntryAccessorType& value); \ + static inline bool Read(io::CodedInputStream* input, \ + MapEntryAccessorType* value); \ + static inline const char* Read(const char* begin, ParseContext* ctx, \ + MapEntryAccessorType* value); \ + static inline uint8* Write(int field, const MapEntryAccessorType& value, \ + uint8* ptr, io::EpsCopyOutputStream* stream); \ + static inline const MapEntryAccessorType& GetExternalReference( \ + const TypeOnMemory& value); \ + static inline void DeleteNoArena(const TypeOnMemory& x); \ + static inline void Merge(const MapEntryAccessorType& from, \ + TypeOnMemory* to, Arena* arena); \ + static inline void Clear(TypeOnMemory* value, Arena* arena); \ + static inline void ClearMaybeByDefaultEnum(TypeOnMemory* value, \ + Arena* arena, \ + int default_enum); \ + static inline size_t SpaceUsedInMapEntryLong(const TypeOnMemory& value); \ + static inline size_t SpaceUsedInMapLong(const TypeOnMemory& 
value); \ + static inline size_t SpaceUsedInMapLong(ConstStringParam value); \ + static inline void AssignDefaultValue(TypeOnMemory* value); \ + static inline const MapEntryAccessorType& DefaultIfNotInitialized( \ + const TypeOnMemory& value, const TypeOnMemory& default_value); \ + static inline bool IsInitialized(const TypeOnMemory& value); \ + static void DeleteNoArena(TypeOnMemory& value); \ + static inline void Initialize(TypeOnMemory* value, Arena* arena); \ + static inline void InitializeMaybeByDefaultEnum(TypeOnMemory* value, \ + int default_enum_value, \ + Arena* arena); \ + static inline MapEntryAccessorType* EnsureMutable(TypeOnMemory* value, \ + Arena* arena); \ + }; +MAP_HANDLER(STRING) +MAP_HANDLER(BYTES) +MAP_HANDLER(INT64) +MAP_HANDLER(UINT64) +MAP_HANDLER(INT32) +MAP_HANDLER(UINT32) +MAP_HANDLER(SINT64) +MAP_HANDLER(SINT32) +MAP_HANDLER(ENUM) +MAP_HANDLER(DOUBLE) +MAP_HANDLER(FLOAT) +MAP_HANDLER(FIXED64) +MAP_HANDLER(FIXED32) +MAP_HANDLER(SFIXED64) +MAP_HANDLER(SFIXED32) +MAP_HANDLER(BOOL) +#undef MAP_HANDLER + +template +inline size_t MapTypeHandler::ByteSize( + const MapEntryAccessorType& value) { + return WireFormatLite::MessageSizeNoVirtual(value); +} + +#define GOOGLE_PROTOBUF_BYTE_SIZE(FieldType, DeclaredType) \ + template \ + inline int MapTypeHandler::ByteSize( \ + const MapEntryAccessorType& value) { \ + return static_cast(WireFormatLite::DeclaredType##Size(value)); \ + } + +GOOGLE_PROTOBUF_BYTE_SIZE(STRING, String) +GOOGLE_PROTOBUF_BYTE_SIZE(BYTES, Bytes) +GOOGLE_PROTOBUF_BYTE_SIZE(INT64, Int64) +GOOGLE_PROTOBUF_BYTE_SIZE(UINT64, UInt64) +GOOGLE_PROTOBUF_BYTE_SIZE(INT32, Int32) +GOOGLE_PROTOBUF_BYTE_SIZE(UINT32, UInt32) +GOOGLE_PROTOBUF_BYTE_SIZE(SINT64, SInt64) +GOOGLE_PROTOBUF_BYTE_SIZE(SINT32, SInt32) +GOOGLE_PROTOBUF_BYTE_SIZE(ENUM, Enum) + +#undef GOOGLE_PROTOBUF_BYTE_SIZE + +#define FIXED_BYTE_SIZE(FieldType, DeclaredType) \ + template \ + inline int MapTypeHandler::ByteSize( \ + const MapEntryAccessorType& /* value */) { \ + return 
WireFormatLite::k##DeclaredType##Size; \ + } + +FIXED_BYTE_SIZE(DOUBLE, Double) +FIXED_BYTE_SIZE(FLOAT, Float) +FIXED_BYTE_SIZE(FIXED64, Fixed64) +FIXED_BYTE_SIZE(FIXED32, Fixed32) +FIXED_BYTE_SIZE(SFIXED64, SFixed64) +FIXED_BYTE_SIZE(SFIXED32, SFixed32) +FIXED_BYTE_SIZE(BOOL, Bool) + +#undef FIXED_BYTE_SIZE + +template +inline int MapTypeHandler::GetCachedSize( + const MapEntryAccessorType& value) { + return static_cast(WireFormatLite::LengthDelimitedSize( + static_cast(value.GetCachedSize()))); +} + +#define GET_CACHED_SIZE(FieldType, DeclaredType) \ + template \ + inline int \ + MapTypeHandler::GetCachedSize( \ + const MapEntryAccessorType& value) { \ + return static_cast(WireFormatLite::DeclaredType##Size(value)); \ + } + +GET_CACHED_SIZE(STRING, String) +GET_CACHED_SIZE(BYTES, Bytes) +GET_CACHED_SIZE(INT64, Int64) +GET_CACHED_SIZE(UINT64, UInt64) +GET_CACHED_SIZE(INT32, Int32) +GET_CACHED_SIZE(UINT32, UInt32) +GET_CACHED_SIZE(SINT64, SInt64) +GET_CACHED_SIZE(SINT32, SInt32) +GET_CACHED_SIZE(ENUM, Enum) + +#undef GET_CACHED_SIZE + +#define GET_FIXED_CACHED_SIZE(FieldType, DeclaredType) \ + template \ + inline int \ + MapTypeHandler::GetCachedSize( \ + const MapEntryAccessorType& /* value */) { \ + return WireFormatLite::k##DeclaredType##Size; \ + } + +GET_FIXED_CACHED_SIZE(DOUBLE, Double) +GET_FIXED_CACHED_SIZE(FLOAT, Float) +GET_FIXED_CACHED_SIZE(FIXED64, Fixed64) +GET_FIXED_CACHED_SIZE(FIXED32, Fixed32) +GET_FIXED_CACHED_SIZE(SFIXED64, SFixed64) +GET_FIXED_CACHED_SIZE(SFIXED32, SFixed32) +GET_FIXED_CACHED_SIZE(BOOL, Bool) + +#undef GET_FIXED_CACHED_SIZE + +template +inline uint8* MapTypeHandler::Write( + int field, const MapEntryAccessorType& value, uint8* ptr, + io::EpsCopyOutputStream* stream) { + ptr = stream->EnsureSpace(ptr); + return WireFormatLite::InternalWriteMessage(field, value, ptr, stream); +} + +#define WRITE_METHOD(FieldType, DeclaredType) \ + template \ + inline uint8* MapTypeHandler::Write( \ + int field, const MapEntryAccessorType& value, 
uint8* ptr, \ + io::EpsCopyOutputStream* stream) { \ + ptr = stream->EnsureSpace(ptr); \ + return stream->Write##DeclaredType(field, value, ptr); \ + } + +WRITE_METHOD(STRING, String) +WRITE_METHOD(BYTES, Bytes) + +#undef WRITE_METHOD +#define WRITE_METHOD(FieldType, DeclaredType) \ + template \ + inline uint8* MapTypeHandler::Write( \ + int field, const MapEntryAccessorType& value, uint8* ptr, \ + io::EpsCopyOutputStream* stream) { \ + ptr = stream->EnsureSpace(ptr); \ + return WireFormatLite::Write##DeclaredType##ToArray(field, value, ptr); \ + } + +WRITE_METHOD(INT64, Int64) +WRITE_METHOD(UINT64, UInt64) +WRITE_METHOD(INT32, Int32) +WRITE_METHOD(UINT32, UInt32) +WRITE_METHOD(SINT64, SInt64) +WRITE_METHOD(SINT32, SInt32) +WRITE_METHOD(ENUM, Enum) +WRITE_METHOD(DOUBLE, Double) +WRITE_METHOD(FLOAT, Float) +WRITE_METHOD(FIXED64, Fixed64) +WRITE_METHOD(FIXED32, Fixed32) +WRITE_METHOD(SFIXED64, SFixed64) +WRITE_METHOD(SFIXED32, SFixed32) +WRITE_METHOD(BOOL, Bool) + +#undef WRITE_METHOD + +template +inline bool MapTypeHandler::Read( + io::CodedInputStream* input, MapEntryAccessorType* value) { + return WireFormatLite::ReadMessageNoVirtual(input, value); +} + +template +inline bool MapTypeHandler::Read( + io::CodedInputStream* input, MapEntryAccessorType* value) { + return WireFormatLite::ReadString(input, value); +} + +template +inline bool MapTypeHandler::Read( + io::CodedInputStream* input, MapEntryAccessorType* value) { + return WireFormatLite::ReadBytes(input, value); +} + +template +const char* MapTypeHandler::Read( + const char* ptr, ParseContext* ctx, MapEntryAccessorType* value) { + return ctx->ParseMessage(value, ptr); +} + +template +const char* MapTypeHandler::Read( + const char* ptr, ParseContext* ctx, MapEntryAccessorType* value) { + int size = ReadSize(&ptr); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + return ctx->ReadString(ptr, size, value); +} + +template +const char* MapTypeHandler::Read( + const char* ptr, ParseContext* ctx, MapEntryAccessorType* value) 
{ + int size = ReadSize(&ptr); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + return ctx->ReadString(ptr, size, value); +} + +inline const char* ReadINT64(const char* ptr, int64* value) { + return VarintParse(ptr, reinterpret_cast(value)); +} +inline const char* ReadUINT64(const char* ptr, uint64* value) { + return VarintParse(ptr, value); +} +inline const char* ReadINT32(const char* ptr, int32* value) { + return VarintParse(ptr, reinterpret_cast(value)); +} +inline const char* ReadUINT32(const char* ptr, uint32* value) { + return VarintParse(ptr, value); +} +inline const char* ReadSINT64(const char* ptr, int64* value) { + *value = ReadVarintZigZag64(&ptr); + return ptr; +} +inline const char* ReadSINT32(const char* ptr, int32* value) { + *value = ReadVarintZigZag32(&ptr); + return ptr; +} +template +inline const char* ReadENUM(const char* ptr, E* value) { + *value = static_cast(ReadVarint32(&ptr)); + return ptr; +} +inline const char* ReadBOOL(const char* ptr, bool* value) { + *value = static_cast(ReadVarint32(&ptr)); + return ptr; +} + +template +inline const char* ReadUnaligned(const char* ptr, F* value) { + *value = UnalignedLoad(ptr); + return ptr + sizeof(F); +} +inline const char* ReadFLOAT(const char* ptr, float* value) { + return ReadUnaligned(ptr, value); +} +inline const char* ReadDOUBLE(const char* ptr, double* value) { + return ReadUnaligned(ptr, value); +} +inline const char* ReadFIXED64(const char* ptr, uint64* value) { + return ReadUnaligned(ptr, value); +} +inline const char* ReadFIXED32(const char* ptr, uint32* value) { + return ReadUnaligned(ptr, value); +} +inline const char* ReadSFIXED64(const char* ptr, int64* value) { + return ReadUnaligned(ptr, value); +} +inline const char* ReadSFIXED32(const char* ptr, int32* value) { + return ReadUnaligned(ptr, value); +} + +#define READ_METHOD(FieldType) \ + template \ + inline bool MapTypeHandler::Read( \ + io::CodedInputStream* input, MapEntryAccessorType* value) { \ + return WireFormatLite::ReadPrimitive( \ 
+ input, value); \ + } \ + template \ + const char* MapTypeHandler::Read( \ + const char* begin, ParseContext* ctx, MapEntryAccessorType* value) { \ + (void)ctx; \ + return Read##FieldType(begin, value); \ + } + +READ_METHOD(INT64) +READ_METHOD(UINT64) +READ_METHOD(INT32) +READ_METHOD(UINT32) +READ_METHOD(SINT64) +READ_METHOD(SINT32) +READ_METHOD(ENUM) +READ_METHOD(DOUBLE) +READ_METHOD(FLOAT) +READ_METHOD(FIXED64) +READ_METHOD(FIXED32) +READ_METHOD(SFIXED64) +READ_METHOD(SFIXED32) +READ_METHOD(BOOL) + +#undef READ_METHOD + +// Definition for message handler + +template +inline const Type& +MapTypeHandler::GetExternalReference( + const Type* value) { + return *value; +} + +template +inline size_t MapTypeHandler::SpaceUsedInMapEntryLong(const Type* value) { + return value->SpaceUsedLong(); +} + +template +size_t MapTypeHandler::SpaceUsedInMapLong( + const Type& value) { + return value.SpaceUsedLong(); +} + +template +inline void MapTypeHandler::Clear( + Type** value, Arena* /* arena */) { + if (*value != NULL) (*value)->Clear(); +} +template +inline void +MapTypeHandler::ClearMaybeByDefaultEnum( + Type** value, Arena* /* arena */, int /* default_enum_value */) { + if (*value != NULL) (*value)->Clear(); +} +template +inline void MapTypeHandler::Merge( + const Type& from, Type** to, Arena* /* arena */) { + (*to)->MergeFrom(from); +} + +template +void MapTypeHandler::DeleteNoArena( + const Type* ptr) { + delete ptr; +} + +template +inline void MapTypeHandler::AssignDefaultValue(Type** value) { + *value = const_cast(Type::internal_default_instance()); +} + +template +inline void MapTypeHandler::Initialize( + Type** x, Arena* /* arena */) { + *x = NULL; +} + +template +inline void MapTypeHandler:: + InitializeMaybeByDefaultEnum(Type** x, int /* default_enum_value */, + Arena* /* arena */) { + *x = NULL; +} + +template +inline Type* MapTypeHandler::EnsureMutable( + Type** value, Arena* arena) { + if (*value == NULL) { + *value = MapArenaMessageCreator< + Type, + 
Arena::is_arena_constructable::type::value>::CreateMessage(arena); + } + return *value; +} + +template +inline const Type& +MapTypeHandler::DefaultIfNotInitialized( + const Type* value, const Type* default_value) { + return value != NULL ? *value : *default_value; +} + +template +inline bool MapTypeHandler::IsInitialized( + Type* value) { + return value ? value->IsInitialized() : false; +} + +// Definition for string/bytes handler + +#define STRING_OR_BYTES_HANDLER_FUNCTIONS(FieldType) \ + template \ + inline const typename MapTypeHandler::MapEntryAccessorType& \ + MapTypeHandler::GetExternalReference(const TypeOnMemory& value) { \ + return value.Get(); \ + } \ + template \ + inline size_t \ + MapTypeHandler::SpaceUsedInMapEntryLong(const TypeOnMemory& value) { \ + return sizeof(value); \ + } \ + template \ + inline size_t \ + MapTypeHandler::SpaceUsedInMapLong( \ + const TypeOnMemory& value) { \ + return sizeof(value); \ + } \ + template \ + inline size_t \ + MapTypeHandler::SpaceUsedInMapLong( \ + ConstStringParam value) { \ + return sizeof(std::string); \ + } \ + template \ + inline void MapTypeHandler::Clear( \ + TypeOnMemory* value, Arena* arena) { \ + value->ClearToEmpty(&internal::GetEmptyStringAlreadyInited(), arena); \ + } \ + template \ + inline void MapTypeHandler:: \ + ClearMaybeByDefaultEnum(TypeOnMemory* value, Arena* arena, \ + int /* default_enum */) { \ + Clear(value, arena); \ + } \ + template \ + inline void MapTypeHandler::Merge( \ + const MapEntryAccessorType& from, TypeOnMemory* to, Arena* arena) { \ + to->Set(&internal::GetEmptyStringAlreadyInited(), from, arena); \ + } \ + template \ + void MapTypeHandler::DeleteNoArena( \ + TypeOnMemory& value) { \ + value.DestroyNoArena(&internal::GetEmptyStringAlreadyInited()); \ + } \ + template \ + inline void \ + MapTypeHandler::AssignDefaultValue( \ + TypeOnMemory* /* value */) {} \ + template \ + inline void \ + MapTypeHandler::Initialize( \ + TypeOnMemory* value, Arena* /* arena */) { \ + 
value->UnsafeSetDefault(&internal::GetEmptyStringAlreadyInited()); \ + } \ + template \ + inline void MapTypeHandler:: \ + InitializeMaybeByDefaultEnum( \ + TypeOnMemory* value, int /* default_enum_value */, Arena* arena) { \ + Initialize(value, arena); \ + } \ + template \ + inline typename MapTypeHandler::MapEntryAccessorType* \ + MapTypeHandler::EnsureMutable( \ + TypeOnMemory* value, Arena* arena) { \ + return value->Mutable(&internal::GetEmptyStringAlreadyInited(), arena); \ + } \ + template \ + inline const typename MapTypeHandler::MapEntryAccessorType& \ + MapTypeHandler:: \ + DefaultIfNotInitialized(const TypeOnMemory& value, \ + const TypeOnMemory& /* default_value */) { \ + return value.Get(); \ + } \ + template \ + inline bool \ + MapTypeHandler::IsInitialized( \ + const TypeOnMemory& /* value */) { \ + return true; \ + } +STRING_OR_BYTES_HANDLER_FUNCTIONS(STRING) +STRING_OR_BYTES_HANDLER_FUNCTIONS(BYTES) +#undef STRING_OR_BYTES_HANDLER_FUNCTIONS + +#define PRIMITIVE_HANDLER_FUNCTIONS(FieldType) \ + template \ + inline const typename MapTypeHandler::MapEntryAccessorType& \ + MapTypeHandler::GetExternalReference(const TypeOnMemory& value) { \ + return value; \ + } \ + template \ + inline size_t MapTypeHandler:: \ + SpaceUsedInMapEntryLong(const TypeOnMemory& /* value */) { \ + return 0; \ + } \ + template \ + inline size_t \ + MapTypeHandler::SpaceUsedInMapLong( \ + const TypeOnMemory& /* value */) { \ + return sizeof(Type); \ + } \ + template \ + inline void MapTypeHandler::Clear( \ + TypeOnMemory* value, Arena* /* arena */) { \ + *value = 0; \ + } \ + template \ + inline void MapTypeHandler:: \ + ClearMaybeByDefaultEnum(TypeOnMemory* value, Arena* /* arena */, \ + int default_enum_value) { \ + *value = static_cast(default_enum_value); \ + } \ + template \ + inline void MapTypeHandler::Merge( \ + const MapEntryAccessorType& from, TypeOnMemory* to, \ + Arena* /* arena */) { \ + *to = from; \ + } \ + template \ + inline void 
MapTypeHandler::DeleteNoArena(TypeOnMemory& /* x */) {} \ + template \ + inline void \ + MapTypeHandler::AssignDefaultValue( \ + TypeOnMemory* /* value */) {} \ + template \ + inline void \ + MapTypeHandler::Initialize( \ + TypeOnMemory* value, Arena* /* arena */) { \ + *value = 0; \ + } \ + template \ + inline void MapTypeHandler:: \ + InitializeMaybeByDefaultEnum( \ + TypeOnMemory* value, int default_enum_value, Arena* /* arena */) { \ + *value = static_cast(default_enum_value); \ + } \ + template \ + inline typename MapTypeHandler::MapEntryAccessorType* \ + MapTypeHandler::EnsureMutable( \ + TypeOnMemory* value, Arena* /* arena */) { \ + return value; \ + } \ + template \ + inline const typename MapTypeHandler::MapEntryAccessorType& \ + MapTypeHandler:: \ + DefaultIfNotInitialized(const TypeOnMemory& value, \ + const TypeOnMemory& /* default_value */) { \ + return value; \ + } \ + template \ + inline bool \ + MapTypeHandler::IsInitialized( \ + const TypeOnMemory& /* value */) { \ + return true; \ + } +PRIMITIVE_HANDLER_FUNCTIONS(INT64) +PRIMITIVE_HANDLER_FUNCTIONS(UINT64) +PRIMITIVE_HANDLER_FUNCTIONS(INT32) +PRIMITIVE_HANDLER_FUNCTIONS(UINT32) +PRIMITIVE_HANDLER_FUNCTIONS(SINT64) +PRIMITIVE_HANDLER_FUNCTIONS(SINT32) +PRIMITIVE_HANDLER_FUNCTIONS(ENUM) +PRIMITIVE_HANDLER_FUNCTIONS(DOUBLE) +PRIMITIVE_HANDLER_FUNCTIONS(FLOAT) +PRIMITIVE_HANDLER_FUNCTIONS(FIXED64) +PRIMITIVE_HANDLER_FUNCTIONS(FIXED32) +PRIMITIVE_HANDLER_FUNCTIONS(SFIXED64) +PRIMITIVE_HANDLER_FUNCTIONS(SFIXED32) +PRIMITIVE_HANDLER_FUNCTIONS(BOOL) +#undef PRIMITIVE_HANDLER_FUNCTIONS + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_TYPE_HANDLER_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/message.h new file mode 100644 index 0000000000000000000000000000000000000000..89761c62ed239aaff1103627725864c56afea253 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/message.h @@ -0,0 +1,1344 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// Defines Message, the abstract interface implemented by non-lite +// protocol message objects. Although it's possible to implement this +// interface manually, most users will use the protocol compiler to +// generate implementations. +// +// Example usage: +// +// Say you have a message defined as: +// +// message Foo { +// optional string text = 1; +// repeated int32 numbers = 2; +// } +// +// Then, if you used the protocol compiler to generate a class from the above +// definition, you could use it like so: +// +// std::string data; // Will store a serialized version of the message. +// +// { +// // Create a message and serialize it. +// Foo foo; +// foo.set_text("Hello World!"); +// foo.add_numbers(1); +// foo.add_numbers(5); +// foo.add_numbers(42); +// +// foo.SerializeToString(&data); +// } +// +// { +// // Parse the serialized message and check that it contains the +// // correct data. +// Foo foo; +// foo.ParseFromString(data); +// +// assert(foo.text() == "Hello World!"); +// assert(foo.numbers_size() == 3); +// assert(foo.numbers(0) == 1); +// assert(foo.numbers(1) == 5); +// assert(foo.numbers(2) == 42); +// } +// +// { +// // Same as the last block, but do it dynamically via the Message +// // reflection interface. 
+// Message* foo = new Foo; +// const Descriptor* descriptor = foo->GetDescriptor(); +// +// // Get the descriptors for the fields we're interested in and verify +// // their types. +// const FieldDescriptor* text_field = descriptor->FindFieldByName("text"); +// assert(text_field != nullptr); +// assert(text_field->type() == FieldDescriptor::TYPE_STRING); +// assert(text_field->label() == FieldDescriptor::LABEL_OPTIONAL); +// const FieldDescriptor* numbers_field = descriptor-> +// FindFieldByName("numbers"); +// assert(numbers_field != nullptr); +// assert(numbers_field->type() == FieldDescriptor::TYPE_INT32); +// assert(numbers_field->label() == FieldDescriptor::LABEL_REPEATED); +// +// // Parse the message. +// foo->ParseFromString(data); +// +// // Use the reflection interface to examine the contents. +// const Reflection* reflection = foo->GetReflection(); +// assert(reflection->GetString(*foo, text_field) == "Hello World!"); +// assert(reflection->FieldSize(*foo, numbers_field) == 3); +// assert(reflection->GetRepeatedInt32(*foo, numbers_field, 0) == 1); +// assert(reflection->GetRepeatedInt32(*foo, numbers_field, 1) == 5); +// assert(reflection->GetRepeatedInt32(*foo, numbers_field, 2) == 42); +// +// delete foo; +// } + +#ifndef GOOGLE_PROTOBUF_MESSAGE_H__ +#define GOOGLE_PROTOBUF_MESSAGE_H__ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +#define GOOGLE_PROTOBUF_HAS_ONEOF +#define GOOGLE_PROTOBUF_HAS_ARENAS + +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { + +// Defined in this file. +class Message; +class Reflection; +class MessageFactory; + +// Defined in other files. 
+class AssignDescriptorsHelper; +class DynamicMessageFactory; +class MapKey; +class MapValueRef; +class MapIterator; +class MapReflectionTester; + +namespace internal { +struct DescriptorTable; +class MapFieldBase; +} +class UnknownFieldSet; // unknown_field_set.h +namespace io { +class ZeroCopyInputStream; // zero_copy_stream.h +class ZeroCopyOutputStream; // zero_copy_stream.h +class CodedInputStream; // coded_stream.h +class CodedOutputStream; // coded_stream.h +} // namespace io +namespace python { +class MapReflectionFriend; // scalar_map_container.h +} +namespace expr { +class CelMapReflectionFriend; // field_backed_map_impl.cc +} + +namespace internal { +class MapFieldPrinterHelper; // text_format.cc +} + + +namespace internal { +class ReflectionAccessor; // message.cc +class ReflectionOps; // reflection_ops.h +class MapKeySorter; // wire_format.cc +class WireFormat; // wire_format.h +class MapFieldReflectionTest; // map_test.cc +} // namespace internal + +template +class RepeatedField; // repeated_field.h + +template +class RepeatedPtrField; // repeated_field.h + +// A container to hold message metadata. +struct Metadata { + const Descriptor* descriptor; + const Reflection* reflection; +}; + +namespace internal { +template +inline To* GetPointerAtOffset(Message* message, uint32 offset) { + return reinterpret_cast(reinterpret_cast(message) + offset); +} + +template +const To* GetConstPointerAtOffset(const Message* message, uint32 offset) { + return reinterpret_cast(reinterpret_cast(message) + + offset); +} + +template +const To& GetConstRefAtOffset(const Message& message, uint32 offset) { + return *GetConstPointerAtOffset(&message, offset); +} + +bool CreateUnknownEnumValues(const FieldDescriptor* field); +} // namespace internal + +// Abstract interface for protocol messages. +// +// See also MessageLite, which contains most every-day operations. Message +// adds descriptors and reflection on top of that. 
+// +// The methods of this class that are virtual but not pure-virtual have +// default implementations based on reflection. Message classes which are +// optimized for speed will want to override these with faster implementations, +// but classes optimized for code size may be happy with keeping them. See +// the optimize_for option in descriptor.proto. +// +// Users must not derive from this class. Only the protocol compiler and +// the internal library are allowed to create subclasses. +class PROTOBUF_EXPORT Message : public MessageLite { + public: + inline Message() {} + + // Basic Operations ------------------------------------------------ + + // Construct a new instance of the same type. Ownership is passed to the + // caller. (This is also defined in MessageLite, but is defined again here + // for return-type covariance.) + Message* New() const override = 0; + + // Construct a new instance on the arena. Ownership is passed to the caller + // if arena is a nullptr. Default implementation allows for API compatibility + // during the Arena transition. + Message* New(Arena* arena) const override { + Message* message = New(); + if (arena != nullptr) { + arena->Own(message); + } + return message; + } + + // Make this message into a copy of the given message. The given message + // must have the same descriptor, but need not necessarily be the same class. + // By default this is just implemented as "Clear(); MergeFrom(from);". + virtual void CopyFrom(const Message& from); + + // Merge the fields from the given message into this message. Singular + // fields will be overwritten, if specified in from, except for embedded + // messages which will be merged. Repeated fields will be concatenated. + // The given message must be of the same type as this message (i.e. the + // exact same class). + virtual void MergeFrom(const Message& from); + + // Verifies that IsInitialized() returns true. GOOGLE_CHECK-fails otherwise, with + // a nice error message. 
+ void CheckInitialized() const; + + // Slowly build a list of all required fields that are not set. + // This is much, much slower than IsInitialized() as it is implemented + // purely via reflection. Generally, you should not call this unless you + // have already determined that an error exists by calling IsInitialized(). + void FindInitializationErrors(std::vector* errors) const; + + // Like FindInitializationErrors, but joins all the strings, delimited by + // commas, and returns them. + std::string InitializationErrorString() const override; + + // Clears all unknown fields from this message and all embedded messages. + // Normally, if unknown tag numbers are encountered when parsing a message, + // the tag and value are stored in the message's UnknownFieldSet and + // then written back out when the message is serialized. This allows servers + // which simply route messages to other servers to pass through messages + // that have new field definitions which they don't yet know about. However, + // this behavior can have security implications. To avoid it, call this + // method after parsing. + // + // See Reflection::GetUnknownFields() for more on unknown fields. + virtual void DiscardUnknownFields(); + + // Computes (an estimate of) the total number of bytes currently used for + // storing the message in memory. The default implementation calls the + // Reflection object's SpaceUsed() method. + // + // SpaceUsed() is noticeably slower than ByteSize(), as it is implemented + // using reflection (rather than the generated code implementation for + // ByteSize()). Like ByteSize(), its CPU time is linear in the number of + // fields defined for the proto. 
+ virtual size_t SpaceUsedLong() const; + + PROTOBUF_DEPRECATED_MSG("Please use SpaceUsedLong() instead") + int SpaceUsed() const { return internal::ToIntSize(SpaceUsedLong()); } + + // Debugging & Testing---------------------------------------------- + + // Generates a human readable form of this message, useful for debugging + // and other purposes. + std::string DebugString() const; + // Like DebugString(), but with less whitespace. + std::string ShortDebugString() const; + // Like DebugString(), but do not escape UTF-8 byte sequences. + std::string Utf8DebugString() const; + // Convenience function useful in GDB. Prints DebugString() to stdout. + void PrintDebugString() const; + + // Reflection-based methods ---------------------------------------- + // These methods are pure-virtual in MessageLite, but Message provides + // reflection-based default implementations. + + std::string GetTypeName() const override; + void Clear() override; + + // Returns whether all required fields have been set. Note that required + // fields no longer exist starting in proto3. + bool IsInitialized() const override; + + void CheckTypeAndMergeFrom(const MessageLite& other) override; + // Reflective parser + const char* _InternalParse(const char* ptr, + internal::ParseContext* ctx) override; + size_t ByteSizeLong() const override; + uint8* _InternalSerialize(uint8* target, + io::EpsCopyOutputStream* stream) const override; + + private: + // This is called only by the default implementation of ByteSize(), to + // update the cached size. If you override ByteSize(), you do not need + // to override this. If you do not override ByteSize(), you MUST override + // this; the default implementation will crash. + // + // The method is private because subclasses should never call it; only + // override it. Yes, C++ lets you do that. Crazy, huh? 
+ virtual void SetCachedSize(int size) const; + + public: + // Introspection --------------------------------------------------- + + + // Get a non-owning pointer to a Descriptor for this message's type. This + // describes what fields the message contains, the types of those fields, etc. + // This object remains property of the Message. + const Descriptor* GetDescriptor() const { return GetMetadata().descriptor; } + + // Get a non-owning pointer to the Reflection interface for this Message, + // which can be used to read and modify the fields of the Message dynamically + // (in other words, without knowing the message type at compile time). This + // object remains property of the Message. + const Reflection* GetReflection() const { return GetMetadata().reflection; } + + protected: + // Get a struct containing the metadata for the Message, which is used in turn + // to implement GetDescriptor() and GetReflection() above. + virtual Metadata GetMetadata() const = 0; + + inline explicit Message(Arena* arena) : MessageLite(arena) {} + + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Message); +}; + +namespace internal { +// Forward-declare interfaces used to implement RepeatedFieldRef. +// These are protobuf internals that users shouldn't care about. +class RepeatedFieldAccessor; +} // namespace internal + +// Forward-declare RepeatedFieldRef templates. The second type parameter is +// used for SFINAE tricks. Users should ignore it. +template +class RepeatedFieldRef; + +template +class MutableRepeatedFieldRef; + +// This interface contains methods that can be used to dynamically access +// and modify the fields of a protocol message. Their semantics are +// similar to the accessors the protocol compiler generates. +// +// To get the Reflection for a given Message, call Message::GetReflection(). 
+// +// This interface is separate from Message only for efficiency reasons; +// the vast majority of implementations of Message will share the same +// implementation of Reflection (GeneratedMessageReflection, +// defined in generated_message.h), and all Messages of a particular class +// should share the same Reflection object (though you should not rely on +// the latter fact). +// +// There are several ways that these methods can be used incorrectly. For +// example, any of the following conditions will lead to undefined +// results (probably assertion failures): +// - The FieldDescriptor is not a field of this message type. +// - The method called is not appropriate for the field's type. For +// each field type in FieldDescriptor::TYPE_*, there is only one +// Get*() method, one Set*() method, and one Add*() method that is +// valid for that type. It should be obvious which (except maybe +// for TYPE_BYTES, which are represented using strings in C++). +// - A Get*() or Set*() method for singular fields is called on a repeated +// field. +// - GetRepeated*(), SetRepeated*(), or Add*() is called on a non-repeated +// field. +// - The Message object passed to any method is not of the right type for +// this Reflection object (i.e. message.GetReflection() != reflection). +// +// You might wonder why there is not any abstract representation for a field +// of arbitrary type. E.g., why isn't there just a "GetField()" method that +// returns "const Field&", where "Field" is some class with accessors like +// "GetInt32Value()". The problem is that someone would have to deal with +// allocating these Field objects. For generated message classes, having to +// allocate space for an additional object to wrap every field would at least +// double the message's memory footprint, probably worse. Allocating the +// objects on-demand, on the other hand, would be expensive and prone to +// memory leaks. So, instead we ended up with this flat interface. 
+class PROTOBUF_EXPORT Reflection final { + public: + // Get the UnknownFieldSet for the message. This contains fields which + // were seen when the Message was parsed but were not recognized according + // to the Message's definition. + const UnknownFieldSet& GetUnknownFields(const Message& message) const; + // Get a mutable pointer to the UnknownFieldSet for the message. This + // contains fields which were seen when the Message was parsed but were not + // recognized according to the Message's definition. + UnknownFieldSet* MutableUnknownFields(Message* message) const; + + // Estimate the amount of memory used by the message object. + size_t SpaceUsedLong(const Message& message) const; + + PROTOBUF_DEPRECATED_MSG("Please use SpaceUsedLong() instead") + int SpaceUsed(const Message& message) const { + return internal::ToIntSize(SpaceUsedLong(message)); + } + + // Check if the given non-repeated field is set. + bool HasField(const Message& message, const FieldDescriptor* field) const; + + // Get the number of elements of a repeated field. + int FieldSize(const Message& message, const FieldDescriptor* field) const; + + // Clear the value of a field, so that HasField() returns false or + // FieldSize() returns zero. + void ClearField(Message* message, const FieldDescriptor* field) const; + + // Check if the oneof is set. Returns true if any field in oneof + // is set, false otherwise. + bool HasOneof(const Message& message, + const OneofDescriptor* oneof_descriptor) const; + + void ClearOneof(Message* message, + const OneofDescriptor* oneof_descriptor) const; + + // Returns the field descriptor if the oneof is set. nullptr otherwise. + const FieldDescriptor* GetOneofFieldDescriptor( + const Message& message, const OneofDescriptor* oneof_descriptor) const; + + // Removes the last element of a repeated field. 
+ // We don't provide a way to remove any element other than the last + // because it invites inefficient use, such as O(n^2) filtering loops + // that should have been O(n). If you want to remove an element other + // than the last, the best way to do it is to re-arrange the elements + // (using Swap()) so that the one you want removed is at the end, then + // call RemoveLast(). + void RemoveLast(Message* message, const FieldDescriptor* field) const; + // Removes the last element of a repeated message field, and returns the + // pointer to the caller. Caller takes ownership of the returned pointer. + Message* ReleaseLast(Message* message, const FieldDescriptor* field) const; + + // Swap the complete contents of two messages. + void Swap(Message* message1, Message* message2) const; + + // Swap fields listed in fields vector of two messages. + void SwapFields(Message* message1, Message* message2, + const std::vector& fields) const; + + // Swap two elements of a repeated field. + void SwapElements(Message* message, const FieldDescriptor* field, int index1, + int index2) const; + + // List all fields of the message which are currently set, except for unknown + // fields, but including extension known to the parser (i.e. compiled in). + // Singular fields will only be listed if HasField(field) would return true + // and repeated fields will only be listed if FieldSize(field) would return + // non-zero. Fields (both normal fields and extension fields) will be listed + // ordered by field number. + // Use Reflection::GetUnknownFields() or message.unknown_fields() to also get + // access to fields/extensions unknown to the parser. + void ListFields(const Message& message, + std::vector* output) const; + + // Singular field getters ------------------------------------------ + // These get the value of a non-repeated field. They return the default + // value for fields that aren't set. 
+ + int32 GetInt32(const Message& message, const FieldDescriptor* field) const; + int64 GetInt64(const Message& message, const FieldDescriptor* field) const; + uint32 GetUInt32(const Message& message, const FieldDescriptor* field) const; + uint64 GetUInt64(const Message& message, const FieldDescriptor* field) const; + float GetFloat(const Message& message, const FieldDescriptor* field) const; + double GetDouble(const Message& message, const FieldDescriptor* field) const; + bool GetBool(const Message& message, const FieldDescriptor* field) const; + std::string GetString(const Message& message, + const FieldDescriptor* field) const; + const EnumValueDescriptor* GetEnum(const Message& message, + const FieldDescriptor* field) const; + + // GetEnumValue() returns an enum field's value as an integer rather than + // an EnumValueDescriptor*. If the integer value does not correspond to a + // known value descriptor, a new value descriptor is created. (Such a value + // will only be present when the new unknown-enum-value semantics are enabled + // for a message.) + int GetEnumValue(const Message& message, const FieldDescriptor* field) const; + + // See MutableMessage() for the meaning of the "factory" parameter. + const Message& GetMessage(const Message& message, + const FieldDescriptor* field, + MessageFactory* factory = nullptr) const; + + // Get a string value without copying, if possible. + // + // GetString() necessarily returns a copy of the string. This can be + // inefficient when the std::string is already stored in a std::string object + // in the underlying message. GetStringReference() will return a reference to + // the underlying std::string in this case. Otherwise, it will copy the + // string into *scratch and return that. 
+ // + // Note: It is perfectly reasonable and useful to write code like: + // str = reflection->GetStringReference(message, field, &str); + // This line would ensure that only one copy of the string is made + // regardless of the field's underlying representation. When initializing + // a newly-constructed string, though, it's just as fast and more + // readable to use code like: + // std::string str = reflection->GetString(message, field); + const std::string& GetStringReference(const Message& message, + const FieldDescriptor* field, + std::string* scratch) const; + + + // Singular field mutators ----------------------------------------- + // These mutate the value of a non-repeated field. + + void SetInt32(Message* message, const FieldDescriptor* field, + int32 value) const; + void SetInt64(Message* message, const FieldDescriptor* field, + int64 value) const; + void SetUInt32(Message* message, const FieldDescriptor* field, + uint32 value) const; + void SetUInt64(Message* message, const FieldDescriptor* field, + uint64 value) const; + void SetFloat(Message* message, const FieldDescriptor* field, + float value) const; + void SetDouble(Message* message, const FieldDescriptor* field, + double value) const; + void SetBool(Message* message, const FieldDescriptor* field, + bool value) const; + void SetString(Message* message, const FieldDescriptor* field, + std::string value) const; + void SetEnum(Message* message, const FieldDescriptor* field, + const EnumValueDescriptor* value) const; + // Set an enum field's value with an integer rather than EnumValueDescriptor. + // For proto3 this is just setting the enum field to the value specified, for + // proto2 it's more complicated. If value is a known enum value the field is + // set as usual. If the value is unknown then it is added to the unknown field + // set. Note this matches the behavior of parsing unknown enum values. 
+ // If multiple calls with unknown values happen than they are all added to the + // unknown field set in order of the calls. + void SetEnumValue(Message* message, const FieldDescriptor* field, + int value) const; + + // Get a mutable pointer to a field with a message type. If a MessageFactory + // is provided, it will be used to construct instances of the sub-message; + // otherwise, the default factory is used. If the field is an extension that + // does not live in the same pool as the containing message's descriptor (e.g. + // it lives in an overlay pool), then a MessageFactory must be provided. + // If you have no idea what that meant, then you probably don't need to worry + // about it (don't provide a MessageFactory). WARNING: If the + // FieldDescriptor is for a compiled-in extension, then + // factory->GetPrototype(field->message_type()) MUST return an instance of + // the compiled-in class for this type, NOT DynamicMessage. + Message* MutableMessage(Message* message, const FieldDescriptor* field, + MessageFactory* factory = nullptr) const; + // Replaces the message specified by 'field' with the already-allocated object + // sub_message, passing ownership to the message. If the field contained a + // message, that message is deleted. If sub_message is nullptr, the field is + // cleared. + void SetAllocatedMessage(Message* message, Message* sub_message, + const FieldDescriptor* field) const; + // Releases the message specified by 'field' and returns the pointer, + // ReleaseMessage() will return the message the message object if it exists. + // Otherwise, it may or may not return nullptr. In any case, if the return + // value is non-null, the caller takes ownership of the pointer. + // If the field existed (HasField() is true), then the returned pointer will + // be the same as the pointer returned by MutableMessage(). + // This function has the same effect as ClearField(). 
+ Message* ReleaseMessage(Message* message, const FieldDescriptor* field, + MessageFactory* factory = nullptr) const; + + + // Repeated field getters ------------------------------------------ + // These get the value of one element of a repeated field. + + int32 GetRepeatedInt32(const Message& message, const FieldDescriptor* field, + int index) const; + int64 GetRepeatedInt64(const Message& message, const FieldDescriptor* field, + int index) const; + uint32 GetRepeatedUInt32(const Message& message, const FieldDescriptor* field, + int index) const; + uint64 GetRepeatedUInt64(const Message& message, const FieldDescriptor* field, + int index) const; + float GetRepeatedFloat(const Message& message, const FieldDescriptor* field, + int index) const; + double GetRepeatedDouble(const Message& message, const FieldDescriptor* field, + int index) const; + bool GetRepeatedBool(const Message& message, const FieldDescriptor* field, + int index) const; + std::string GetRepeatedString(const Message& message, + const FieldDescriptor* field, int index) const; + const EnumValueDescriptor* GetRepeatedEnum(const Message& message, + const FieldDescriptor* field, + int index) const; + // GetRepeatedEnumValue() returns an enum field's value as an integer rather + // than an EnumValueDescriptor*. If the integer value does not correspond to a + // known value descriptor, a new value descriptor is created. (Such a value + // will only be present when the new unknown-enum-value semantics are enabled + // for a message.) + int GetRepeatedEnumValue(const Message& message, const FieldDescriptor* field, + int index) const; + const Message& GetRepeatedMessage(const Message& message, + const FieldDescriptor* field, + int index) const; + + // See GetStringReference(), above. 
+ const std::string& GetRepeatedStringReference(const Message& message, + const FieldDescriptor* field, + int index, + std::string* scratch) const; + + + // Repeated field mutators ----------------------------------------- + // These mutate the value of one element of a repeated field. + + void SetRepeatedInt32(Message* message, const FieldDescriptor* field, + int index, int32 value) const; + void SetRepeatedInt64(Message* message, const FieldDescriptor* field, + int index, int64 value) const; + void SetRepeatedUInt32(Message* message, const FieldDescriptor* field, + int index, uint32 value) const; + void SetRepeatedUInt64(Message* message, const FieldDescriptor* field, + int index, uint64 value) const; + void SetRepeatedFloat(Message* message, const FieldDescriptor* field, + int index, float value) const; + void SetRepeatedDouble(Message* message, const FieldDescriptor* field, + int index, double value) const; + void SetRepeatedBool(Message* message, const FieldDescriptor* field, + int index, bool value) const; + void SetRepeatedString(Message* message, const FieldDescriptor* field, + int index, std::string value) const; + void SetRepeatedEnum(Message* message, const FieldDescriptor* field, + int index, const EnumValueDescriptor* value) const; + // Set an enum field's value with an integer rather than EnumValueDescriptor. + // For proto3 this is just setting the enum field to the value specified, for + // proto2 it's more complicated. If value is a known enum value the field is + // set as usual. If the value is unknown then it is added to the unknown field + // set. Note this matches the behavior of parsing unknown enum values. + // If multiple calls with unknown values happen than they are all added to the + // unknown field set in order of the calls. + void SetRepeatedEnumValue(Message* message, const FieldDescriptor* field, + int index, int value) const; + // Get a mutable pointer to an element of a repeated field with a message + // type. 
+ Message* MutableRepeatedMessage(Message* message, + const FieldDescriptor* field, + int index) const; + + + // Repeated field adders ------------------------------------------- + // These add an element to a repeated field. + + void AddInt32(Message* message, const FieldDescriptor* field, + int32 value) const; + void AddInt64(Message* message, const FieldDescriptor* field, + int64 value) const; + void AddUInt32(Message* message, const FieldDescriptor* field, + uint32 value) const; + void AddUInt64(Message* message, const FieldDescriptor* field, + uint64 value) const; + void AddFloat(Message* message, const FieldDescriptor* field, + float value) const; + void AddDouble(Message* message, const FieldDescriptor* field, + double value) const; + void AddBool(Message* message, const FieldDescriptor* field, + bool value) const; + void AddString(Message* message, const FieldDescriptor* field, + std::string value) const; + void AddEnum(Message* message, const FieldDescriptor* field, + const EnumValueDescriptor* value) const; + // Add an integer value to a repeated enum field rather than + // EnumValueDescriptor. For proto3 this is just setting the enum field to the + // value specified, for proto2 it's more complicated. If value is a known enum + // value the field is set as usual. If the value is unknown then it is added + // to the unknown field set. Note this matches the behavior of parsing unknown + // enum values. If multiple calls with unknown values happen than they are all + // added to the unknown field set in order of the calls. + void AddEnumValue(Message* message, const FieldDescriptor* field, + int value) const; + // See MutableMessage() for comments on the "factory" parameter. + Message* AddMessage(Message* message, const FieldDescriptor* field, + MessageFactory* factory = nullptr) const; + + // Appends an already-allocated object 'new_entry' to the repeated field + // specified by 'field' passing ownership to the message. 
+ void AddAllocatedMessage(Message* message, const FieldDescriptor* field, + Message* new_entry) const; + + + // Get a RepeatedFieldRef object that can be used to read the underlying + // repeated field. The type parameter T must be set according to the + // field's cpp type. The following table shows the mapping from cpp type + // to acceptable T. + // + // field->cpp_type() T + // CPPTYPE_INT32 int32 + // CPPTYPE_UINT32 uint32 + // CPPTYPE_INT64 int64 + // CPPTYPE_UINT64 uint64 + // CPPTYPE_DOUBLE double + // CPPTYPE_FLOAT float + // CPPTYPE_BOOL bool + // CPPTYPE_ENUM generated enum type or int32 + // CPPTYPE_STRING std::string + // CPPTYPE_MESSAGE generated message type or google::protobuf::Message + // + // A RepeatedFieldRef object can be copied and the resulted object will point + // to the same repeated field in the same message. The object can be used as + // long as the message is not destroyed. + // + // Note that to use this method users need to include the header file + // "reflection.h" (which defines the RepeatedFieldRef class templates). + template + RepeatedFieldRef GetRepeatedFieldRef(const Message& message, + const FieldDescriptor* field) const; + + // Like GetRepeatedFieldRef() but return an object that can also be used + // manipulate the underlying repeated field. + template + MutableRepeatedFieldRef GetMutableRepeatedFieldRef( + Message* message, const FieldDescriptor* field) const; + + // DEPRECATED. Please use Get(Mutable)RepeatedFieldRef() for repeated field + // access. The following repeated field accesors will be removed in the + // future. + // + // Repeated field accessors ------------------------------------------------- + // The methods above, e.g. GetRepeatedInt32(msg, fd, index), provide singular + // access to the data in a RepeatedField. The methods below provide aggregate + // access by exposing the RepeatedField object itself with the Message. 
+ // Applying these templates to inappropriate types will lead to an undefined + // reference at link time (e.g. GetRepeatedField<***double>), or possibly a + // template matching error at compile time (e.g. GetRepeatedPtrField). + // + // Usage example: my_doubs = refl->GetRepeatedField(msg, fd); + + // DEPRECATED. Please use GetRepeatedFieldRef(). + // + // for T = Cord and all protobuf scalar types except enums. + template + PROTOBUF_DEPRECATED_MSG("Please use GetRepeatedFieldRef() instead") + const RepeatedField& GetRepeatedField(const Message& msg, + const FieldDescriptor* d) const { + return GetRepeatedFieldInternal(msg, d); + } + + // DEPRECATED. Please use GetMutableRepeatedFieldRef(). + // + // for T = Cord and all protobuf scalar types except enums. + template + PROTOBUF_DEPRECATED_MSG("Please use GetMutableRepeatedFieldRef() instead") + RepeatedField* MutableRepeatedField(Message* msg, + const FieldDescriptor* d) const { + return MutableRepeatedFieldInternal(msg, d); + } + + // DEPRECATED. Please use GetRepeatedFieldRef(). + // + // for T = std::string, google::protobuf::internal::StringPieceField + // google::protobuf::Message & descendants. + template + PROTOBUF_DEPRECATED_MSG("Please use GetRepeatedFieldRef() instead") + const RepeatedPtrField& GetRepeatedPtrField( + const Message& msg, const FieldDescriptor* d) const { + return GetRepeatedPtrFieldInternal(msg, d); + } + + // DEPRECATED. Please use GetMutableRepeatedFieldRef(). + // + // for T = std::string, google::protobuf::internal::StringPieceField + // google::protobuf::Message & descendants. + template + PROTOBUF_DEPRECATED_MSG("Please use GetMutableRepeatedFieldRef() instead") + RepeatedPtrField* MutableRepeatedPtrField(Message* msg, + const FieldDescriptor* d) const { + return MutableRepeatedPtrFieldInternal(msg, d); + } + + // Extensions ---------------------------------------------------------------- + + // Try to find an extension of this message type by fully-qualified field + // name. 
Returns nullptr if no extension is known for this name or number. + const FieldDescriptor* FindKnownExtensionByName( + const std::string& name) const; + + // Try to find an extension of this message type by field number. + // Returns nullptr if no extension is known for this name or number. + const FieldDescriptor* FindKnownExtensionByNumber(int number) const; + + // Feature Flags ------------------------------------------------------------- + + // Does this message support storing arbitrary integer values in enum fields? + // If |true|, GetEnumValue/SetEnumValue and associated repeated-field versions + // take arbitrary integer values, and the legacy GetEnum() getter will + // dynamically create an EnumValueDescriptor for any integer value without + // one. If |false|, setting an unknown enum value via the integer-based + // setters results in undefined behavior (in practice, GOOGLE_DCHECK-fails). + // + // Generic code that uses reflection to handle messages with enum fields + // should check this flag before using the integer-based setter, and either + // downgrade to a compatible value or use the UnknownFieldSet if not. For + // example: + // + // int new_value = GetValueFromApplicationLogic(); + // if (reflection->SupportsUnknownEnumValues()) { + // reflection->SetEnumValue(message, field, new_value); + // } else { + // if (field_descriptor->enum_type()-> + // FindValueByNumber(new_value) != nullptr) { + // reflection->SetEnumValue(message, field, new_value); + // } else if (emit_unknown_enum_values) { + // reflection->MutableUnknownFields(message)->AddVarint( + // field->number(), new_value); + // } else { + // // convert value to a compatible/default value. + // new_value = CompatibleDowngrade(new_value); + // reflection->SetEnumValue(message, field, new_value); + // } + // } + bool SupportsUnknownEnumValues() const; + + // Returns the MessageFactory associated with this message. 
This can be + // useful for determining if a message is a generated message or not, for + // example: + // if (message->GetReflection()->GetMessageFactory() == + // google::protobuf::MessageFactory::generated_factory()) { + // // This is a generated message. + // } + // It can also be used to create more messages of this type, though + // Message::New() is an easier way to accomplish this. + MessageFactory* GetMessageFactory() const; + + private: + template + const RepeatedField& GetRepeatedFieldInternal( + const Message& message, const FieldDescriptor* field) const; + template + RepeatedField* MutableRepeatedFieldInternal( + Message* message, const FieldDescriptor* field) const; + template + const RepeatedPtrField& GetRepeatedPtrFieldInternal( + const Message& message, const FieldDescriptor* field) const; + template + RepeatedPtrField* MutableRepeatedPtrFieldInternal( + Message* message, const FieldDescriptor* field) const; + // Obtain a pointer to a Repeated Field Structure and do some type checking: + // on field->cpp_type(), + // on field->field_option().ctype() (if ctype >= 0) + // of field->message_type() (if message_type != nullptr). + // We use 2 routine rather than 4 (const vs mutable) x (scalar vs pointer). + void* MutableRawRepeatedField(Message* message, const FieldDescriptor* field, + FieldDescriptor::CppType, int ctype, + const Descriptor* message_type) const; + + const void* GetRawRepeatedField(const Message& message, + const FieldDescriptor* field, + FieldDescriptor::CppType cpptype, int ctype, + const Descriptor* message_type) const; + + // The following methods are used to implement (Mutable)RepeatedFieldRef. + // A Ref object will store a raw pointer to the repeated field data (obtained + // from RepeatedFieldData()) and a pointer to a Accessor (obtained from + // RepeatedFieldAccessor) which will be used to access the raw data. 
+ + // Returns a raw pointer to the repeated field + // + // "cpp_type" and "message_type" are deduced from the type parameter T passed + // to Get(Mutable)RepeatedFieldRef. If T is a generated message type, + // "message_type" should be set to its descriptor. Otherwise "message_type" + // should be set to nullptr. Implementations of this method should check + // whether "cpp_type"/"message_type" is consistent with the actual type of the + // field. We use 1 routine rather than 2 (const vs mutable) because it is + // protected and it doesn't change the message. + void* RepeatedFieldData(Message* message, const FieldDescriptor* field, + FieldDescriptor::CppType cpp_type, + const Descriptor* message_type) const; + + // The returned pointer should point to a singleton instance which implements + // the RepeatedFieldAccessor interface. + const internal::RepeatedFieldAccessor* RepeatedFieldAccessor( + const FieldDescriptor* field) const; + + // Lists all fields of the message which are currently set, except for unknown + // fields and stripped fields. See ListFields for details. + void ListFieldsOmitStripped( + const Message& message, + std::vector* output) const; + + bool IsMessageStripped(const Descriptor* descriptor) const { + return schema_.IsMessageStripped(descriptor); + } + + friend class TextFormat; + + void ListFieldsMayFailOnStripped( + const Message& message, bool should_fail, + std::vector* output) const; + + const Descriptor* const descriptor_; + const internal::ReflectionSchema schema_; + const DescriptorPool* const descriptor_pool_; + MessageFactory* const message_factory_; + + // Last non weak field index. This is an optimization when most weak fields + // are at the end of the containing message. If a message proto doesn't + // contain weak fields, then this field equals descriptor_->field_count(). 
+ int last_non_weak_field_index_; + + template + friend class RepeatedFieldRef; + template + friend class MutableRepeatedFieldRef; + friend class ::PROTOBUF_NAMESPACE_ID::MessageLayoutInspector; + friend class ::PROTOBUF_NAMESPACE_ID::AssignDescriptorsHelper; + friend class DynamicMessageFactory; + friend class python::MapReflectionFriend; +#define GOOGLE_PROTOBUF_HAS_CEL_MAP_REFLECTION_FRIEND + friend class expr::CelMapReflectionFriend; + friend class internal::MapFieldReflectionTest; + friend class internal::MapKeySorter; + friend class internal::WireFormat; + friend class internal::ReflectionOps; + // Needed for implementing text format for map. + friend class internal::MapFieldPrinterHelper; + + Reflection(const Descriptor* descriptor, + const internal::ReflectionSchema& schema, + const DescriptorPool* pool, MessageFactory* factory); + + // Special version for specialized implementations of string. We can't + // call MutableRawRepeatedField directly here because we don't have access to + // FieldOptions::* which are defined in descriptor.pb.h. Including that + // file here is not possible because it would cause a circular include cycle. + // We use 1 routine rather than 2 (const vs mutable) because it is private + // and mutable a repeated string field doesn't change the message. + void* MutableRawRepeatedString(Message* message, const FieldDescriptor* field, + bool is_string) const; + + friend class MapReflectionTester; + // Returns true if key is in map. Returns false if key is not in map field. + bool ContainsMapKey(const Message& message, const FieldDescriptor* field, + const MapKey& key) const; + + // If key is in map field: Saves the value pointer to val and returns + // false. If key in not in map field: Insert the key into map, saves + // value pointer to val and returns true. 
+ bool InsertOrLookupMapValue(Message* message, const FieldDescriptor* field, + const MapKey& key, MapValueRef* val) const; + + // Delete and returns true if key is in the map field. Returns false + // otherwise. + bool DeleteMapValue(Message* message, const FieldDescriptor* field, + const MapKey& key) const; + + // Returns a MapIterator referring to the first element in the map field. + // If the map field is empty, this function returns the same as + // reflection::MapEnd. Mutation to the field may invalidate the iterator. + MapIterator MapBegin(Message* message, const FieldDescriptor* field) const; + + // Returns a MapIterator referring to the theoretical element that would + // follow the last element in the map field. It does not point to any + // real element. Mutation to the field may invalidate the iterator. + MapIterator MapEnd(Message* message, const FieldDescriptor* field) const; + + // Get the number of pair of a map field. The result may be + // different from FieldSize which can have duplicate keys. + int MapSize(const Message& message, const FieldDescriptor* field) const; + + // Help method for MapIterator. 
+ friend class MapIterator; + friend class WireFormatForMapFieldTest; + internal::MapFieldBase* MutableMapData(Message* message, + const FieldDescriptor* field) const; + + const internal::MapFieldBase* GetMapData(const Message& message, + const FieldDescriptor* field) const; + + template + const T& GetRawNonOneof(const Message& message, + const FieldDescriptor* field) const; + template + T* MutableRawNonOneof(Message* message, const FieldDescriptor* field) const; + + template + const Type& GetRaw(const Message& message, + const FieldDescriptor* field) const; + template + inline Type* MutableRaw(Message* message, const FieldDescriptor* field) const; + template + const Type& DefaultRaw(const FieldDescriptor* field) const; + + inline const uint32* GetHasBits(const Message& message) const; + inline uint32* MutableHasBits(Message* message) const; + inline uint32 GetOneofCase(const Message& message, + const OneofDescriptor* oneof_descriptor) const; + inline uint32* MutableOneofCase( + Message* message, const OneofDescriptor* oneof_descriptor) const; + inline bool HasExtensionSet(const Message& message) const { + return schema_.HasExtensionSet(); + } + const internal::ExtensionSet& GetExtensionSet(const Message& message) const; + internal::ExtensionSet* MutableExtensionSet(Message* message) const; + inline Arena* GetArena(Message* message) const; + + inline const internal::InternalMetadata& GetInternalMetadata( + const Message& message) const; + + internal::InternalMetadata* MutableInternalMetadata(Message* message) const; + + inline bool IsInlined(const FieldDescriptor* field) const; + + inline bool HasBit(const Message& message, + const FieldDescriptor* field) const; + inline void SetBit(Message* message, const FieldDescriptor* field) const; + inline void ClearBit(Message* message, const FieldDescriptor* field) const; + inline void SwapBit(Message* message1, Message* message2, + const FieldDescriptor* field) const; + + // This function only swaps the field. 
Should swap corresponding has_bit + // before or after using this function. + void SwapField(Message* message1, Message* message2, + const FieldDescriptor* field) const; + + void SwapOneofField(Message* message1, Message* message2, + const OneofDescriptor* oneof_descriptor) const; + + inline bool HasOneofField(const Message& message, + const FieldDescriptor* field) const; + inline void SetOneofCase(Message* message, + const FieldDescriptor* field) const; + inline void ClearOneofField(Message* message, + const FieldDescriptor* field) const; + + template + inline const Type& GetField(const Message& message, + const FieldDescriptor* field) const; + template + inline void SetField(Message* message, const FieldDescriptor* field, + const Type& value) const; + template + inline Type* MutableField(Message* message, + const FieldDescriptor* field) const; + template + inline const Type& GetRepeatedField(const Message& message, + const FieldDescriptor* field, + int index) const; + template + inline const Type& GetRepeatedPtrField(const Message& message, + const FieldDescriptor* field, + int index) const; + template + inline void SetRepeatedField(Message* message, const FieldDescriptor* field, + int index, Type value) const; + template + inline Type* MutableRepeatedField(Message* message, + const FieldDescriptor* field, + int index) const; + template + inline void AddField(Message* message, const FieldDescriptor* field, + const Type& value) const; + template + inline Type* AddField(Message* message, const FieldDescriptor* field) const; + + int GetExtensionNumberOrDie(const Descriptor* type) const; + + // Internal versions of EnumValue API perform no checking. Called after checks + // by public methods. 
+ void SetEnumValueInternal(Message* message, const FieldDescriptor* field, + int value) const; + void SetRepeatedEnumValueInternal(Message* message, + const FieldDescriptor* field, int index, + int value) const; + void AddEnumValueInternal(Message* message, const FieldDescriptor* field, + int value) const; + + Message* UnsafeArenaReleaseMessage(Message* message, + const FieldDescriptor* field, + MessageFactory* factory = nullptr) const; + + void UnsafeArenaSetAllocatedMessage(Message* message, Message* sub_message, + const FieldDescriptor* field) const; + + friend inline // inline so nobody can call this function. + void + RegisterAllTypesInternal(const Metadata* file_level_metadata, int size); + friend inline const char* ParseLenDelim(int field_number, + const FieldDescriptor* field, + Message* msg, + const Reflection* reflection, + const char* ptr, + internal::ParseContext* ctx); + friend inline const char* ParsePackedField(const FieldDescriptor* field, + Message* msg, + const Reflection* reflection, + const char* ptr, + internal::ParseContext* ctx); + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Reflection); +}; + +// Abstract interface for a factory for message objects. +class PROTOBUF_EXPORT MessageFactory { + public: + inline MessageFactory() {} + virtual ~MessageFactory(); + + // Given a Descriptor, gets or constructs the default (prototype) Message + // of that type. You can then call that message's New() method to construct + // a mutable message of that type. + // + // Calling this method twice with the same Descriptor returns the same + // object. The returned object remains property of the factory. Also, any + // objects created by calling the prototype's New() method share some data + // with the prototype, so these must be destroyed before the MessageFactory + // is destroyed. + // + // The given descriptor must outlive the returned message, and hence must + // outlive the MessageFactory. + // + // Some implementations do not support all types. 
GetPrototype() will + // return nullptr if the descriptor passed in is not supported. + // + // This method may or may not be thread-safe depending on the implementation. + // Each implementation should document its own degree thread-safety. + virtual const Message* GetPrototype(const Descriptor* type) = 0; + + // Gets a MessageFactory which supports all generated, compiled-in messages. + // In other words, for any compiled-in type FooMessage, the following is true: + // MessageFactory::generated_factory()->GetPrototype( + // FooMessage::descriptor()) == FooMessage::default_instance() + // This factory supports all types which are found in + // DescriptorPool::generated_pool(). If given a descriptor from any other + // pool, GetPrototype() will return nullptr. (You can also check if a + // descriptor is for a generated message by checking if + // descriptor->file()->pool() == DescriptorPool::generated_pool().) + // + // This factory is 100% thread-safe; calling GetPrototype() does not modify + // any shared data. + // + // This factory is a singleton. The caller must not delete the object. + static MessageFactory* generated_factory(); + + // For internal use only: Registers a .proto file at static initialization + // time, to be placed in generated_factory. The first time GetPrototype() + // is called with a descriptor from this file, |register_messages| will be + // called, with the file name as the parameter. It must call + // InternalRegisterGeneratedMessage() (below) to register each message type + // in the file. This strange mechanism is necessary because descriptors are + // built lazily, so we can't register types by their descriptor until we + // know that the descriptor exists. |filename| must be a permanent string. + static void InternalRegisterGeneratedFile( + const google::protobuf::internal::DescriptorTable* table); + + // For internal use only: Registers a message type. 
Called only by the + // functions which are registered with InternalRegisterGeneratedFile(), + // above. + static void InternalRegisterGeneratedMessage(const Descriptor* descriptor, + const Message* prototype); + + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MessageFactory); +}; + +#define DECLARE_GET_REPEATED_FIELD(TYPE) \ + template <> \ + PROTOBUF_EXPORT const RepeatedField& \ + Reflection::GetRepeatedFieldInternal( \ + const Message& message, const FieldDescriptor* field) const; \ + \ + template <> \ + PROTOBUF_EXPORT RepeatedField* \ + Reflection::MutableRepeatedFieldInternal( \ + Message * message, const FieldDescriptor* field) const; + +DECLARE_GET_REPEATED_FIELD(int32) +DECLARE_GET_REPEATED_FIELD(int64) +DECLARE_GET_REPEATED_FIELD(uint32) +DECLARE_GET_REPEATED_FIELD(uint64) +DECLARE_GET_REPEATED_FIELD(float) +DECLARE_GET_REPEATED_FIELD(double) +DECLARE_GET_REPEATED_FIELD(bool) + +#undef DECLARE_GET_REPEATED_FIELD + +// Tries to downcast this message to a generated message type. Returns nullptr +// if this class is not an instance of T. This works even if RTTI is disabled. +// +// This also has the effect of creating a strong reference to T that will +// prevent the linker from stripping it out at link time. This can be important +// if you are using a DynamicMessageFactory that delegates to the generated +// factory. +template +const T* DynamicCastToGenerated(const Message* from) { + // Compile-time assert that T is a generated type that has a + // default_instance() accessor, but avoid actually calling it. + const T& (*get_default_instance)() = &T::default_instance; + (void)get_default_instance; + + // Compile-time assert that T is a subclass of google::protobuf::Message. + const Message* unused = static_cast(nullptr); + (void)unused; + +#if PROTOBUF_RTTI + return dynamic_cast(from); +#else + bool ok = T::default_instance().GetReflection() == from->GetReflection(); + return ok ? 
down_cast(from) : nullptr; +#endif +} + +template +T* DynamicCastToGenerated(Message* from) { + const Message* message_const = from; + return const_cast(DynamicCastToGenerated(message_const)); +} + +// Call this function to ensure that this message's reflection is linked into +// the binary: +// +// google::protobuf::LinkMessageReflection(); +// +// This will ensure that the following lookup will succeed: +// +// DescriptorPool::generated_pool()->FindMessageTypeByName("FooMessage"); +// +// As a side-effect, it will also guarantee that anything else from the same +// .proto file will also be available for lookup in the generated pool. +// +// This function does not actually register the message, so it does not need +// to be called before the lookup. However it does need to occur in a function +// that cannot be stripped from the binary (ie. it must be reachable from main). +// +// Best practice is to call this function as close as possible to where the +// reflection is actually needed. This function is very cheap to call, so you +// should not need to worry about its runtime overhead except in the tightest +// of loops (on x86-64 it compiles into two "mov" instructions). +template +void LinkMessageReflection() { + internal::StrongReference(T::default_instance); +} + +// ============================================================================= +// Implementation details for {Get,Mutable}RawRepeatedPtrField. We provide +// specializations for , and and +// handle everything else with the default template which will match any type +// having a method with signature "static const google::protobuf::Descriptor* +// descriptor()". Such a type presumably is a descendant of google::protobuf::Message. 
+ +template <> +inline const RepeatedPtrField& +Reflection::GetRepeatedPtrFieldInternal( + const Message& message, const FieldDescriptor* field) const { + return *static_cast*>( + MutableRawRepeatedString(const_cast(&message), field, true)); +} + +template <> +inline RepeatedPtrField* +Reflection::MutableRepeatedPtrFieldInternal( + Message* message, const FieldDescriptor* field) const { + return static_cast*>( + MutableRawRepeatedString(message, field, true)); +} + + +// ----- + +template <> +inline const RepeatedPtrField& Reflection::GetRepeatedPtrFieldInternal( + const Message& message, const FieldDescriptor* field) const { + return *static_cast*>(GetRawRepeatedField( + message, field, FieldDescriptor::CPPTYPE_MESSAGE, -1, nullptr)); +} + +template <> +inline RepeatedPtrField* Reflection::MutableRepeatedPtrFieldInternal( + Message* message, const FieldDescriptor* field) const { + return static_cast*>(MutableRawRepeatedField( + message, field, FieldDescriptor::CPPTYPE_MESSAGE, -1, nullptr)); +} + +template +inline const RepeatedPtrField& Reflection::GetRepeatedPtrFieldInternal( + const Message& message, const FieldDescriptor* field) const { + return *static_cast*>( + GetRawRepeatedField(message, field, FieldDescriptor::CPPTYPE_MESSAGE, -1, + PB::default_instance().GetDescriptor())); +} + +template +inline RepeatedPtrField* Reflection::MutableRepeatedPtrFieldInternal( + Message* message, const FieldDescriptor* field) const { + return static_cast*>( + MutableRawRepeatedField(message, field, FieldDescriptor::CPPTYPE_MESSAGE, + -1, PB::default_instance().GetDescriptor())); +} + +template +const Type& Reflection::DefaultRaw(const FieldDescriptor* field) const { + return *reinterpret_cast(schema_.GetFieldDefault(field)); +} +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_MESSAGE_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..857faa035fb8c2f0501dd69eac03a24b70f401a9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection_ops.h @@ -0,0 +1,96 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This header is logically internal, but is made public because it is used +// from protocol-compiler-generated code, which may reside in other components. + +#ifndef GOOGLE_PROTOBUF_REFLECTION_OPS_H__ +#define GOOGLE_PROTOBUF_REFLECTION_OPS_H__ + +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +namespace google { +namespace protobuf { +namespace internal { + +// Basic operations that can be performed using reflection. +// These can be used as a cheap way to implement the corresponding +// methods of the Message interface, though they are likely to be +// slower than implementations tailored for the specific message type. +// +// This class should stay limited to operations needed to implement +// the Message interface. +// +// This class is really a namespace that contains only static methods. 
+class PROTOBUF_EXPORT ReflectionOps { + public: + static void Copy(const Message& from, Message* to); + static void Merge(const Message& from, Message* to); + static void Clear(Message* message); + static bool IsInitialized(const Message& message); + static bool IsInitialized(const Message& message, bool check_fields, + bool check_descendants); + static void DiscardUnknownFields(Message* message); + + // Finds all unset required fields in the message and adds their full + // paths (e.g. "foo.bar[5].baz") to *names. "prefix" will be attached to + // the front of each name. + static void FindInitializationErrors(const Message& message, + const std::string& prefix, + std::vector* errors); + + private: + // All methods are static. No need to construct. + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ReflectionOps); +}; + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_REFLECTION_OPS_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/source_context.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/source_context.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..f3d36923e0507a71255343c2c56d6853d774babe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/source_context.pb.h @@ -0,0 +1,300 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! 
+// source: google/protobuf/source_context.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fsource_5fcontext_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fsource_5fcontext_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fsource_5fcontext_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. 
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fsource_5fcontext_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fsource_5fcontext_2eproto; +PROTOBUF_NAMESPACE_OPEN +class SourceContext; +class SourceContextDefaultTypeInternal; +PROTOBUF_EXPORT extern SourceContextDefaultTypeInternal _SourceContext_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::SourceContext* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT SourceContext PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.SourceContext) */ { + public: + inline SourceContext() : SourceContext(nullptr) {} + virtual ~SourceContext(); + + SourceContext(const SourceContext& from); + SourceContext(SourceContext&& from) noexcept + : SourceContext() { + *this = ::std::move(from); + } + + inline SourceContext& operator=(const SourceContext& from) { + CopyFrom(from); + return *this; + } + inline SourceContext& operator=(SourceContext&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; 
+ } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const SourceContext& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const SourceContext* internal_default_instance() { + return reinterpret_cast( + &_SourceContext_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(SourceContext& a, SourceContext& b) { + a.Swap(&b); + } + inline void Swap(SourceContext* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(SourceContext* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline SourceContext* New() const final { + return CreateMaybeMessage(nullptr); + } + + SourceContext* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const SourceContext& from); + void MergeFrom(const SourceContext& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int 
GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(SourceContext* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.SourceContext"; + } + protected: + explicit SourceContext(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fsource_5fcontext_2eproto); + return ::descriptor_table_google_2fprotobuf_2fsource_5fcontext_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kFileNameFieldNumber = 1, + }; + // string file_name = 1; + void clear_file_name(); + const std::string& file_name() const; + void set_file_name(const std::string& value); + void set_file_name(std::string&& value); + void set_file_name(const char* value); + void set_file_name(const char* value, size_t size); + std::string* mutable_file_name(); + std::string* release_file_name(); + void set_allocated_file_name(std::string* file_name); + private: + const std::string& _internal_file_name() const; + void _internal_set_file_name(const std::string& value); + std::string* _internal_mutable_file_name(); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.SourceContext) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + 
typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr file_name_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fsource_5fcontext_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// SourceContext + +// string file_name = 1; +inline void SourceContext::clear_file_name() { + file_name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline const std::string& SourceContext::file_name() const { + // @@protoc_insertion_point(field_get:google.protobuf.SourceContext.file_name) + return _internal_file_name(); +} +inline void SourceContext::set_file_name(const std::string& value) { + _internal_set_file_name(value); + // @@protoc_insertion_point(field_set:google.protobuf.SourceContext.file_name) +} +inline std::string* SourceContext::mutable_file_name() { + // @@protoc_insertion_point(field_mutable:google.protobuf.SourceContext.file_name) + return _internal_mutable_file_name(); +} +inline const std::string& SourceContext::_internal_file_name() const { + return file_name_.Get(); +} +inline void SourceContext::_internal_set_file_name(const std::string& value) { + + file_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void SourceContext::set_file_name(std::string&& value) { + + file_name_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.SourceContext.file_name) +} +inline void SourceContext::set_file_name(const char* value) { + GOOGLE_DCHECK(value != nullptr); + + 
file_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.SourceContext.file_name) +} +inline void SourceContext::set_file_name(const char* value, + size_t size) { + + file_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceContext.file_name) +} +inline std::string* SourceContext::_internal_mutable_file_name() { + + return file_name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* SourceContext::release_file_name() { + // @@protoc_insertion_point(field_release:google.protobuf.SourceContext.file_name) + return file_name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void SourceContext::set_allocated_file_name(std::string* file_name) { + if (file_name != nullptr) { + + } else { + + } + file_name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), file_name, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:google.protobuf.SourceContext.file_name) +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fsource_5fcontext_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/wire_format.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/wire_format.h new file mode 100644 index 0000000000000000000000000000000000000000..8e629b9ea5ced2c75792919eeed8be612fc15108 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/wire_format.h @@ -0,0 +1,412 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// atenasio@google.com (Chris Atenasio) (ZigZag transform) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This header is logically internal, but is made public because it is used +// from protocol-compiler-generated code, which may reside in other components. + +#ifndef GOOGLE_PROTOBUF_WIRE_FORMAT_H__ +#define GOOGLE_PROTOBUF_WIRE_FORMAT_H__ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +namespace google { +namespace protobuf { +class UnknownFieldSet; // unknown_field_set.h +} // namespace protobuf +} // namespace google + +namespace google { +namespace protobuf { +namespace internal { + +// This class is for internal use by the protocol buffer library and by +// protocol-compiler-generated message classes. It must not be called +// directly by clients. +// +// This class contains code for implementing the binary protocol buffer +// wire format via reflection. The WireFormatLite class implements the +// non-reflection based routines. 
+// +// This class is really a namespace that contains only static methods +class PROTOBUF_EXPORT WireFormat { + public: + // Given a field return its WireType + static inline WireFormatLite::WireType WireTypeForField( + const FieldDescriptor* field); + + // Given a FieldDescriptor::Type return its WireType + static inline WireFormatLite::WireType WireTypeForFieldType( + FieldDescriptor::Type type); + + // Compute the byte size of a tag. For groups, this includes both the start + // and end tags. + static inline size_t TagSize(int field_number, FieldDescriptor::Type type); + + // These procedures can be used to implement the methods of Message which + // handle parsing and serialization of the protocol buffer wire format + // using only the Reflection interface. When you ask the protocol + // compiler to optimize for code size rather than speed, it will implement + // those methods in terms of these procedures. Of course, these are much + // slower than the specialized implementations which the protocol compiler + // generates when told to optimize for speed. + + // Read a message in protocol buffer wire format. + // + // This procedure reads either to the end of the input stream or through + // a WIRETYPE_END_GROUP tag ending the message, whichever comes first. + // It returns false if the input is invalid. + // + // Required fields are NOT checked by this method. You must call + // IsInitialized() on the resulting message yourself. + static bool ParseAndMergePartial(io::CodedInputStream* input, + Message* message); + + // This is meant for internal protobuf use (WireFormat is an internal class). + // This is the reflective implementation of the _InternalParse functionality. + static const char* _InternalParse(Message* msg, const char* ptr, + internal::ParseContext* ctx); + + // Serialize a message in protocol buffer wire format. + // + // Any embedded messages within the message must have their correct sizes + // cached. 
However, the top-level message need not; its size is passed as + // a parameter to this procedure. + // + // These return false iff the underlying stream returns a write error. + static void SerializeWithCachedSizes(const Message& message, int size, + io::CodedOutputStream* output) { + int expected_endpoint = output->ByteCount() + size; + output->SetCur( + _InternalSerialize(message, output->Cur(), output->EpsCopy())); + GOOGLE_CHECK_EQ(output->ByteCount(), expected_endpoint) + << ": Protocol message serialized to a size different from what was " + "originally expected. Perhaps it was modified by another thread " + "during serialization?"; + } + static uint8* _InternalSerialize(const Message& message, uint8* target, + io::EpsCopyOutputStream* stream); + + // Implements Message::ByteSize() via reflection. WARNING: The result + // of this method is *not* cached anywhere. However, all embedded messages + // will have their ByteSize() methods called, so their sizes will be cached. + // Therefore, calling this method is sufficient to allow you to call + // WireFormat::SerializeWithCachedSizes() on the same object. + static size_t ByteSize(const Message& message); + + // ----------------------------------------------------------------- + // Helpers for dealing with unknown fields + + // Skips a field value of the given WireType. The input should start + // positioned immediately after the tag. If unknown_fields is non-NULL, + // the contents of the field will be added to it. + static bool SkipField(io::CodedInputStream* input, uint32 tag, + UnknownFieldSet* unknown_fields); + + // Reads and ignores a message from the input. If unknown_fields is + // non-NULL, the contents will be added to it. + static bool SkipMessage(io::CodedInputStream* input, + UnknownFieldSet* unknown_fields); + + // Read a packed enum field. If the is_valid function is not NULL, values + // for which is_valid(value) returns false are appended to + // unknown_fields_stream. 
+ static bool ReadPackedEnumPreserveUnknowns(io::CodedInputStream* input, + uint32 field_number, + bool (*is_valid)(int), + UnknownFieldSet* unknown_fields, + RepeatedField* values); + + // Write the contents of an UnknownFieldSet to the output. + static void SerializeUnknownFields(const UnknownFieldSet& unknown_fields, + io::CodedOutputStream* output) { + output->SetCur(InternalSerializeUnknownFieldsToArray( + unknown_fields, output->Cur(), output->EpsCopy())); + } + // Same as above, except writing directly to the provided buffer. + // Requires that the buffer have sufficient capacity for + // ComputeUnknownFieldsSize(unknown_fields). + // + // Returns a pointer past the last written byte. + static uint8* SerializeUnknownFieldsToArray( + const UnknownFieldSet& unknown_fields, uint8* target) { + io::EpsCopyOutputStream stream( + target, static_cast(ComputeUnknownFieldsSize(unknown_fields)), + io::CodedOutputStream::IsDefaultSerializationDeterministic()); + return InternalSerializeUnknownFieldsToArray(unknown_fields, target, + &stream); + } + static uint8* InternalSerializeUnknownFieldsToArray( + const UnknownFieldSet& unknown_fields, uint8* target, + io::EpsCopyOutputStream* stream); + + // Same thing except for messages that have the message_set_wire_format + // option. + static void SerializeUnknownMessageSetItems( + const UnknownFieldSet& unknown_fields, io::CodedOutputStream* output) { + output->SetCur(InternalSerializeUnknownMessageSetItemsToArray( + unknown_fields, output->Cur(), output->EpsCopy())); + } + // Same as above, except writing directly to the provided buffer. + // Requires that the buffer have sufficient capacity for + // ComputeUnknownMessageSetItemsSize(unknown_fields). + // + // Returns a pointer past the last written byte. 
+ static uint8* SerializeUnknownMessageSetItemsToArray( + const UnknownFieldSet& unknown_fields, uint8* target); + static uint8* InternalSerializeUnknownMessageSetItemsToArray( + const UnknownFieldSet& unknown_fields, uint8* target, + io::EpsCopyOutputStream* stream); + + // Compute the size of the UnknownFieldSet on the wire. + static size_t ComputeUnknownFieldsSize(const UnknownFieldSet& unknown_fields); + + // Same thing except for messages that have the message_set_wire_format + // option. + static size_t ComputeUnknownMessageSetItemsSize( + const UnknownFieldSet& unknown_fields); + + // Helper functions for encoding and decoding tags. (Inlined below and in + // _inl.h) + // + // This is different from MakeTag(field->number(), field->type()) in the + // case of packed repeated fields. + static uint32 MakeTag(const FieldDescriptor* field); + + // Parse a single field. The input should start out positioned immediately + // after the tag. + static bool ParseAndMergeField( + uint32 tag, + const FieldDescriptor* field, // May be NULL for unknown + Message* message, io::CodedInputStream* input); + + // Serialize a single field. + static void SerializeFieldWithCachedSizes( + const FieldDescriptor* field, // Cannot be NULL + const Message& message, io::CodedOutputStream* output) { + output->SetCur(InternalSerializeField(field, message, output->Cur(), + output->EpsCopy())); + } + static uint8* InternalSerializeField( + const FieldDescriptor* field, // Cannot be NULL + const Message& message, uint8* target, io::EpsCopyOutputStream* stream); + + // Compute size of a single field. If the field is a message type, this + // will call ByteSize() for the embedded message, insuring that it caches + // its size. + static size_t FieldByteSize(const FieldDescriptor* field, // Cannot be NULL + const Message& message); + + // Parse/serialize a MessageSet::Item group. Used with messages that use + // option message_set_wire_format = true. 
+ static bool ParseAndMergeMessageSetItem(io::CodedInputStream* input, + Message* message); + static void SerializeMessageSetItemWithCachedSizes( + const FieldDescriptor* field, const Message& message, + io::CodedOutputStream* output) { + output->SetCur(InternalSerializeMessageSetItem( + field, message, output->Cur(), output->EpsCopy())); + } + static uint8* InternalSerializeMessageSetItem( + const FieldDescriptor* field, const Message& message, uint8* target, + io::EpsCopyOutputStream* stream); + static size_t MessageSetItemByteSize(const FieldDescriptor* field, + const Message& message); + + // Computes the byte size of a field, excluding tags. For packed fields, it + // only includes the size of the raw data, and not the size of the total + // length, but for other length-delimited types, the size of the length is + // included. + static size_t FieldDataOnlyByteSize( + const FieldDescriptor* field, // Cannot be NULL + const Message& message); + + enum Operation { + PARSE = 0, + SERIALIZE = 1, + }; + + // Verifies that a string field is valid UTF8, logging an error if not. + // This function will not be called by newly generated protobuf code + // but remains present to support existing code. + static void VerifyUTF8String(const char* data, int size, Operation op); + // The NamedField variant takes a field name in order to produce an + // informative error message if verification fails. + static void VerifyUTF8StringNamedField(const char* data, int size, + Operation op, const char* field_name); + + private: + struct MessageSetParser; + // Skip a MessageSet field. + static bool SkipMessageSetField(io::CodedInputStream* input, + uint32 field_number, + UnknownFieldSet* unknown_fields); + + // Parse a MessageSet field. + static bool ParseAndMergeMessageSetField(uint32 field_number, + const FieldDescriptor* field, + Message* message, + io::CodedInputStream* input); + // Parses the value from the wire that belongs to tag. 
+ static const char* _InternalParseAndMergeField(Message* msg, const char* ptr, + internal::ParseContext* ctx, + uint64 tag, + const Reflection* reflection, + const FieldDescriptor* field); + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(WireFormat); +}; + +// Subclass of FieldSkipper which saves skipped fields to an UnknownFieldSet. +class PROTOBUF_EXPORT UnknownFieldSetFieldSkipper : public FieldSkipper { + public: + UnknownFieldSetFieldSkipper(UnknownFieldSet* unknown_fields) + : unknown_fields_(unknown_fields) {} + ~UnknownFieldSetFieldSkipper() override {} + + // implements FieldSkipper ----------------------------------------- + bool SkipField(io::CodedInputStream* input, uint32 tag) override; + bool SkipMessage(io::CodedInputStream* input) override; + void SkipUnknownEnum(int field_number, int value) override; + + protected: + UnknownFieldSet* unknown_fields_; +}; + +// inline methods ==================================================== + +inline WireFormatLite::WireType WireFormat::WireTypeForField( + const FieldDescriptor* field) { + if (field->is_packed()) { + return WireFormatLite::WIRETYPE_LENGTH_DELIMITED; + } else { + return WireTypeForFieldType(field->type()); + } +} + +inline WireFormatLite::WireType WireFormat::WireTypeForFieldType( + FieldDescriptor::Type type) { + // Some compilers don't like enum -> enum casts, so we implicit_cast to + // int first. + return WireFormatLite::WireTypeForFieldType( + static_cast(implicit_cast(type))); +} + +inline uint32 WireFormat::MakeTag(const FieldDescriptor* field) { + return WireFormatLite::MakeTag(field->number(), WireTypeForField(field)); +} + +inline size_t WireFormat::TagSize(int field_number, + FieldDescriptor::Type type) { + // Some compilers don't like enum -> enum casts, so we implicit_cast to + // int first. 
+ return WireFormatLite::TagSize( + field_number, + static_cast(implicit_cast(type))); +} + +inline void WireFormat::VerifyUTF8String(const char* data, int size, + WireFormat::Operation op) { +#ifdef GOOGLE_PROTOBUF_UTF8_VALIDATION_ENABLED + WireFormatLite::VerifyUtf8String( + data, size, static_cast(op), NULL); +#else + // Avoid the compiler warning about unused variables. + (void)data; + (void)size; + (void)op; +#endif +} + +inline void WireFormat::VerifyUTF8StringNamedField(const char* data, int size, + WireFormat::Operation op, + const char* field_name) { +#ifdef GOOGLE_PROTOBUF_UTF8_VALIDATION_ENABLED + WireFormatLite::VerifyUtf8String( + data, size, static_cast(op), field_name); +#else + // Avoid the compiler warning about unused variables. + (void)data; + (void)size; + (void)op; + (void)field_name; +#endif +} + + +inline uint8* InternalSerializeUnknownMessageSetItemsToArray( + const UnknownFieldSet& unknown_fields, uint8* target, + io::EpsCopyOutputStream* stream) { + return WireFormat::InternalSerializeUnknownMessageSetItemsToArray( + unknown_fields, target, stream); +} + +inline size_t ComputeUnknownMessageSetItemsSize( + const UnknownFieldSet& unknown_fields) { + return WireFormat::ComputeUnknownMessageSetItemsSize(unknown_fields); +} + +// Compute the size of the UnknownFieldSet on the wire. +PROTOBUF_EXPORT +size_t ComputeUnknownFieldsSize(const InternalMetadata& metadata, size_t size, + CachedSize* cached_size); + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_WIRE_FORMAT_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.h new file mode 100644 index 0000000000000000000000000000000000000000..14ff7bfb910fea278c05f79a2613669a9cf9e080 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.h @@ -0,0 +1,3993 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2016-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// C API + +#ifndef ONEAPI_DNNL_DNNL_H +#define ONEAPI_DNNL_DNNL_H + +#include "oneapi/dnnl/dnnl_common.h" +#include "oneapi/dnnl/dnnl_config.h" +#include "oneapi/dnnl/dnnl_types.h" +#include "oneapi/dnnl/dnnl_version.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_primitives +/// @{ + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Changes the primitive descriptor to point to the next available +/// implementation. +/// +/// @param primitive_desc A primitive descriptor to change. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +/// @returns #dnnl_last_impl_reached if no more implementations available, +/// in which case the primitive descriptor itself is kept unchanged. +dnnl_status_t DNNL_API dnnl_primitive_desc_next_impl( + dnnl_primitive_desc_t primitive_desc); + +/// Clones a primitive descriptor. The resulting primitive descriptor must be +/// destroyed separately. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param existing_primitive_desc Primitive descriptor to clone. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_desc_clone( + dnnl_primitive_desc_t *primitive_desc, + const_dnnl_primitive_desc_t existing_primitive_desc); + +/// Returns a constant reference to the attributes of a primitive descriptor. +/// +/// @warning +/// It is an error to destroy the resulting @p attr. +/// +/// @warning +/// The lifetime of an @p attr is the same as that of a @p +/// primitive_desc, so it is an error to use the @p attr once the @p +/// primitive_desc has been destroyed. +/// +/// @param primitive_desc Primitive descriptor. +/// @param attr Output primitive attributes. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_desc_get_attr( + const_dnnl_primitive_desc_t primitive_desc, + const_dnnl_primitive_attr_t *attr); + +/// Destroys a primitive descriptor. +/// +/// @param primitive_desc Primitive descriptor to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_desc_destroy( + dnnl_primitive_desc_t primitive_desc); + +/// Queries a primitive descriptor for various pieces of information. 
+/// +/// The most common use case is to query a primitive descriptor, created with +/// source, weights, and destination memory descriptors with format tags set +/// to #dnnl_format_tag_any, for the corresponding memory descriptors (in this +/// case the @p what is set to #dnnl_query_src_md, #dnnl_query_weights_md, and +/// #dnnl_query_dst_md respectively) so that it is possible to create memory +/// objects and reorder primitives if necessary. +/// +/// Another typical use case is to query a primitive descriptor for workspace +/// memory descriptor (with @p what set to #dnnl_query_workspace_md). If this +/// query returns #dnnl_not_required status, then workspace memory is not +/// required. +/// +/// @note +/// When querying for a memory descriptor for a scratchpad, a workspace, +/// or an optional parameter, the query will return a pointer to a zero +/// memory descriptor if the parameter is not needed. +/// +/// A few other use cases: +/// - query a primitive descriptor for the implementation information string +/// (#dnnl_query_impl_info_str) +/// - query a primitive descriptor for the number of inputs and outputs +/// (#dnnl_query_num_of_inputs_s32 and #dnnl_query_num_of_outputs_s32 +/// respectively) +/// +/// @sa dnnl_query_t for more options +/// +/// @param primitive_desc Primitive descriptor. +/// @param what Parameter to query. +/// @param index Index of the parameter to query for. +/// @param result Output result. The type depends on the query. For example, +/// it must be a @c dnnl_memory_desc_t* if querying for a memory +/// descriptor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_desc_query( + const_dnnl_primitive_desc_t primitive_desc, dnnl_query_t what, + int index, void *result); + +/// Queries primitive descriptor for a memory descriptor. +/// +/// @note +/// This function is a convenience version of +/// #dnnl_primitive_desc_query(). 
+/// +/// @param primitive_desc Primitive descriptor. +/// @param what Kind of memory descriptor parameter to query for. +/// @param index Index of the parameter to query. +/// @returns A pointer to the requested memory descriptor. +/// @returns A pointer to a zero memory descriptor if the parameter is not +/// needed. +/// @returns NULL in case of any error. +/// +const_dnnl_memory_desc_t DNNL_API dnnl_primitive_desc_query_md( + const_dnnl_primitive_desc_t primitive_desc, dnnl_query_t what, + int index); + +/// Queries primitive descriptor for a signed 32bit int. +/// +/// @note +/// This function is a convenience version of +/// #dnnl_primitive_desc_query(). +/// +/// @param primitive_desc Primitive descriptor. +/// @param what Kind of the value to query for. +/// @param index Index of the parameter to query. +/// @returns The requested value. +/// @returns 0 in case of any error (in particular if the queried entity is +/// not of type int32_t). Note that 0 may also be the actual returned +/// value. +int DNNL_API dnnl_primitive_desc_query_s32( + const_dnnl_primitive_desc_t primitive_desc, dnnl_query_t what, + int index); + +/// Creates a primitive. +/// +/// @param primitive Output primitive. +/// @param primitive_desc Primitive descriptor used to create the primitive. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_create(dnnl_primitive_t *primitive, + const_dnnl_primitive_desc_t primitive_desc); + +/// Creates a primitive from a cache blob. +/// +/// @param primitive Output primitive. +/// @param primitive_desc Primitive descriptor used to create the primitive. +/// @param size Size of the cache blob in bytes. +/// @param cache_blob Cache blob of size @p size. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_primitive_create_from_cache_blob( + dnnl_primitive_t *primitive, const_dnnl_primitive_desc_t primitive_desc, + size_t size, const uint8_t *cache_blob); + +/// Executes a primitive. +/// +/// @param primitive Primitive to execute. +/// @param stream Stream to use. +/// @param nargs Number of arguments. +/// @param args Array of arguments. Each argument is an +/// pair. The index is one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see +/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory +/// descriptor as that returned by +/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. + +/// @note If any argument in @p args is padded (padded_dims > +/// dims), the primitive execution will assume properly zero-padded +/// input arguments, and produce zero-padded output arguments. +dnnl_status_t DNNL_API dnnl_primitive_execute(const_dnnl_primitive_t primitive, + dnnl_stream_t stream, int nargs, const dnnl_exec_arg_t *args); + +/// Retrieves a constant reference to the primitive descriptor of a given +/// primitive. +/// +/// @warning +/// It is an error to destroy the returned object. It is owned by the +/// primitive. The @c const qualifier of the returned object prevents +/// such attempts. +/// +/// @param primitive Primitive to query for the primitive descriptor. +/// @param primitive_desc Output primitive descriptor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_get_primitive_desc( + const_dnnl_primitive_t primitive, + const_dnnl_primitive_desc_t *primitive_desc); + +/// Retrieves a cache blob associated with the given primitive. +/// +/// @param primitive Primitive to query for the cache blob. +/// @param size Size of the cache blob in bytes. +/// @param cache_blob Cache blob of size @p size. 
If the @p cache_blob is +/// nullptr then the size of the cache blob is returned in @p size. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +/// +/// @note The cache blob can be empty. It's the user's responsibility to check +/// whether it's empty prior to passing it to +/// #dnnl_primitive_create_from_cache_blob(). +dnnl_status_t DNNL_API dnnl_primitive_get_cache_blob( + const_dnnl_primitive_t primitive, size_t *size, uint8_t *cache_blob); + +/// Destroys a primitive. +/// +/// @param primitive The primitive to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_destroy(dnnl_primitive_t primitive); + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_attributes +/// @{ + +/// Creates an empty (default) primitive attributes with all the parameters +/// set to their default values. +/// +/// Empty attributes are implied whenever the respective argument is NULL. +/// +/// @param attr Output primitive attributes. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_create(dnnl_primitive_attr_t *attr); + +/// Clones primitive attributes. +/// +/// @param attr Output primitive attributes. +/// @param existing_attr Primitive attributes to clone. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_clone( + dnnl_primitive_attr_t *attr, const_dnnl_primitive_attr_t existing_attr); + +/// Destroys primitive attributes. +/// +/// @param attr Primitive attributes to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_destroy(dnnl_primitive_attr_t attr); + +/// Returns probability for output dropout primitive attribute. +/// +/// @param attr Primitive attributes. 
+/// @param dropout_desc Output dropout memory descriptor +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_dropout( + const_dnnl_primitive_attr_t attr, + const_dnnl_memory_desc_t *dropout_desc); + +/// Sets probability for output dropout primitive attribute. +/// +/// @param attr Primitive attributes. +/// @param dropout_desc Output dropout memory descriptor +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_dropout( + dnnl_primitive_attr_t attr, const_dnnl_memory_desc_t dropout_desc); + +/// Returns the floating-point math mode primitive attribute. +/// +/// @param attr Primitive attributes. +/// @param mode Output FP math mode. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_fpmath_mode( + const_dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t *mode); + +/// Sets the floating-point math mode primitive attributes. +/// +/// @param attr Primitive attributes. +/// @param mode FP math mode. The possible values are: +/// #dnnl_fpmath_mode_strict (default), +/// #dnnl_fpmath_mode_bf16, +/// #dnnl_fpmath_mode_f16, +/// #dnnl_fpmath_mode_tf32, +/// #dnnl_fpmath_mode_any. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_fpmath_mode( + dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t mode); + +/// Returns the floating-point math mode primitive attribute. +/// +/// @param attr Primitive attributes. +/// @param mode Output FP math mode. +/// @param apply_to_int Output use floating-point arithmetic for integer primitives. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_fpmath_mode_v2( + const_dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t *mode, + int *apply_to_int); + +/// Sets the floating-point math mode primitive attributes. +/// +/// @param attr Primitive attributes. +/// @param mode FP math mode. The possible values are: +/// #dnnl_fpmath_mode_strict (default), +/// #dnnl_fpmath_mode_bf16, +/// #dnnl_fpmath_mode_f16, +/// #dnnl_fpmath_mode_tf32, +/// #dnnl_fpmath_mode_any. +/// @param apply_to_int Boolean. Use of floating-point arithmetic for integer primitives. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_fpmath_mode_v2( + dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t mode, int apply_to_int); + +/// Returns the deterministic primitive attribute value. +/// +/// @param attr Primitive attributes. +/// @param value Output deterministic attribute value +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_deterministic( + const_dnnl_primitive_attr_t attr, int *value); + +/// Sets the deterministic primitive attribute value. +/// +/// @param attr Primitive attributes. +/// @param value Boolean value to set deterministic attribute. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_deterministic( + dnnl_primitive_attr_t attr, int value); + +/// Returns the accumulation mode primitive attribute. +/// +/// @param attr Primitive attributes. +/// @param mode Output accumulation mode. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_accumulation_mode( + const_dnnl_primitive_attr_t attr, dnnl_accumulation_mode_t *mode); + +/// Sets the accumulation mode primitive attribute. +/// +/// @param attr Primitive attributes. 
+/// @param mode Accumulation mode. The possible values are: +/// #dnnl_accumulation_mode_strict (default), which is s32 for quantized primitives, f32/f64 otherwise +/// #dnnl_accumulation_mode_relaxed, which is same as strict but allows intermediate accumulators to be in src/dst datatype +/// #dnnl_accumulation_mode_any, which allows accumulators to be src/dst datatype or any wider type. +/// #dnnl_accumulation_mode_f32, +/// #dnnl_accumulation_mode_s32, +/// #dnnl_accumulation_mode_f16. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_accumulation_mode( + dnnl_primitive_attr_t attr, dnnl_accumulation_mode_t mode); + +/// Returns the primitive attributes scratchpad mode. +/// +/// @param attr Primitive attributes. +/// @param mode Output scratchpad mode. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_scratchpad_mode( + const_dnnl_primitive_attr_t attr, dnnl_scratchpad_mode_t *mode); + +/// Sets primitive attributes scratchpad mode. +/// +/// @param attr Primitive attributes. +/// @param mode Scratchpad mode. The possible values are: +/// #dnnl_scratchpad_mode_library (default) and +/// #dnnl_scratchpad_mode_user. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_scratchpad_mode( + dnnl_primitive_attr_t attr, dnnl_scratchpad_mode_t mode); + +/// Sets primitive attributes scaling factors for primitive operations for a +/// given memory argument. The scaling factors must be passed at execution time +/// as an argument with index #DNNL_ARG_ATTR_SCALES | arg. +/// +/// @sa dnnl_primitive_attr_set_scales_mask +/// +/// +/// @param attr Primitive attributes. +/// @param arg Parameter argument index as passed to the +/// dnnl_primitive_execute() call. 
+/// @param mask Scaling factors correspondence mask that defines the +/// correspondence between the tensor dimensions and the @p scales array. +/// The set i-th bit indicates that a dedicated scaling factor is used for +/// each index along that dimension. Set the mask to 0 to use a common +/// scaling factor for the whole output tensor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales_mask( + dnnl_primitive_attr_t attr, int arg, int mask); + +/// Sets primitive attributes scaling factors for primitive operations for a +/// given memory argument. The scaling factors must be passed at execution time +/// as an argument with index #DNNL_ARG_ATTR_SCALES | arg. +/// +/// @sa dnnl_primitive_attr_set_scales +/// +/// +/// @param attr Primitive attributes. +/// @param arg Parameter argument index as passed to the +/// dnnl_primitive_execute() call. +/// @param mask Scaling factors correspondence mask that defines the +/// correspondence between the tensor dimensions and the @p scales array. +/// The set i-th bit indicates that a dedicated scaling factor is used for +/// each index along that dimension. Set the mask to 0 to use a common +/// scaling factor for the whole output tensor. +/// @param ndims Number of group dimensions. +/// @param group_dims Scaling factors correspondence groups that define the +/// correspondence between the tensor dimensions and the scales array. +/// The group dimensions should only be provided for each logical dimension +/// that has correspondence mask @p mask set. +/// @param data_type Scaling factors data_type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales( + dnnl_primitive_attr_t attr, int arg, int mask, int ndims, + const dnnl_dims_t group_dims, dnnl_data_type_t data_type); + +/// Sets primitive attributes zero points for primitive operations for a given +/// memory argument. The zero points must be passed at execution time +/// as an argument with index #DNNL_ARG_ATTR_ZERO_POINTS | arg. +/// +/// @sa dnnl_primitive_attr_set_zero_points_mask +/// +/// +/// @param attr Primitive attributes. +/// @param arg Parameter argument index as passed to the +/// dnnl_primitive_execute() call. +/// @param mask Zero point correspondence mask that defines the +/// correspondence between the tensor dimensions and the @p +/// zero_points array. The set i-th bit indicates that a dedicated +/// zero point is used for each index along that dimension. Set the +/// mask to 0 to use a common zero point for the whole output tensor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points_mask( + dnnl_primitive_attr_t attr, int arg, int mask); + +/// Sets primitive attributes zero points for primitive operations for a given +/// memory argument. The zero points must be passed at execution time +/// as an argument with index #DNNL_ARG_ATTR_ZERO_POINTS | arg. +/// +/// @sa dnnl_primitive_attr_set_zero_points +/// +/// +/// @param attr Primitive attributes. +/// @param arg Parameter argument index as passed to the +/// dnnl_primitive_execute() call. +/// @param mask Zero point correspondence mask that defines the +/// correspondence between the tensor dimensions and the @p +/// zero_points array. The set i-th bit indicates that a dedicated +/// zero point is used for each index along that dimension. Set the +/// mask to 0 to use a common zero point for the whole output tensor. +/// @param ndims Number of group dimensions. 
+/// @param group_dims Zero point factors correspondence groups that define the +/// correspondence between the tensor dimensions and the zero_points array. +/// The group dimensions should be only provided for each logical dimension +/// that has the bit set correspondence mask @p mask set. +/// @param data_type Zero points factors data_type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points( + dnnl_primitive_attr_t attr, int arg, int mask, int ndims, + const dnnl_dims_t group_dims, dnnl_data_type_t data_type); + +/// Sets the rounding mode attribute value for a given argument +/// +/// @param attr Primitive attributes. +/// @param arg Argument for which rounding mode should be set. +/// @param mode Rounding mode to apply to the argument. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_rounding( + dnnl_primitive_attr_t attr, int arg, dnnl_rounding_mode_t mode); + +/// Returns the rounding mode attribute value for a given argument +/// +/// @param attr Primitive attributes. +/// @param arg Argument for which rounding mode query applies. +/// @param mode Output rounding mode. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_rounding( + dnnl_primitive_attr_t attr, int arg, dnnl_rounding_mode_t *mode); + +/// Returns primitive attributes post-ops. +/// +/// @warning +/// The output @p post_ops points to the internal @p attr field, so it is +/// an error to modify or destroy them. The lifetime of @p post_ops is +/// the same as that of the @p attr it belongs to, so it is an error to +/// use @p post_ops after @p attr has been destroyed. +/// +/// @param attr Primitive attributes. +/// @param post_ops Output post-ops. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_post_ops( + const_dnnl_primitive_attr_t attr, const_dnnl_post_ops_t *post_ops); + +/// Sets primitive attributes post-ops. +/// +/// @note +/// There is no way to check whether the post-ops would be supported by +/// the target primitive. Any error will be reported by the +/// dnnl__[propagation kind]_primitive_desc_create() function call. +/// +/// @param attr Primitive attributes. +/// @param post_ops Post-ops to set. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_post_ops( + dnnl_primitive_attr_t attr, const_dnnl_post_ops_t post_ops); + +/// Creates empty post-ops sequence. +/// +/// @param post_ops Output post-ops. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_create(dnnl_post_ops_t *post_ops); + +/// Clones post-ops primitive attribute. +/// +/// @param post_ops Output post-ops primitive attribute. +/// @param existing_post_ops Post-ops primitive attribute to clone. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_clone( + dnnl_post_ops_t *post_ops, const_dnnl_post_ops_t existing_post_ops); + +/// Destroys post-ops. +/// +/// @param post_ops Post-ops to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_destroy(dnnl_post_ops_t post_ops); + +/// Returns the length of post-ops. +/// +/// @param post_ops Post-ops. +/// @returns The number of post-ops entries. +int DNNL_API dnnl_post_ops_len(const_dnnl_post_ops_t post_ops); + +/// Returns the kind of a post-op entry. +/// +/// @param post_ops Post-ops. +/// @param index Post-op entry index. 
+/// @returns The kind of the post-op with the specified index. +/// @returns #dnnl_undefined_primitive if there is no post-op at the specified +/// index. +dnnl_primitive_kind_t DNNL_API dnnl_post_ops_get_kind( + const_dnnl_post_ops_t post_ops, int index); + +/// Appends an accumulation v3 (sum) to post-ops. Prior to accumulating the +/// result, a zero point is subtracted from the previous value and is +/// multiplied by the scale. +/// +/// The kind of this post-op is #dnnl_sum. +/// +/// This feature may improve performance for cases like dequantize the +/// asymmetrically quantized sum's src1 tensor to f32 domain before performing +/// the sum operation by subtracting the @p zero_point before the scaling. +/// +/// In the simplest case where accumulation is the only post-op, the +/// computations will be: +/// +/// dst[:] <- scale * (dst[:] - zero_point) + op(...) +/// // instead of dst[:] <- op(...) +/// +/// If @p data_type is specified, original dst tensor will be reinterpreted +/// as a tensor with provided data type. Since it is reinterpretation, +/// data_type and dst data type should have the same size. +/// As a result, computations will be: +/// +/// dst[:] <- scale * (as_data_type(dst[:]) - zero_point) + op(...) +/// // instead of dst[:] <- op(...) +/// @note +/// This post-op executes in-place and does not change the +/// destination layout. +/// +/// @param post_ops Post-ops. +/// @param scale Accumulation scaling factor. +/// @param zero_point Single scalar int32_t value of zero point. +/// @param data_type Accumulation data_type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_append_sum(dnnl_post_ops_t post_ops, + float scale, int32_t zero_point, dnnl_data_type_t data_type); + +/// Returns the parameters of an accumulation (sum) post-op with +/// zero point and data type parameter. +/// +/// @param post_ops Post-ops. +/// @param index Index of the sum post-op. 
+/// @param scale Output accumulation scaling factor. +/// @param zero_point Zero point. +/// @param data_type Data type for accumulation. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_get_params_sum( + const_dnnl_post_ops_t post_ops, int index, float *scale, + int32_t *zero_point, dnnl_data_type_t *data_type); + +/// Appends an elementwise post-op. +/// +/// The kind of this post operation is #dnnl_eltwise. +/// +/// In the simplest case when the elementwise is the only post operation, the +/// computations would be: +/// +/// dst[:] <- eltwise_op (op(...)) // instead of dst[:] <- op(...) +/// +/// where eltwise_op is configured with the given parameters. +/// +/// @param post_ops Post-ops. +/// @param alg_kind Elementwise algorithm for the post-op. +/// @param alpha Alpha parameter for the elementwise algorithm. +/// @param beta Beta parameter for the elementwise algorithm. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_append_eltwise(dnnl_post_ops_t post_ops, + dnnl_alg_kind_t alg_kind, float alpha, float beta); + +/// Returns the parameters of an elementwise post-op. +/// +/// @param post_ops Post-ops. +/// @param index Index of the elementwise post-op. +/// @param alg_kind Output elementwise algorithm kind. +/// @param alpha Output alpha parameter for the elementwise algorithm. +/// @param beta Output beta parameter for the elementwise algorithm. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +/// @returns #dnnl_invalid_arguments if @p index does not refer to an +/// elementwise post-op. +dnnl_status_t DNNL_API dnnl_post_ops_get_params_eltwise( + const_dnnl_post_ops_t post_ops, int index, dnnl_alg_kind_t *alg_kind, + float *alpha, float *beta); + +/// Appends a depthwise post-op convolution. 
+/// +/// This post-op can only be fused with a 2D 1x1 convolution (convolution with +/// weights spatial dimensions equal to 1 i.e., kh=kw=1). +/// +/// The kind of this post-op is #dnnl_convolution. +/// +/// The number of outputs for primitive with fusion is one. The output spatial +/// size can be derived as below: +/// +/// output_height = ceil(output_height_1x1_convolution, stride) +/// output_width = ceil(output_width_1x1_convolution, stride) +/// +/// See @ref dev_guide_attributes_post_ops_depthwise and +/// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info. +/// +/// @param post_ops Post-ops. +/// @param weights_data_type Weights data type of depthwise post-op +/// @param bias_data_type Bias data type of depthwise post-op +/// @param dst_data_type Output data type of depthwise post-op +/// @param kernel_size Size of kernel of depthwise post-op +/// @param stride_size Size of stride of depthwise post-op +/// @param padding_l_size Size of left and top paddings of depthwise post-op +/// @returns #dnnl_success on success and a status describing the error +/// otherwise +dnnl_status_t DNNL_API dnnl_post_ops_append_dw(dnnl_post_ops_t post_ops, + dnnl_data_type_t weights_data_type, dnnl_data_type_t bias_data_type, + dnnl_data_type_t dst_data_type, dnnl_dim_t kernel_size, + dnnl_dim_t stride_size, dnnl_dim_t padding_l_size); + +/// Returns the parameters of a depthwise post-op. +/// +/// @param post_ops Post-ops. +/// @param index Index of the depthwise post-op.
+/// @param weights_data_type Weights data type of depthwise post-op +/// @param bias_data_type Bias data type of depthwise post-op +/// @param dst_data_type Output data type of depthwise post-op +/// @param kernel_size Size of kernel of depthwise post-op +/// @param stride_size Size of stride of depthwise post-op +/// @param padding_l_size Size of left and top paddings of depthwise post-op +/// @returns #dnnl_success on success and a status describing the error +/// otherwise +dnnl_status_t DNNL_API dnnl_post_ops_get_params_dw( + const_dnnl_post_ops_t post_ops, int index, + dnnl_data_type_t *weights_data_type, dnnl_data_type_t *bias_data_type, + dnnl_data_type_t *dst_data_type, dnnl_dim_t *kernel_size, + dnnl_dim_t *stride_size, dnnl_dim_t *padding_l_size); + +/// Appends a binary post-op. +/// +/// The kind of this post operation is #dnnl_binary. +/// +/// In the simplest case when the binary is the only post operation, the +/// computations would be: +/// +/// dst[:] <- binary_op (dst[:], another_input[:]) +/// +/// where binary_op is configured with the given parameters. binary_op supports +/// broadcast semantics for a second operand. +/// +/// @param post_ops Post-ops. +/// @param alg_kind Binary algorithm for the post-op. +/// @param src1_desc Memory descriptor of a second operand. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_append_binary(dnnl_post_ops_t post_ops, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src1_desc); + +/// Returns the parameters of a binary post-op. +/// +/// @param post_ops Post-ops. +/// @param index Index of the binary post-op. +/// @param alg_kind Output binary algorithm kind. +/// @param src1_desc Output memory descriptor of a second operand. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +/// @returns #dnnl_invalid_arguments if @p index does not refer to a binary +/// post-op. 
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_binary( + const_dnnl_post_ops_t post_ops, int index, dnnl_alg_kind_t *alg_kind, + const_dnnl_memory_desc_t *src1_desc); + +/// Appends a prelu forward post-op. +/// +/// The kind of this post-op is #dnnl::primitive::kind::prelu. +/// +/// The post-op can be defined as: +/// +/// dst[:] <- prelu(dst[:], weights[:]) +/// prelu: +/// dst[:] <- dst[:] if dst[:] > 0 +/// dst[:] <- dst[:] * weights[:] if dst[:] <= 0 +/// +/// +/// @note +/// The order of dimensions does not depend on how elements are laid +/// out in memory. For example: +/// - for a 2D CNN activations tensor the order is always (n, c) +/// - for a 4D CNN activations tensor the order is always (n, c, h, w) +/// - for a 5D CNN weights tensor the order is always +/// (g, oc, ic, kh, kw) +/// +/// Prelu weights tensor is passed in runtime execution phase. Prelu +/// weights tensor data type is implicitly assumed as f32 using plain +/// layout (a, ab, acb, acdb, acdeb) +/// +/// @param post_ops Post-ops. +/// @param mask Defines the correspondence between the output tensor +/// dimensions and the prelu weights tensor. The set i-th bit indicates +/// that a dedicated weights value is used for each index along that +/// dimension. Set the mask to 0 to use a common weights value +/// for the whole output tensor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_post_ops_append_prelu( + dnnl_post_ops_t post_ops, int mask); + +/// Returns the parameters of a prelu post-op. +/// +/// @param post_ops Post-ops. +/// @param index Index of the prelu post-op. +/// @param mask Mask of the prelu post-op. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_prelu( + const_dnnl_post_ops_t post_ops, int index, int *mask); + +/// @} dnnl_api_attributes + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_memory +/// @{ + +/// Destroys a memory descriptor. +/// +/// @param memory_desc Memory descriptor to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_destroy(dnnl_memory_desc_t memory_desc); + +/// Clones a memory descriptor. The resulting memory descriptor must be +/// destroyed separately. +/// +/// @param memory_desc Output memory descriptor. +/// @param existing_memory_desc Memory descriptor to clone. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_clone(dnnl_memory_desc_t *memory_desc, + const_dnnl_memory_desc_t existing_memory_desc); + +/// Retrieves a binary blob associated with the given memory descriptor +/// +/// @param blob Output pointer to binary blob. +/// If not nullptr, size bytes of the memory descriptor blob are written. +/// @param size Output pointer to the size of the binary blob in bytes. +/// Size is written if blob is nullptr. +/// @param memory_desc input memory descriptor to serialize +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_get_blob( + uint8_t *blob, size_t *size, const_dnnl_memory_desc_t memory_desc); + +/// Creates a memory descriptor from a memory descriptor binary blob. +/// +/// @param memory_desc Output pointer to a newly allocated memory descriptor. +/// @param blob Pointer to a memory descriptor binary blob. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_blob( + dnnl_memory_desc_t *memory_desc, const uint8_t *blob); + +/// Creates a memory descriptor using dimensions and strides. 
+/// +/// @note +/// As always, the logical order of dimensions corresponds to the `abc...` +/// format tag, and the physical meaning of the dimensions depends on both +/// the primitive that consumes the memory and the context of that +/// consumption. +/// +/// @param memory_desc Output memory descriptor. +/// @param ndims Number of dimensions +/// @param dims Array of dimensions. +/// @param data_type Elements data type. +/// @param strides Strides in each dimension. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_strides( + dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, + dnnl_data_type_t data_type, const dnnl_dims_t strides); + +/// Creates a memory descriptor using dimensions and memory format tag. +/// +/// @note +/// As always, the logical order of dimensions corresponds to the `abc...` +/// format tag, and the physical meaning of the dimensions depends on both +/// the primitive that consumes the memory and the context of that +/// consumption. +/// +/// @param memory_desc Output memory descriptor. +/// @param ndims Number of dimensions +/// @param dims Array of dimensions. +/// @param data_type Elements data type. +/// @param tag Memory format tag. Can be #dnnl_format_tag_any which would +/// allow a primitive to choose the final memory format. In this case the +/// format_kind field of the memory descriptor would be set to +/// #dnnl_format_kind_any. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_tag( + dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, + dnnl_data_type_t data_type, dnnl_format_tag_t tag); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Creates a memory descriptor for CSR encoding. +/// +/// @param memory_desc Output memory descriptor. +/// @param ndims Number of dimensions +/// @param dims Array of dimensions.
+/// @param data_type Elements data type. +/// @param nnz Number of non-zero entries. +/// @param indices_dt Data type of indices. +/// @param pointers_dt Data type of pointers. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_csr_encoding( + dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, + dnnl_data_type_t data_type, dnnl_dim_t nnz, dnnl_data_type_t indices_dt, + dnnl_data_type_t pointers_dt); + +/// Creates a memory descriptor for COO encoding. +/// +/// The created memory descriptor will describe a memory object that +/// contains n+1 buffers for an n-dimensional tensor. +/// The buffers have the following meaning and assigned numbers (index): +/// - 0: values +/// - 1: indices for dimension 0 +/// - 2: indices for dimension 1 ... +/// - n: indices for dimension n-1 +/// +/// @param memory_desc Output memory descriptor. +/// @param ndims Number of dimensions. +/// @param dims Array of dimensions. +/// @param data_type Elements data type. +/// @param nnz Number of non-zero entries. +/// @param indices_dt Data type of indices. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_coo_encoding( + dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, + dnnl_data_type_t data_type, dnnl_dim_t nnz, + dnnl_data_type_t indices_dt); + +/// Creates a memory descriptor for packed sparse encoding. +/// +/// The created memory descriptor cannot be used to create a memory +/// object. It can only be used to create a primitive descriptor to +/// query the actual memory descriptor (similar to the format tag +/// `any`). +/// +/// @warning +/// The meaning and content of the handles of the memory object that +/// is created using the queried memory descriptor are unspecified +/// therefore using the content is an undefined behavior. 
+/// +/// @param memory_desc Output memory descriptor. +/// @param ndims Number of dimensions +/// @param dims Array of dimensions. +/// @param data_type Elements data type. +/// @param nnz Number of non-zero entries. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_packed_encoding( + dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, + dnnl_data_type_t data_type, dnnl_dim_t nnz); +#endif + +/// Creates a memory descriptor for a region inside an area +/// described by an existing memory descriptor. +/// +/// @warning +/// Some combinations of physical memory layout and/or offsets or dims may +/// result in a failure to create a submemory. +/// +/// @param memory_desc Output memory descriptor. +/// @param parent_memory_desc An existing memory descriptor. +/// @param dims Sizes of the region. +/// @param offsets Offsets to the region from the encompassing +/// memory object in each dimension +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_submemory( + dnnl_memory_desc_t *memory_desc, + const_dnnl_memory_desc_t parent_memory_desc, const dnnl_dims_t dims, + const dnnl_dims_t offsets); + +/// Creates a memory descriptor by reshaping an existing one. The new +/// memory descriptor inherits the data type. This operation is valid only for +/// memory descriptors that have format_kind #dnnl_blocked or +/// #dnnl_format_kind_any. +/// +/// The resulting memory descriptor must be destroyed separately. +/// +/// The operation ensures the transformation of the physical memory format +/// corresponds to the transformation of the logical dimensions. If such +/// transformation is impossible, the function returns #dnnl_invalid_arguments. +/// +/// The reshape operation can be described as a combination of the following +/// basic operations: +/// 1. Add a dimension of size `1`.
This is always possible. +/// 2. Remove a dimension of size `1`. This is possible only if the dimension +/// has no padding (i.e. `padded_dims[dim] == dims[dim] && dims[dim] == 1`). +/// 3. Split a dimension into multiple ones. This is possible only if the size +/// of the dimension is exactly equal to the product of the split ones and +/// the dimension does not have padding (i.e. +/// `padded_dims[dim] == dims[dim]`). +/// 4. Joining multiple consecutive dimensions into a single one. As in the +/// cases above, this requires that the dimensions do not have padding and +/// that the memory format is such that in physical memory these dimensions +/// are dense and have the same order as their logical counterparts. This +/// also assumes that these dimensions are not blocked. +/// - Here, dense means: +/// `stride for dim[i] == (stride for dim[i + 1]) * dim[i + 1]`; +/// - And same order means: +/// `i < j` if and only if `stride for dim[j] <= stride for dim[i]`. +/// +/// @warning +/// Some combinations of physical memory layout and/or offsets or +/// dimensions may result in a failure to make a reshape. +/// +/// @param out_memory_desc Output memory descriptor. +/// @param in_memory_desc An existing memory descriptor. Must have format_kind +/// set to #dnnl_blocked or #dnnl_format_kind_any. +/// @param ndims Number of dimensions for the output memory descriptor. +/// @param dims Dimensions for the output memory descriptor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_reshape( + dnnl_memory_desc_t *out_memory_desc, + const_dnnl_memory_desc_t in_memory_desc, int ndims, + const dnnl_dims_t dims); + +/// Creates a memory descriptor by permuting axes in an existing one. +/// +/// The physical memory layout representation is adjusted accordingly to +/// maintain the consistency between the logical and physical parts of the +/// memory descriptor.
+/// +/// The resulting memory descriptor must be destroyed separately. +/// +/// The new memory descriptor inherits the data type. This operation is valid +/// only for memory descriptors that have format_kind set to #dnnl_blocked or +/// #dnnl_format_kind_any. +/// +/// The logical axes will be permuted in the following manner: +/// ``` +/// for (i: 0 .. in_memory_desc->ndims) +/// out_memory_desc->dims[permutation[i]] = in_memory_desc->dims[i]; +/// ``` +/// +/// Example: +/// @code +/// dnnl_memory_desc_t in_md, out_md, expect_out_md; +/// +/// const int permutation[] = {1, 0}; // swap the first and the second axes +/// +/// dnnl_dims_t in_dims = {2, 3}, out_dims = {3, 2}; +/// dnnl_format_tag_t in_tag = dnnl_ab, out_tag = dnnl_ba; +/// +/// dnnl_memory_desc_create_with_tag( +/// &in_md, 2, in_dims, data_type, in_tag); +/// dnnl_memory_desc_create_with_tag( +/// &expect_out_md, 2, out_dims, data_type, out_tag); +/// +/// dnnl_memory_desc_permute_axes(&out_md, in_md, permutation); +/// assert(dnnl_memory_desc_equal(out_md, expect_out_md)); +/// +/// dnnl_memory_desc_destroy(in_md); +/// dnnl_memory_desc_destroy(out_md); +/// dnnl_memory_desc_destroy(expect_out_md); +/// @endcode +/// +/// @param out_memory_desc Output memory descriptor. +/// @param in_memory_desc An existing memory descriptor. Must have format_kind +/// set to #dnnl_blocked or #dnnl_format_kind_any. +/// @param permutation Axes permutation (of size `in_memory_desc->ndims`). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_permute_axes( + dnnl_memory_desc_t *out_memory_desc, + const_dnnl_memory_desc_t in_memory_desc, const int *permutation); + +/// Queries a memory descriptor for various pieces of information. 
+/// +/// The following information can be queried: +/// - Number of dimensions (#dnnl_query_ndims_s32) +/// - Dimensions (#dnnl_query_dims) in the following order: +/// - CNN data tensors: mini-batch, channel, spatial +/// ({N, C, [[D,] H,] W}) +/// - CNN weight tensors: group (optional), output channel, input channel, +/// spatial ({[G,] O, I, [[D,] H,] W}) +/// - RNN data tensors: time, mini-batch, channels ({T, N, C}) +/// or layers, directions, states, mini-batch, channels +/// ({L, D, S, N, C}) +/// - RNN weight tensor: layers, directions, input channel, gates, output +/// channels ({L, D, I, G, O}) +/// - Data type of the tensor elements (#dnnl_query_data_type) +/// - Padded dimensions (#dnnl_query_padded_dims) - size of the data including +/// padding in each dimension +/// - Padded offsets (#dnnl_query_padded_offsets) - per-dimension offset from +/// the padding to actual data, the top-level tensor with offsets applied +/// must lie within the padding area. +/// - Submemory offset (#dnnl_query_submemory_offset_s64) - offset from memory +/// origin to the current block, non-zero only in a description of a memory +/// sub-block. +/// - Format kind (#dnnl_query_format_kind) - memory format kind +/// +/// @note +/// The order of dimensions does not depend on the memory format, so +/// whether the data is laid out in #dnnl_nchw or #dnnl_nhwc +/// the dims for 4D CNN data tensor would be {N, C, H, W}. +/// +/// The following queries are applicable only to format kind #dnnl_blocked. +/// - Strides (#dnnl_query_strides) between the outermost blocks or in case +/// of plain (non-blocked) formats the strides between dimensions +/// - Number of innermost blocks (#dnnl_query_inner_nblks_s32), e.g. 3 in case +/// of `OIhw_4i16o4i` +/// - Size of the innermost blocks (#dnnl_query_inner_blks), e.g. +/// `{4, 16, 4}` in case of `OIhw_4i16o4i` +/// - Logical indices of the blocks (#dnnl_query_inner_idxs), e.g.
`{1, 0, 1}` +/// in case of `4i16o4i`, because `i` is the 1st dim and `o` is the 0th dim +/// +/// @param memory_desc Memory descriptor. +/// @param what Parameter to query. +/// @param result Output result. The type depends on the query. For example, +/// it must be a @c dnnl_dims_t** if querying for strides. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_query( + const_dnnl_memory_desc_t memory_desc, dnnl_query_t what, void *result); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Queries a memory descriptor for various pieces of information. This version +/// supports additional queries #dnnl_query_sparse_encoding, #dnnl_query_nnz_s64, +/// #dnnl_query_num_handles_s32 and #dnnl_query_data_type for a particular +/// buffer. +/// +/// The following information can be queried: +/// - Number of dimensions (#dnnl_query_ndims_s32) +/// - Dimensions (#dnnl_query_dims) in the following order: +/// - CNN data tensors: mini-batch, channel, spatial +/// ({N, C, [[D,] H,] W}) +/// - CNN weight tensors: group (optional), output channel, input channel, +/// spatial ({[G,] O, I, [[D,] H,] W}) +/// - RNN data tensors: time, mini-batch, channels ({T, N, C}) +/// or layers, directions, states, mini-batch, channels +/// ({L, D, S, N, C}) +/// - RNN weight tensor: layers, directions, input channel, gates, output +/// channels ({L, D, I, G, O}) +/// - Data type of the tensor elements (#dnnl_query_data_type) +/// - Padded dimensions (#dnnl_query_padded_dims) - size of the data including +/// padding in each dimension +/// - Padded offsets (#dnnl_query_padded_offsets) - per-dimension offset from +/// the padding to actual data, the top-level tensor with offsets applied +/// must lie within the padding area. +/// - Submemory offset (#dnnl_query_submemory_offset_s64) - offset from memory +/// origin to the current block, non-zero only in a description of a memory +/// sub-block.
+/// - Format kind (#dnnl_query_format_kind) - memory format kind +/// +/// @note +/// The order of dimensions does not depend on the memory format, so +/// whether the data is laid out in #dnnl_nchw or #dnnl_nhwc +/// the dims for 4D CNN data tensor would be {N, C, H, W}. +/// +/// The following queries are applicable only to format kind #dnnl_blocked. +/// - Strides (#dnnl_query_strides) between the outermost blocks or in case +/// of plain (non-blocked) formats the strides between dimensions +/// - Number of innermost blocks (#dnnl_query_inner_nblks_s32), e.g. 3 in case +/// of `OIhw_4i16o4i` +/// - Size of the innermost blocks (#dnnl_query_inner_blks), e.g. +/// `{4, 16, 4}` in case of `OIhw_4i16o4i` +/// - Logical indices of the blocks (#dnnl_query_inner_idxs), e.g. `{1, 0, 1}` +/// in case of `4i16o4i`, because `i` is the 1st dim and `o` is the 0th dim +/// +/// @param memory_desc Memory descriptor. +/// @param what Parameter to query. +/// @param index Index of the parameter to query for. It is mostly used with +/// #dnnl_query_data_type to specify which data type is being queried. +/// The main data type (data type of values) has always index 0. For other +/// indices please refer to the API for creating a memory descriptor for +/// sparse encoding. +/// @param result Output result. The type depends on the query. For example, +/// it must be a @c dnnl_dims_t** if querying for strides. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_query_v2( + const_dnnl_memory_desc_t memory_desc, dnnl_query_t what, int index, + void *result); +#endif + +/// Compares two memory descriptors. +/// +/// Use this function to identify whether a reorder is required between the +/// two memories +/// +/// @param lhs Left-hand side of the comparison. +/// @param rhs Right-hand side of the comparison. +/// @returns 1 if the descriptors are the same. +/// @returns 0 if the descriptors are different.
+int DNNL_API dnnl_memory_desc_equal( + const_dnnl_memory_desc_t lhs, const_dnnl_memory_desc_t rhs); + +/// Returns the size of a memory descriptor. +/// +/// @param memory_desc Memory descriptor. +/// @returns The number of bytes required for memory described by a memory +/// descriptor. +size_t DNNL_API dnnl_memory_desc_get_size(const_dnnl_memory_desc_t memory_desc); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Returns the size of the data that corresponds to the given index. +/// +/// @param memory_desc Memory descriptor. +/// @param index Index of the buffer. +/// +/// @returns The number of bytes required for the requested data. +size_t DNNL_API dnnl_memory_desc_get_size_v2( + const_dnnl_memory_desc_t memory_desc, int index); +#endif + +/// Returns the size of data type. +/// +/// @param data_type Data type. +/// @returns The number of bytes occupied by data type. +size_t DNNL_API dnnl_data_type_size(dnnl_data_type_t data_type); + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE, the constructed memory +/// object will have the underlying buffer set. In this case, the buffer will +/// be initialized as if dnnl_memory_set_data_handle() had been called. +/// +/// @sa dnnl_memory_set_data_handle() +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - DNNL_MEMORY_NONE to create dnnl_memory without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + void *handle); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Creates a memory object with multiple handles. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param nhandles Number of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - DNNL_MEMORY_NONE Instructs the library to skip allocation of the +/// memory buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + int nhandles, void **handles); +#endif + +/// Returns the memory descriptor for a memory object. +/// +/// @param memory Memory object. +/// @param memory_desc Output memory descriptor (a copy). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_get_memory_desc( + const_dnnl_memory_t memory, const_dnnl_memory_desc_t *memory_desc); + +/// Returns the engine of a memory object. +/// +/// @param memory Memory object. +/// @param engine Output engine on which the memory is located. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_get_engine( + const_dnnl_memory_t memory, dnnl_engine_t *engine); + +/// Maps a memory object and returns a host-side pointer to a memory buffer +/// with a copy of its contents. 
+/// +/// Mapping enables explicit direct access to memory contents for the engines +/// that do not support it implicitly. +/// +/// Mapping is an exclusive operation - a memory object cannot be used in +/// other operations until this memory object is unmapped. +/// +/// @note +/// Any primitives working with @p memory should be completed before +/// the memory is mapped. Use dnnl_stream_wait to synchronize the +/// corresponding execution stream. +/// +/// @note +/// The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are +/// mainly provided for debug and testing purposes, and their performance +/// may be suboptimal. +/// +/// @param memory Memory object. +/// @param mapped_ptr Output pointer to the mapped buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_map_data( + const_dnnl_memory_t memory, void **mapped_ptr); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Maps a memory object and returns a host-side pointer to a memory buffer +/// with a copy of its contents. The memory buffer corresponds to the given +/// index. +/// +/// Mapping enables explicit direct access to memory contents for the engines +/// that do not support it implicitly. +/// +/// Mapping is an exclusive operation - a memory object cannot be used in +/// other operations until this memory object is unmapped. +/// +/// @note +/// Any primitives working with @p memory should be completed before +/// the memory is mapped. Use dnnl_stream_wait to synchronize the +/// corresponding execution stream. +/// +/// @note +/// The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are +/// mainly provided for debug and testing purposes, and their performance +/// may be suboptimal. +/// +/// @param memory Memory object. +/// @param mapped_ptr Output pointer to the mapped buffer. +/// @param index Index of the buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_memory_map_data_v2( + const_dnnl_memory_t memory, void **mapped_ptr, int index); +#endif + +/// Unmaps a memory object and writes back any changes made to the previously +/// mapped memory buffer. The pointer to the mapped buffer must be obtained +/// via the dnnl_memory_map_data() call. +/// +/// @note +/// The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are +/// mainly provided for debug and testing purposes, and their performance +/// may be suboptimal. +/// +/// @param memory Memory object. +/// @param mapped_ptr Pointer to the mapped buffer that must have been +/// obtained using the dnnl_memory_map_data() function. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_unmap_data( + const_dnnl_memory_t memory, void *mapped_ptr); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Unmaps a memory object and writes back any changes made to the previously +/// mapped memory buffer. The pointer to the mapped buffer must be obtained +/// via the dnnl_memory_map_data() call. The buffer corresponds to the given +/// index. +/// +/// @note +/// The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are +/// mainly provided for debug and testing purposes, and their performance +/// may be suboptimal. +/// +/// @param memory Memory object. +/// @param mapped_ptr Pointer to the mapped buffer that must have been +/// obtained using the dnnl_memory_map_data() function. +/// @param index Index of the buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_unmap_data_v2( + const_dnnl_memory_t memory, void *mapped_ptr, int index); +#endif + +/// Returns memory object's data handle. +/// +/// @param memory Memory object. +/// @param handle Output data handle. For the CPU engine, the data handle is a +/// pointer to the actual data. For OpenCL it is a cl_mem. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_get_data_handle( + const_dnnl_memory_t memory, void **handle); + +/// Sets the underlying memory buffer. +/// +/// @param memory Memory object. +/// @param handle Data handle. For the CPU engine or when USM is used, the +/// memory buffer is a pointer to the actual data. For OpenCL it is a +/// `cl_mem`. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_set_data_handle( + dnnl_memory_t memory, void *handle); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Returns an underlying memory buffer that corresponds to the given index. +/// +/// @param memory Memory object. +/// @param handle Data handle. For the CPU engine or when USM is used, the +/// memory buffer is a pointer to the actual data. For OpenCL it is a +/// `cl_mem`. +/// @param index Index of the buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_get_data_handle_v2( + const_dnnl_memory_t memory, void **handle, int index); + +/// Sets an underlying memory buffer that corresponds to the given index. +/// +/// @param memory Memory object. +/// @param handle Data handle. For the CPU engine or when USM is used, the +/// memory buffer is a pointer to the actual data. For OpenCL it is a +/// `cl_mem`. +/// @param index Index of the buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_set_data_handle_v2( + dnnl_memory_t memory, void *handle, int index); +#endif + +/// Destroys a memory object. +/// +/// @param memory Memory object to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_memory_destroy(dnnl_memory_t memory); + +/// @} dnnl_api_memory + +/// @addtogroup dnnl_api_primitives +/// @{ + +/// @addtogroup dnnl_api_reorder +/// @{ + +/// Creates a primitive descriptor for a reorder primitive. +/// +/// @param reorder_primitive_desc Output primitive descriptor. +/// @param src_desc Source memory descriptor. +/// @param src_engine Engine on which the source memory object will be +/// located. +/// @param dst_desc Destination memory descriptor. +/// @param dst_engine Engine on which the destination memory object +/// will be located. +/// @param attr Primitive attributes to use (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_reorder_primitive_desc_create( + dnnl_primitive_desc_t *reorder_primitive_desc, + const_dnnl_memory_desc_t src_desc, dnnl_engine_t src_engine, + const_dnnl_memory_desc_t dst_desc, dnnl_engine_t dst_engine, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_reorder + +/// @addtogroup dnnl_api_concat +/// @{ + +/// Creates a primitive descriptor for an out-of-place concatenation +/// primitive. +/// +/// @param concat_primitive_desc Output primitive descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param n Number of source parameters. +/// @param concat_dimension Source tensors will be concatenated over +/// dimension with this index. Note that order of dimensions does +/// not depend on memory format. +/// @param src_descs Array of source memory descriptors with @p n elements. +/// @param attr Primitive attributes to use (can be NULL). +/// @param engine Engine to use. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_concat_primitive_desc_create( + dnnl_primitive_desc_t *concat_primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t dst_desc, int n, int concat_dimension, + const_dnnl_memory_desc_t const *src_descs, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_concat + +/// @addtogroup dnnl_api_sum +/// @{ + +/// Creates a primitive descriptor for an (out-of-place) sum primitive. +/// +/// @param sum_primitive_desc Output primitive descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param n Number of source parameters. +/// @param scales Vector of scales to multiply data in each source +/// memory by. +/// @param src_descs Array of source memory descriptors having @p n elements. +/// @param attr Primitive attributes to use (can be NULL). +/// @param engine Engine to use. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sum_primitive_desc_create( + dnnl_primitive_desc_t *sum_primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t dst_desc, int n, const float *scales, + const_dnnl_memory_desc_t const *src_descs, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_sum + +/// @addtogroup dnnl_api_binary +/// @{ + +/// Creates a primitive descriptor for a binary primitive. +/// +/// @note +/// Memory descriptors @p src1_desc and @p dst_desc are allowed to be +/// initialized with #dnnl_format_tag_any or with format_kind set to +/// #dnnl_format_kind_any. +/// +/// @note +/// Both memory descriptors must have the same number of dimensions. +/// Element broadcasting is supported for memory descriptor @p src1_desc +/// and is applied to @p src1_desc dimensions that have a size equal to 1. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Algorithm kind.
Valid values are #dnnl_binary_add, +/// #dnnl_binary_mul, #dnnl_binary_max, #dnnl_binary_min, #dnnl_binary_div, +/// #dnnl_binary_sub, #dnnl_binary_ge, #dnnl_binary_gt, #dnnl_binary_le, +/// #dnnl_binary_lt, #dnnl_binary_eq and #dnnl_binary_ne. +/// @param src0_desc Source 0 memory descriptor. +/// @param src1_desc Source 1 memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_binary_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src0_desc, + const_dnnl_memory_desc_t src1_desc, const_dnnl_memory_desc_t dst_desc, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a binary primitive with support of +/// ternary operators. +/// +/// @note +/// Memory descriptors @p src1_desc, @p src2_desc and @p dst_desc are +/// allowed to be initialized with #dnnl_format_tag_any or with format_kind +/// set to #dnnl_format_kind_any. +/// +/// @note +/// All memory descriptors must have the same number of dimensions. +/// Element broadcasting is supported for memory descriptor @p src1_desc +/// and is applied to @p src1_desc dimensions that have a size equal to 1. +/// There is no broadcasting support for @p src2_desc. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Algorithm kind. +/// @param src0_desc Source 0 memory descriptor. +/// @param src1_desc Source 1 memory descriptor. +/// @param src2_desc Source memory descriptor for ternary operations. Might +/// be empty. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_binary_primitive_desc_create_v2( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src0_desc, + const_dnnl_memory_desc_t src1_desc, const_dnnl_memory_desc_t src2_desc, + const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_binary + +/// @addtogroup dnnl_api_convolution +/// @{ + +/// Creates a primitive descriptor for a convolution forward propagation +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain +/// values for spatial dimensions only and hence must have the same number of +/// elements as there are spatial dimensions. The order of values is the same +/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors), +/// and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind Convolution algorithm. Possible values are +/// #dnnl_convolution_direct, #dnnl_convolution_winograd, +/// #dnnl_convolution_auto. +/// @param src_desc Source memory descriptor. +/// @param weights_desc Weights memory descriptor. +/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory +/// descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param dst_desc Destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param dilates Array of dilations for spatial dimension. A zero value +/// means no dilation in the corresponding dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. 
+/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_convolution_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc, + const dnnl_dims_t strides, const dnnl_dims_t dilates, + const dnnl_dims_t padding_l, const dnnl_dims_t padding_r, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a convolution backward propagation +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain +/// values for spatial dimensions only and hence must have the same number of +/// elements as there are spatial dimensions. The order of values is the same +/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors), +/// and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Convolution algorithm. Possible values are +/// #dnnl_convolution_direct, #dnnl_convolution_winograd, +/// #dnnl_convolution_auto. +/// @param diff_src_desc Diff source memory descriptor. +/// @param weights_desc Weights memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param dilates Array of dilations for spatial dimension. 
A zero value +/// means no dilation in the corresponding dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. +/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_convolution_backward_data_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides, + const dnnl_dims_t dilates, const dnnl_dims_t padding_l, + const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a convolution weights gradient primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain +/// values for spatial dimensions only and hence must have the same number of +/// elements as there are spatial dimensions. The order of values is the same +/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors), +/// and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Convolution algorithm. Possible values are +/// #dnnl_convolution_direct, #dnnl_convolution_winograd, +/// #dnnl_convolution_auto. +/// @param src_desc Source memory descriptor. 
+/// @param diff_weights_desc Diff weights memory descriptor. +/// @param diff_bias_desc Diff bias memory descriptor. Passing NULL, a zero +/// memory descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param dilates Array of dilations for spatial dimension. A zero value +/// means no dilation in the corresponding dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. +/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_convolution_backward_weights_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t diff_weights_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides, + const dnnl_dims_t dilates, const dnnl_dims_t padding_l, + const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_convolution + +/// @addtogroup dnnl_api_deconvolution +/// @{ + +/// Creates a primitive descriptor for a deconvolution forward propagation +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. 
+/// +/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain +/// values for spatial dimensions only and hence must have the same number of +/// elements as there are spatial dimensions. The order of values is the same +/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors), +/// and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind Deconvolution algorithm. Possible values are +/// #dnnl_deconvolution_direct, #dnnl_deconvolution_winograd. +/// @param src_desc Source memory descriptor. +/// @param weights_desc Weights memory descriptor. +/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory +/// descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param dst_desc Destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param dilates Array of dilations for spatial dimension. A zero value +/// means no dilation in the corresponding dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. +/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_deconvolution_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc, + const dnnl_dims_t strides, const dnnl_dims_t dilates, + const dnnl_dims_t padding_l, const dnnl_dims_t padding_r, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a deconvolution backward propagation +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain +/// values for spatial dimensions only and hence must have the same number of +/// elements as there are spatial dimensions. The order of values is the same +/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors), +/// and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Deconvolution algorithm. Possible values are +/// #dnnl_deconvolution_direct, #dnnl_deconvolution_winograd. +/// @param diff_src_desc Diff source memory descriptor. +/// @param weights_desc Weights memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param dilates Array of dilations for spatial dimension. A zero value +/// means no dilation in the corresponding dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. +/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. 
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_deconvolution_backward_data_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides, + const dnnl_dims_t dilates, const dnnl_dims_t padding_l, + const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a deconvolution weights gradient +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain +/// values for spatial dimensions only and hence must have the same number of +/// elements as there are spatial dimensions. The order of values is the same +/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors), +/// and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Deconvolution algorithm. Possible values are +/// #dnnl_deconvolution_direct, #dnnl_deconvolution_winograd. +/// @param src_desc Source memory descriptor. +/// @param diff_weights_desc Diff weights memory descriptor. +/// @param diff_bias_desc Diff bias memory descriptor. Passing NULL, a zero +/// memory descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param strides Array of strides for spatial dimension. 
+/// @param dilates Array of dilations for spatial dimension. A zero value +/// means no dilation in the corresponding dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. +/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API +dnnl_deconvolution_backward_weights_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t diff_weights_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides, + const dnnl_dims_t dilates, const dnnl_dims_t padding_l, + const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_deconvolution + +/// @addtogroup dnnl_api_shuffle +/// @{ + +/// Creates a primitive descriptor for a shuffle forward propagation primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param axis The axis along which the data is shuffled. +/// @param group_size Shuffle group size. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise.
+dnnl_status_t DNNL_API dnnl_shuffle_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, int axis, dnnl_dim_t group_size, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a shuffle backward propagation primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param axis The axis along which the data is shuffled. +/// @param group_size Shuffle group size. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_shuffle_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, int axis, dnnl_dim_t group_size, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_shuffle + +/// @addtogroup dnnl_api_eltwise +/// @{ + +/// Creates a primitive descriptor for an eltwise forward propagation primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind Elementwise algorithm kind. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param alpha The alpha parameter for the elementwise operation. Specific +/// meaning depends on the algorithm. +/// @param beta The beta parameter for the elementwise operation.
Specific +/// meaning depends on the algorithm. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_eltwise_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc, + float alpha, float beta, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for an eltwise backward propagation +/// primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Elementwise algorithm kind. +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param data_desc Destination memory descriptor if one of the +/// "use_dst_for_bwd" algorithms is used (such as +/// #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor otherwise. +/// @param alpha The alpha parameter for the elementwise operation. Specific +/// meaning depends on the algorithm. +/// @param beta The beta parameter for the elementwise operation. Specific +/// meaning depends on the algorithm. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise.
+dnnl_status_t DNNL_API dnnl_eltwise_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t data_desc, float alpha, float beta, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_eltwise + +/// @addtogroup dnnl_api_softmax +/// @{ + +/// Creates a primitive descriptor for a softmax forward propagation primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind Softmax algorithm kind: either #dnnl_softmax_accurate, or +/// #dnnl_softmax_log. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param softmax_axis Axis over which softmax is computed. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_softmax_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc, + int softmax_axis, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a softmax backward propagation primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Softmax algorithm kind: either #dnnl_softmax_accurate, or +/// #dnnl_softmax_log. +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param softmax_axis Axis over which softmax is computed. 
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_softmax_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t dst_desc, int softmax_axis, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_softmax + +/// @addtogroup dnnl_api_pooling +/// @{ + +/// Creates a primitive descriptor for a pooling forward propagation +/// primitive. +/// +/// Arrays @p strides, @p kernel, @p dilation, @p padding_l and @p padding_r +/// contain values for spatial dimensions only and hence must have the same +/// number of elements as there are spatial dimensions. The order of values +/// is the same as in the tensor: depth (for 3D tensors), +/// height (for 3D and 2D tensors), and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind Pooling algorithm kind: either #dnnl_pooling_max, +/// #dnnl_pooling_avg_include_padding, or #dnnl_pooling_avg_exclude_padding. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param kernel Array of kernel spatial dimensions. +/// @param dilation Array of dilations for spatial dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. 
+/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_pooling_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc, + const dnnl_dims_t strides, const dnnl_dims_t kernel, + const dnnl_dims_t dilation, const dnnl_dims_t padding_l, + const dnnl_dims_t padding_r, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a pooling backward propagation +/// primitive. +/// +/// Arrays @p strides, @p kernel, @p dilation, @p padding_l and @p padding_r +/// contain values for spatial dimensions only and hence must have the same +/// number of elements as there are spatial dimensions. The order of values +/// is the same as in the tensor: depth (for 3D tensors), +/// height (for 3D and 2D tensors), and width. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Pooling algorithm kind: either #dnnl_pooling_max, +/// #dnnl_pooling_avg_include_padding, or #dnnl_pooling_avg_exclude_padding. +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param strides Array of strides for spatial dimension. +/// @param kernel Array of kernel spatial dimensions. +/// @param dilation Array of dilations for spatial dimension. +/// @param padding_l Array of padding values for low indices for each spatial +/// dimension `([[front,] top,] left)`. +/// @param padding_r Array of padding values for high indices for each spatial +/// dimension `([[back,] bottom,] right)`. 
Can be NULL in which case +/// padding is considered to be symmetrical. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_pooling_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides, + const dnnl_dims_t kernel, const dnnl_dims_t dilation, + const dnnl_dims_t padding_l, const dnnl_dims_t padding_r, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_pooling + +/// @addtogroup dnnl_api_prelu +/// @{ + +/// Creates a primitive descriptor for a PReLU (leaky ReLU with trainable +/// alpha parameter) forward propagation primitive. +/// +/// @note +/// weights descriptor is allowed to be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param weights_desc Alpha parameters memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_prelu_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a PReLU (leaky ReLU with trainable +/// alpha parameter) backward propagation primitive. +/// +/// @note +/// weights descriptor and diff_weights descriptor are allowed +/// to be initialized with #dnnl_format_tag_any or with format_kind +/// set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param src_desc Source memory descriptor. +/// @param weights_desc Alpha parameters memory descriptor. +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_weights_desc Diff alpha parameters memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_prelu_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_weights_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_prelu + +/// @addtogroup dnnl_api_lrn +/// @{ + +/// Creates a primitive descriptor for an LRN forward propagation primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind.
Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind LRN algorithm kind: either #dnnl_lrn_across_channels or +/// #dnnl_lrn_within_channel. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param local_size Regularization local size. +/// @param alpha The alpha regularization parameter. +/// @param beta The beta regularization parameter. +/// @param k The k regularization parameter. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_lrn_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc, + dnnl_dim_t local_size, float alpha, float beta, float k, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for an LRN backward propagation primitive. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param alg_kind LRN algorithm kind: either #dnnl_lrn_across_channels or +/// #dnnl_lrn_within_channel. +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param src_desc Source memory descriptor. +/// @param local_size Regularization local size. +/// @param alpha The alpha regularization parameter. +/// @param beta The beta regularization parameter. +/// @param k The k regularization parameter. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_lrn_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t src_desc, dnnl_dim_t local_size, float alpha, + float beta, float k, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_lrn + +/// @addtogroup dnnl_api_batch_normalization +/// @{ + +/// Creates a primitive descriptor for a batch normalization forward propagation +/// primitive. +/// +/// @note +/// In-place operation is supported: the dst can refer to the same memory +/// as the src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param epsilon Batch normalization epsilon parameter. +/// @param flags Batch normalization flags (@ref dnnl_normalization_flags_t). +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_batch_normalization_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, float epsilon, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a batch normalization backward +/// propagation primitive. +/// +/// @note +/// In-place operation is supported: the diff_dst can refer to the same +/// memory as the diff_src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. 
Possible values are +/// #dnnl_backward_data and #dnnl_backward (diffs for all parameters are +/// computed in this case). +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param src_desc Source memory descriptor. +/// @param epsilon Batch normalization epsilon parameter. +/// @param flags Batch normalization flags (@ref dnnl_normalization_flags_t). +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_batch_normalization_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t src_desc, float epsilon, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_batch_normalization + +/// @addtogroup dnnl_api_group_normalization +/// @{ + +/// Creates a primitive descriptor for a group normalization forward propagation +/// primitive. +/// +/// @note +/// In-place operation is supported: the dst can refer to the same memory +/// as the src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param groups Group normalization groups parameter. +/// @param epsilon Group normalization epsilon parameter. +/// @param flags Group normalization flags (@ref dnnl_normalization_flags_t). +/// @param attr Primitive attributes (can be NULL). 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_group_normalization_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, dnnl_dim_t groups, float epsilon, + unsigned flags, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a group normalization backward +/// propagation primitive. +/// +/// @note +/// In-place operation is supported: the diff_dst can refer to the same +/// memory as the diff_src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_backward_data and #dnnl_backward (diffs for all parameters are +/// computed in this case). +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param src_desc Source memory descriptor. +/// @param groups Group normalization groups parameter. +/// @param epsilon Group normalization epsilon parameter. +/// @param flags Group normalization flags (@ref dnnl_normalization_flags_t). +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_group_normalization_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t src_desc, dnnl_dim_t groups, float epsilon, + unsigned flags, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_group_normalization + +/// @addtogroup dnnl_api_layer_normalization +/// @{ + +/// Creates a primitive descriptor for a layer normalization forward propagation +/// primitive. +/// +/// @note +/// In-place operation is supported: the dst can refer to the same memory +/// as the src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param stat_desc Memory descriptor for mean and variance. If this +/// parameter is NULL, a zero memory descriptor, or a memory descriptor +/// with format_kind set to #dnnl_format_kind_undef, then the memory +/// descriptor for stats is derived from @p src_desc by removing the last +/// dimension. +/// @param epsilon Layer normalization epsilon parameter. +/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t). +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_layer_normalization_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, const_dnnl_memory_desc_t stat_desc, + float epsilon, unsigned flags, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a layer normalization backward +/// propagation primitive. +/// +/// @note +/// In-place operation is supported: the diff_dst can refer to the same +/// memory as the diff_src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_backward_data and #dnnl_backward (diffs for all parameters are +/// computed in this case). +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param src_desc Source memory descriptor. +/// @param stat_desc Memory descriptor for mean and variance. If this +/// parameter is NULL, a zero memory descriptor, or a memory descriptor +/// with format_kind set to #dnnl_format_kind_undef, then the memory +/// descriptor for stats is derived from @p src_desc by removing the last +/// dimension. +/// @param epsilon Layer normalization epsilon parameter. +/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t). +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_layer_normalization_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t stat_desc, + float epsilon, unsigned flags, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a layer normalization forward propagation +/// primitive with a user-provided data type for the scale and shift +/// memory objects. +/// +/// @note +/// In-place operation is supported: the dst can refer to the same memory +/// as the src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param stat_desc Memory descriptor for mean and variance. If this +/// parameter is NULL, a zero memory descriptor, or a memory descriptor +/// with format_kind set to #dnnl_format_kind_undef, then the memory +/// descriptor for stats is derived from @p src_desc by removing the last +/// dimension. +/// @param scale_shift_data_type Data type of scale and shift memory. If neither scale +/// nor shift flag are specified the parameter is ignored. +/// @param epsilon Layer normalization epsilon parameter. +/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t). +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API +dnnl_layer_normalization_forward_primitive_desc_create_v2( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, const_dnnl_memory_desc_t stat_desc, + dnnl_data_type_t scale_shift_data_type, float epsilon, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a layer normalization backward +/// propagation primitive with a user-provided data type for the +/// scale and shift memory objects. +/// +/// @note +/// In-place operation is supported: the diff_dst can refer to the same +/// memory as the diff_src. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_backward_data and #dnnl_backward (diffs for all parameters are +/// computed in this case). +/// @param diff_src_desc Diff source memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param src_desc Source memory descriptor. +/// @param stat_desc Memory descriptor for mean and variance. If this +/// parameter is NULL, a zero memory descriptor, or a memory descriptor +/// with format_kind set to #dnnl_format_kind_undef, then the memory +/// descriptor for stats is derived from @p src_desc by removing the last +/// dimension. +/// @param diff_scale_shift_data_type Data type of diff scale and shift memory. If neither the scale +/// nor the shift flag is specified, the parameter is ignored. +/// @param scale_shift_data_type Data type of scale and shift memory. If neither the scale +/// nor the shift flag is specified, the parameter is ignored. +/// @param epsilon Layer normalization epsilon parameter. +/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t). +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. 
+/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API +dnnl_layer_normalization_backward_primitive_desc_create_v2( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t stat_desc, + dnnl_data_type_t diff_scale_shift_data_type, + dnnl_data_type_t scale_shift_data_type, float epsilon, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_layer_normalization + +/// @addtogroup dnnl_api_inner_product +/// @{ + +/// Creates a primitive descriptor for an inner product forward propagation +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param src_desc Source memory descriptor. +/// @param weights_desc Weights memory descriptor. +/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory +/// descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_inner_product_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for an inner product backward propagation +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param diff_src_desc Diff source memory descriptor. +/// @param weights_desc Weights memory descriptor. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_inner_product_backward_data_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for an inner product weights gradient +/// primitive. +/// +/// @note +/// Memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive_descriptor. +/// @param engine Engine to use. +/// @param src_desc Source memory descriptor. +/// @param diff_weights_desc Diff weights memory descriptor. +/// @param diff_bias_desc Diff bias memory descriptor. 
Passing NULL, a zero +/// memory descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param diff_dst_desc Diff destination memory descriptor. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API +dnnl_inner_product_backward_weights_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t diff_weights_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_inner_product + +/// @addtogroup dnnl_api_attributes +/// @{ + +/// Set quantization scale and shift parameters for RNN data tensors. +/// +/// For performance reasons, the low-precision configuration of the RNN +/// primitives expects input activations to have the unsigned 8-bit integer +/// data type. The scale and shift parameters are used to quantize +/// floating-point data to unsigned integer and must be passed to the RNN +/// primitive using attributes. +/// +/// The quantization formula is `scale * data + shift`. +/// +/// @note +/// Quantization scale and shift are common for src_layer, src_iter, +/// dst_iter, and dst_layer. 
+/// +/// Example usage: +/// @code +/// // RNN parameters +/// int l = 2, t = 2, mb = 32, sic = 32, slc = 32, dic = 32, dlc = 32; +/// // Activations quantization parameters +/// float scale = 63.f, shift = 64.f; +/// +/// dnnl_primitive_attr_t rnn_attr; +/// // Create default attributes +/// dnnl_primitive_attr_create(&rnn_attr); +/// +/// // Set scale and shift for int8 quantization of activation +/// dnnl_primitive_attr_set_rnn_data_qparams(rnn_attr, scale, shift); +/// +/// // Create an RNN primitive descriptor. +/// dnnl_primitive_desc_t rnn_pd; +/// dnnl_vanilla_rnn_forward_primitive_desc_create(&rnn_pd, +/// engine, /* arguments */, rnn_attr); +/// @endcode +/// +/// @param attr Primitive attributes. +/// @param scale The value to scale the data by. +/// @param shift The value to shift the data by. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_rnn_data_qparams( + dnnl_primitive_attr_t attr, const float scale, const float shift); + +/// Returns the quantization scale and shift parameters for RNN data tensors. +/// +/// @note +/// Quantization scale and shift are common for src_layer, src_iter, +/// dst_iter, and dst_layer. +/// +/// @param attr Primitive attributes. +/// @param scale The value to scale the data by. +/// @param shift The value to shift the data by. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_data_qparams( + const_dnnl_primitive_attr_t attr, float *scale, float *shift); + +/// Sets quantization scaling factors for RNN weights tensors. The +/// low-precision configuration of the RNN primitives expects input weights to +/// use the signed 8-bit integer data type. The scaling factors are used to +/// quantize floating-point data to signed integer and must be passed to RNN +/// primitives using attributes. 
+/// +/// @note +/// The dimension order is always native and does not depend on the actual +/// layout used. For example, five-dimensional weights always have (l, d, +/// i, g, o) logical dimension ordering. +/// +/// @note +/// Quantization scales are common for weights_layer and weights_iteration. +/// +/// @param attr Primitive attributes. +/// @param count Number of elements in the @p scales array. +/// @param mask Scaling factors correspondence mask that defines the +/// correspondence between the output tensor dimensions and the @p +/// scales vector. The set i-th bit indicates that a dedicated scaling +/// factor should be used for each index along that dimension. Set the +/// mask to 0 to use a common scaling factor for the whole output +/// tensor. +/// @param scales Array of output scaling factors that must contain @p count +/// values and the following equality must hold: +/// \f[count = \prod\limits_{d \in mask} weights.dims[d].\f] +/// Violations can only be detected when the attributes are used to create +/// a primitive descriptor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_set_rnn_weights_qparams( + dnnl_primitive_attr_t attr, dnnl_dim_t count, int mask, + const float *scales); + +/// Returns the quantization scaling factors for RNN weights tensors. +/// +/// @param attr Primitive attributes. +/// @param count Number of elements in the @p scales array. +/// @param mask Scaling factors correspondence mask that defines the +/// correspondence between the output tensor dimensions and the @p +/// scales vector. The set i-th bit indicates that a dedicated scaling +/// factor should be used for each index along that dimension. Set the +/// mask to 0 to use a common scaling factor for the whole output +/// tensor. 
+/// @param scales Array of output scaling factors that contain @p count +/// values and the following equality must hold: +/// \f[count = \prod\limits_{d \in mask} weights.dims[d].\f] +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_weights_qparams( + const_dnnl_primitive_attr_t attr, dnnl_dim_t *count, int *mask, + const float **scales); + +/// Sets quantization scaling factors for RNN projection weights tensors. The +/// low-precision configuration of the RNN primitives expects input weights to +/// use the signed 8-bit integer data type. The scaling factors are used to +/// quantize floating-point data to signed integer and must be passed to RNN +/// primitives using attributes. +/// +/// @note +/// The dimension order is always native and does not depend on the actual +/// layout used. For example, five-dimensional weights always have (l, d, +/// i, g, o) logical dimension ordering. +/// +/// @param attr Primitive attributes. +/// @param count Number of elements in the @p scales array. +/// @param mask Scaling factors correspondence mask that defines the +/// correspondence between the output tensor dimensions and the @p +/// scales vector. The set i-th bit indicates that a dedicated scaling +/// factor should be used for each index along that dimension. Set the +/// mask to 0 to use a common scaling factor for the whole output +/// tensor. +/// @param scales Array of output scaling factors that must contain @p count +/// values and the following equality must hold: +/// \f[count = \prod\limits_{d \in mask} weights.dims[d].\f] +/// Violations can only be detected when the attributes are used to create +/// a primitive descriptor. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_rnn_weights_projection_qparams( + dnnl_primitive_attr_t attr, dnnl_dim_t count, int mask, + const float *scales); + +/// Returns the quantization scaling factors for RNN projection weights tensors. +/// +/// @param attr Primitive attributes. +/// @param count Number of elements in the @p scales array. +/// @param mask Scaling factors correspondence mask that defines the +/// correspondence between the output tensor dimensions and the @p +/// scales vector. The set i-th bit indicates that a dedicated scaling +/// factor should be used for each index along that dimension. Set the +/// mask to 0 to use a common scaling factor for the whole output +/// tensor. +/// @param scales Array of output scaling factors that contain @p count +/// values and the following equality must hold: +/// \f[count = \prod\limits_{d \in mask} weights.dims[d].\f] +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_weights_projection_qparams( + const_dnnl_primitive_attr_t attr, dnnl_dim_t *count, int *mask, + const float **scales); + +/// @} dnnl_api_attributes + +/// @addtogroup dnnl_api_rnn +/// @{ + +/// Creates a primitive descriptor for vanilla RNN forward propagation +/// primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc, +/// - @p bias_desc, +/// - @p dst_iter_desc. +/// +/// This would then indicate that the RNN forward propagation primitive should +/// not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. 
+/// @param activation Activation kind. Possible values are #dnnl_eltwise_relu, +/// #dnnl_eltwise_tanh or #dnnl_eltwise_logistic. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param flags Unused. +/// @param alpha Negative slope if activation is #dnnl_eltwise_relu. +/// @param beta Unused. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_vanilla_rnn_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const dnnl_alg_kind_t activation, + const dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, unsigned flags, float alpha, + float beta, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for vanilla RNN backward propagation +/// primitive. 
+/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p diff_src_iter_desc, +/// - @p bias_desc together with @p diff_bias_desc, +/// - @p dst_iter_desc together with @p diff_dst_iter_desc. +/// +/// This would then indicate that the RNN backward propagation primitive should +/// not use the respective data and should use zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Must be #dnnl_backward. +/// @param activation Activation kind. Possible values are #dnnl_eltwise_relu, +/// #dnnl_eltwise_tanh or #dnnl_eltwise_logistic. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param diff_src_layer_desc Memory descriptor for the diff of input vector. +/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent +/// hidden state vector. +/// @param diff_weights_layer_desc Memory descriptor for the diff of weights +/// applied to the layer input. +/// @param diff_weights_iter_desc Memory descriptor for the diff of weights +/// applied to the recurrent input. +/// @param diff_bias_desc Diff bias memory descriptor. 
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output +/// vector. +/// @param diff_dst_iter_desc Memory descriptor for the diff of output +/// recurrent hidden state vector. +/// @param flags Unused. +/// @param alpha Negative slope if activation is #dnnl_eltwise_relu. +/// @param beta Unused. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_vanilla_rnn_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, const dnnl_alg_kind_t activation, + const dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t diff_src_layer_desc, + const_dnnl_memory_desc_t diff_src_iter_desc, + const_dnnl_memory_desc_t diff_weights_layer_desc, + const_dnnl_memory_desc_t diff_weights_iter_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_layer_desc, + const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags, + float alpha, float beta, const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for an LSTM forward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p src_iter_c_desc, +/// - @p weights_peephole_desc, +/// - @p bias_desc, +/// - @p dst_iter_desc together with @p dst_iter_c_desc. 
+/// +/// This would then indicate that the LSTM forward propagation primitive should +/// not use them and should default to zero values instead. +/// +/// The @p weights_projection_desc could either be @c NULL or point to a zero +/// memory descriptor. This would then indicate that the LSTM doesn't have +/// recurrent projection layer. +/// +/// @note +/// All memory descriptors can be initialized with #dnnl_format_tag_any or +/// with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param src_iter_c_desc Memory descriptor for the input recurrent cell +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param weights_peephole_desc Memory descriptor for the weights applied to +/// the cell states (according to the Peephole LSTM formula). +/// @param weights_projection_desc Memory descriptor for the weights applied to +/// the hidden states to get the recurrent projection (according to the +/// Projection LSTM formula). +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param dst_iter_c_desc Memory descriptor for the output recurrent cell +/// state vector. +/// @param flags Unused. +/// @param attr Primitive attributes (can be NULL). 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_lstm_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t src_iter_c_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t weights_peephole_desc, + const_dnnl_memory_desc_t weights_projection_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t dst_iter_c_desc, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for an LSTM backward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p src_iter_c_desc, @p diff_src_iter_desc, +/// and @p diff_src_iter_c_desc, +/// - @p weights_peephole_desc together with @p diff_weights_peephole_desc, +/// - @p bias_desc together with @p diff_bias_desc, +/// - @p dst_iter_desc together with @p dst_iter_c_desc, @p diff_dst_iter_desc, +/// and @p diff_dst_iter_c_desc. +/// +/// This would then indicate that the LSTM backward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// The @p weights_projection_desc together with @p +/// diff_weights_projection_desc could either be @c NULL or point to a zero +/// memory descriptor. This would then indicate that the LSTM doesn't have +/// recurrent projection layer. +/// +/// @note +/// All memory descriptors can be initialized with #dnnl_format_tag_any or +/// with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. 
+/// @param prop_kind Propagation kind. Must be #dnnl_backward. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param src_iter_c_desc Memory descriptor for the input recurrent cell +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param weights_peephole_desc Memory descriptor for the weights applied to +/// the cell states (according to the Peephole LSTM formula). +/// @param weights_projection_desc Memory descriptor for the weights applied to +/// the hidden states to get the recurrent projection (according to the +/// Projection LSTM formula). +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param dst_iter_c_desc Memory descriptor for the output recurrent cell +/// state vector. +/// @param diff_src_layer_desc Memory descriptor for the diff of input vector. +/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent +/// hidden state vector. +/// @param diff_src_iter_c_desc Memory descriptor for the diff of input +/// recurrent cell state vector. +/// @param diff_weights_layer_desc Memory descriptor for the diff of weights +/// applied to the layer input. +/// @param diff_weights_iter_desc Memory descriptor for the diff of weights +/// applied to the recurrent input. +/// @param diff_weights_peephole_desc Memory descriptor for the diff of weights +/// applied to the cell states (according to the Peephole LSTM formula). 
+/// @param diff_weights_projection_desc Memory descriptor for the diff of +/// weights applied to the hidden states to get the recurrent projection +/// (according to the Projection LSTM formula). +/// @param diff_bias_desc Diff bias memory descriptor. +/// @param diff_dst_layer_desc Memory descriptor for the diff of output +/// vector. +/// @param diff_dst_iter_desc Memory descriptor for the diff of output +/// recurrent hidden state vector. +/// @param diff_dst_iter_c_desc Memory descriptor for the diff of output +/// recurrent cell state vector. +/// @param flags Unused. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_lstm_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t src_iter_c_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t weights_peephole_desc, + const_dnnl_memory_desc_t weights_projection_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t dst_iter_c_desc, + const_dnnl_memory_desc_t diff_src_layer_desc, + const_dnnl_memory_desc_t diff_src_iter_desc, + const_dnnl_memory_desc_t diff_src_iter_c_desc, + const_dnnl_memory_desc_t diff_weights_layer_desc, + const_dnnl_memory_desc_t diff_weights_iter_desc, + const_dnnl_memory_desc_t diff_weights_peephole_desc, + const_dnnl_memory_desc_t diff_weights_projection_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_layer_desc, + const_dnnl_memory_desc_t 
diff_dst_iter_desc, + const_dnnl_memory_desc_t diff_dst_iter_c_desc, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for GRU forward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc, +/// - @p bias_desc, +/// - @p dst_iter_desc. +/// +/// This would then indicate that the GRU forward propagation primitive should +/// not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param flags Unused. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_gru_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for GRU backward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p diff_src_iter_desc, +/// - @p bias_desc together with @p diff_bias_desc, +/// - @p dst_iter_desc together with @p diff_dst_iter_desc. +/// +/// This would then indicate that the GRU backward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Must be #dnnl_backward. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. 
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param diff_src_layer_desc Memory descriptor for the diff of input vector. +/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent +/// hidden state vector. +/// @param diff_weights_layer_desc Memory descriptor for the diff of weights +/// applied to the layer input. +/// @param diff_weights_iter_desc Memory descriptor for the diff of weights +/// applied to the recurrent input. +/// @param diff_bias_desc Diff bias memory descriptor. +/// @param diff_dst_layer_desc Memory descriptor for the diff of output +/// vector. +/// @param diff_dst_iter_desc Memory descriptor for the diff of output +/// recurrent hidden state vector. +/// @param flags Unused. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_gru_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t diff_src_layer_desc, + const_dnnl_memory_desc_t diff_src_iter_desc, + const_dnnl_memory_desc_t diff_weights_layer_desc, + const_dnnl_memory_desc_t diff_weights_iter_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_layer_desc, + const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for LBR GRU forward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc, +/// - @p bias_desc, +/// - @p dst_iter_desc. +/// +/// This would then indicate that the LBR GRU forward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. 
+/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param flags Unused. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_lbr_gru_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for LBR GRU backward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p diff_src_iter_desc, +/// - @p bias_desc together with @p diff_bias_desc, +/// - @p dst_iter_desc together with @p diff_dst_iter_desc. +/// +/// This would then indicate that the LBR GRU backward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Must be #dnnl_backward. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. 
+/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param diff_src_layer_desc Memory descriptor for the diff of input vector. +/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent +/// hidden state vector. +/// @param diff_weights_layer_desc Memory descriptor for the diff of weights +/// applied to the layer input. +/// @param diff_weights_iter_desc Memory descriptor for the diff of weights +/// applied to the recurrent input. +/// @param diff_bias_desc Diff bias memory descriptor. +/// @param diff_dst_layer_desc Memory descriptor for the diff of output +/// vector. +/// @param diff_dst_iter_desc Memory descriptor for the diff of output +/// recurrent hidden state vector. +/// @param flags Unused. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_lbr_gru_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t diff_src_layer_desc, + const_dnnl_memory_desc_t diff_src_iter_desc, + const_dnnl_memory_desc_t diff_weights_layer_desc, + const_dnnl_memory_desc_t diff_weights_iter_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_layer_desc, + const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for AUGRU forward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc, +/// - @p bias_desc, +/// - @p dst_iter_desc. +/// +/// This would then indicate that the AUGRU forward propagation primitive should +/// not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param attention_desc Memory descriptor for the attention vector. 
+/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param flags Unused. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_augru_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t attention_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for AUGRU backward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p diff_src_iter_desc, +/// - @p bias_desc together with @p diff_bias_desc, +/// - @p dst_iter_desc together with @p diff_dst_iter_desc. +/// +/// This would then indicate that the AUGRU backward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Must be #dnnl_backward. 
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param attention_desc Memory descriptor for the attention vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param diff_src_layer_desc Memory descriptor for the diff of input vector. +/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent +/// hidden state vector. +/// @param diff_attention_desc Memory descriptor for the diff of attention vector. +/// @param diff_weights_layer_desc Memory descriptor for the diff of weights +/// applied to the layer input. +/// @param diff_weights_iter_desc Memory descriptor for the diff of weights +/// applied to the recurrent input. +/// @param diff_bias_desc Diff bias memory descriptor. +/// @param diff_dst_layer_desc Memory descriptor for the diff of output +/// vector. +/// @param diff_dst_iter_desc Memory descriptor for the diff of output +/// recurrent hidden state vector. +/// @param flags Unused. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_augru_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t attention_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t diff_src_layer_desc, + const_dnnl_memory_desc_t diff_src_iter_desc, + const_dnnl_memory_desc_t diff_attention_desc, + const_dnnl_memory_desc_t diff_weights_layer_desc, + const_dnnl_memory_desc_t diff_weights_iter_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_layer_desc, + const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for LBR AUGRU forward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc, +/// - @p bias_desc, +/// - @p dst_iter_desc. +/// +/// This would then indicate that the LBR AUGRU forward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param attention_desc Memory descriptor for the attention vector. 
+/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param flags Unused. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_lbr_augru_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t attention_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, unsigned flags, + const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for LBR AUGRU backward propagation primitive. +/// +/// The following arguments may either be @c NULL or point to a zero memory +/// descriptor: +/// - @p src_iter_desc together with @p diff_src_iter_desc, +/// - @p bias_desc together with @p diff_bias_desc, +/// - @p dst_iter_desc together with @p diff_dst_iter_desc. +/// +/// This would then indicate that the LBR AUGRU backward propagation primitive +/// should not use them and should default to zero values instead. +/// +/// @note +/// All memory descriptors can be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Must be #dnnl_backward. 
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more +/// info. +/// @param src_layer_desc Memory descriptor for the input vector. +/// @param src_iter_desc Memory descriptor for the input recurrent hidden +/// state vector. +/// @param attention_desc Memory descriptor for the attention vector. +/// @param weights_layer_desc Memory descriptor for the weights applied to the +/// layer input. +/// @param weights_iter_desc Memory descriptor for the weights applied to the +/// recurrent input. +/// @param bias_desc Bias memory descriptor. +/// @param dst_layer_desc Memory descriptor for the output vector. +/// @param dst_iter_desc Memory descriptor for the output recurrent hidden +/// state vector. +/// @param diff_src_layer_desc Memory descriptor for the diff of input vector. +/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent +/// hidden state vector. +/// @param diff_attention_desc Memory descriptor for the diff of attention vector. +/// @param diff_weights_layer_desc Memory descriptor for the diff of weights +/// applied to the layer input. +/// @param diff_weights_iter_desc Memory descriptor for the diff of weights +/// applied to the recurrent input. +/// @param diff_bias_desc Diff bias memory descriptor. +/// @param diff_dst_layer_desc Memory descriptor for the diff of output +/// vector. +/// @param diff_dst_iter_desc Memory descriptor for the diff of output +/// recurrent hidden state vector. +/// @param flags Unused. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_lbr_augru_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction, + const_dnnl_memory_desc_t src_layer_desc, + const_dnnl_memory_desc_t src_iter_desc, + const_dnnl_memory_desc_t attention_desc, + const_dnnl_memory_desc_t weights_layer_desc, + const_dnnl_memory_desc_t weights_iter_desc, + const_dnnl_memory_desc_t bias_desc, + const_dnnl_memory_desc_t dst_layer_desc, + const_dnnl_memory_desc_t dst_iter_desc, + const_dnnl_memory_desc_t diff_src_layer_desc, + const_dnnl_memory_desc_t diff_src_iter_desc, + const_dnnl_memory_desc_t diff_attention_desc, + const_dnnl_memory_desc_t diff_weights_layer_desc, + const_dnnl_memory_desc_t diff_weights_iter_desc, + const_dnnl_memory_desc_t diff_bias_desc, + const_dnnl_memory_desc_t diff_dst_layer_desc, + const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_rnn + +/// @addtogroup dnnl_api_matmul +/// @{ + +/// Creates a primitive descriptor for a matrix multiplication primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param src_desc Source memory descriptor (matrix A) +/// @param weights_desc Weights memory descriptor (matrix B) +/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory +/// descriptor, or a memory descriptor with format_kind set to +/// #dnnl_format_kind_undef disables the bias term. +/// @param dst_desc Destination memory descriptor (matrix C). +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_matmul_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t weights_desc, + const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_matmul + +/// @addtogroup dnnl_api_resampling Resampling +/// @{ + +/// Creates a primitive descriptor for a resampling forward propagation +/// primitive. +/// +/// @note +/// Destination memory descriptor is allowed to be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param prop_kind Propagation kind. Possible values are +/// #dnnl_forward_training and #dnnl_forward_inference. +/// @param alg_kind resampling algorithm kind: either #dnnl_resampling_nearest, +/// or #dnnl_resampling_linear. +/// @param factors Array of scaling factors for spatial dimension. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_resampling_forward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind, + const float *factors, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr); + +/// Creates a primitive descriptor for a resampling backward propagation +/// primitive. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind resampling algorithm kind: either +/// #dnnl_resampling_nearest, or #dnnl_resampling_linear. +/// @param diff_src_desc Diff source memory descriptor. 
+/// @param diff_dst_desc Diff destination memory descriptor. +/// @param factors Array of scaling factors for spatial dimension. +/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation +/// primitive. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +/// +dnnl_status_t DNNL_API dnnl_resampling_backward_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const float *factors, + const_dnnl_memory_desc_t diff_src_desc, + const_dnnl_memory_desc_t diff_dst_desc, + const_dnnl_primitive_desc_t hint_fwd_pd, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_resampling + +/// @addtogroup dnnl_api_reduction Reduction +/// @{ + +/// Creates a primitive descriptor for a reduction primitive. +/// +/// @note +/// Destination memory descriptor is allowed to be initialized with +/// #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind reduction algorithm kind. Possible values: +/// #dnnl_reduction_max, #dnnl_reduction_min, #dnnl_reduction_sum, +/// #dnnl_reduction_mul, #dnnl_reduction_mean, #dnnl_reduction_norm_lp_max, +/// #dnnl_reduction_norm_lp_sum, #dnnl_reduction_norm_lp_power_p_max, +/// #dnnl_reduction_norm_lp_power_p_sum. +/// @param p Algorithm specific parameter. +/// @param eps Algorithm specific parameter. +/// @param src_desc Source memory descriptor. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes (can be NULL). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_reduction_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src_desc, + const_dnnl_memory_desc_t dst_desc, float p, float eps, + const_dnnl_primitive_attr_t attr); + +/// @} dnnl_api_reduction + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_primitive_cache +/// @{ + +/// Returns the number of primitives that can be held in the primitive cache +/// at the same time. +/// +/// @param capacity Primitive cache capacity to query. Concurrently +/// accessing @p capacity is safe. +/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the +/// @p capacity value is invalid, and #dnnl_success/#dnnl::status::success on +/// success. +dnnl_status_t DNNL_API dnnl_get_primitive_cache_capacity(int *capacity); + +/// Sets a number of primitives that can be held in the primitive cache +/// at a time. +/// +/// @param capacity Primitive cache capacity to set. If a new @p capacity is +/// less than a number of primitives that the primitive cache already has +/// then the excess entries will be evicted. Setting the @p capacity to 0 +/// clears the primitive cache and disables it. Concurrently modifying +/// @p capacity is safe. +/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the +/// @p capacity value is invalid, and #dnnl_success/#dnnl::status::success on +/// success. +dnnl_status_t DNNL_API dnnl_set_primitive_cache_capacity(int capacity); + +/// @} dnnl_api_primitive_cache + +/// @addtogroup dnnl_api_service +/// @{ + +/// Configures dumping of JIT-generated code. +/// +/// @note +/// This setting overrides the DNNL_JIT_DUMP environment variable. +/// +/// @param enable Flag value. Set to 0 to disable and set to 1 to enable. +/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the +/// @p enable value is invalid, and #dnnl_success/#dnnl::status::success on +/// success. 
+dnnl_status_t DNNL_API dnnl_set_jit_dump(int enable); + +/// Sets library profiling flags. The flags define which profilers are +/// supported. +/// +/// @note +/// This setting overrides DNNL_JIT_PROFILE environment variable. +/// +/// @sa @ref dev_guide_profilers +/// +/// @param flags Profiling flags that can contain the following bits: +/// - @ref DNNL_JIT_PROFILE_VTUNE -- integration with VTune Profiler +/// (on by default) +/// - @ref DNNL_JIT_PROFILE_LINUX_JITDUMP -- produce Linux-specific +/// jit-pid.dump output (off by default). The location of the output +/// is controlled via JITDUMPDIR environment variable or via +/// dnnl_set_jit_profiling_jitdumpdir() function. +/// - @ref DNNL_JIT_PROFILE_LINUX_PERFMAP -- produce Linux-specific +/// perf-pid.map output (off by default). The output is always placed +/// into /tmp. +/// +/// Passing @ref DNNL_JIT_PROFILE_NONE disables profiling completely. +/// +/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the +/// @p flags value is invalid, and #dnnl_success/#dnnl::status::success on +/// success. +dnnl_status_t DNNL_API dnnl_set_jit_profiling_flags(unsigned flags); + +/// Sets JIT dump output path. Only applicable to Linux and is only +/// used when profiling flags have DNNL_JIT_PROFILE_LINUX_PERF bit set. +/// +/// After the first JIT kernel is generated, the jitdump output will be placed +/// into temporary directory created using the mkdtemp template +/// 'dir/.debug/jit/dnnl.XXXXXX'. +/// +/// @sa @ref dev_guide_profilers +/// +/// @note +/// This setting overrides JITDUMPDIR environment variable. If +/// JITDUMPDIR is not set, and this function is never called, the path +/// defaults to HOME. Passing NULL reverts the value to default. +/// +/// @note +/// The directory is accessed only when the first JIT kernel is being +/// created. JIT profiling will be disabled in case of any errors +/// accessing or creating this directory. +/// +/// @param dir JIT dump output path. 
+/// @returns #dnnl_success/#dnnl::status::success if the +/// output directory was set correctly and an error status otherwise. +/// @returns #dnnl_unimplemented/#dnnl::status::unimplemented on Windows. +dnnl_status_t DNNL_API dnnl_set_jit_profiling_jitdumpdir(const char *dir); + +/// Sets the maximal ISA the library can dispatch to on the CPU. See +/// #dnnl_cpu_isa_t and #dnnl::cpu_isa for the list of the values accepted by +/// the C and C++ API functions respectively. +/// +/// This function has effect only once, and returns an error on subsequent +/// calls. It should also be invoked before any other oneDNN API call, otherwise +/// it may return an error. +/// +/// This function overrides the DNNL_MAX_CPU_ISA environment variable. The +/// environment variable can be set to the desired maximal ISA name in upper +/// case and with dnnl_cpu_isa prefix removed. For example: +/// `DNNL_MAX_CPU_ISA=AVX2`. +/// +/// @note +/// The ISAs are only partially ordered: +/// - SSE41 < AVX < AVX2 < AVX2_VNNI < AVX2_VNNI_2, +/// - AVX2 < AVX512_CORE < AVX512_CORE_VNNI < AVX512_CORE_BF16 +/// < AVX10_1_512 < AVX10_1_512_AMX < AVX10_1_512_AMX_FP16, +/// - AVX2_VNNI < AVX10_1_512. +/// Aliases: +/// - AVX512_CORE_FP16 = AVX10_1_512 +/// - AVX512_CORE_AMX = AVX10_1_512_AMX +/// - AVX512_CORE_AMX_FP16 = AVX10_1_512_AMX_FP16 +/// +/// @sa @ref dev_guide_cpu_dispatcher_control for more details +/// +/// @param isa Maximal ISA the library should dispatch to. Pass +/// #dnnl_cpu_isa_default/#dnnl::cpu_isa::isa_default to remove ISA restrictions +/// (except for ISAs with initial support in the library). +/// @returns #dnnl_success/#dnnl::status::success on success and a +/// #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the @p isa +/// parameter is invalid or the ISA cannot be changed at this time. +/// @returns #dnnl_unimplemented/#dnnl::status::unimplemented if the feature +/// was disabled at build time (see @ref dev_guide_build_options for more +/// details). 
+dnnl_status_t DNNL_API dnnl_set_max_cpu_isa(dnnl_cpu_isa_t isa); + +/// Gets the maximal ISA the library can dispatch to on the CPU. See +/// #dnnl_cpu_isa_t and #dnnl::cpu_isa for the list of the values returned by +/// the C and C++ API functions respectively. +/// +/// @sa @ref dev_guide_cpu_dispatcher_control for more details +/// +/// @returns #dnnl_cpu_isa_t value reflecting the maximal ISA the library may +/// dispatch to. +dnnl_cpu_isa_t DNNL_API dnnl_get_effective_cpu_isa(void); + +/// Sets the hints flag for the CPU ISA. See #dnnl_cpu_isa_hints_t and +/// #dnnl::cpu_isa_hints for the list of the values accepted by the C and C++ +/// API functions respectively. +/// +/// This function has effect only once, and returns an error on subsequent +/// calls. It should also be invoked before any other oneDNN API call, otherwise +/// it may return an error. +/// +/// This function overrides the DNNL_CPU_ISA_HINTS environment variable. +/// @sa @ref dev_guide_cpu_isa_hints for more details +/// +/// @param isa_hints CPU ISA hints to be passed over to the implementation. +/// Pass #dnnl_cpu_isa_no_hints/#dnnl::cpu_isa_hints::no_hints to use +/// default features i.e. no hints. +/// @returns #dnnl_success/#dnnl::status::success on success and a +/// #dnnl_runtime_error/#dnnl::status::runtime_error if the ISA hints cannot +/// be specified at the current time. +/// @returns #dnnl_unimplemented/#dnnl::status::unimplemented if the feature +/// was disabled at build time (see @ref dev_guide_build_options for more +/// details). +dnnl_status_t DNNL_API dnnl_set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints); + +/// Gets the ISA specific hints that library can follow. See +/// #dnnl_cpu_isa_hints_t and #dnnl::cpu_isa_hints for the list of the values +/// returned by the C and C++ API functions respectively. 
+/// +/// @sa @ref dev_guide_cpu_isa_hints for more details +/// +/// @returns #dnnl_cpu_isa_hints_t value reflecting the ISA specific hints the +/// library can follow. +dnnl_cpu_isa_hints_t DNNL_API dnnl_get_cpu_isa_hints(void); + +/// @} dnnl_api_service + +#ifdef DNNL_EXPERIMENTAL_PROFILING + +/// @addtogroup dnnl_api_profiling Profiling +/// @{ + +/// Resets a profiler's state. +/// +/// @param stream Stream associated with the profiler. +/// +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_reset_profiling(dnnl_stream_t stream); + +/// Queries profiling data. The profiling data accumulates for each primitive +/// execution. The @p num_entries will be equal to the number of executions +/// since the last `dnnl_reset_profiling` call. In order to query the +/// @p num_entries the @p data parameter should be NULL. When @p data is NULL +/// then the @p data_kind parameter is ignored. +/// +/// The profiling data can be reset by calling #dnnl_reset_profiling. +/// +/// @note +/// It is required to wait for all submitted primitives to complete +/// using #dnnl_stream_wait prior to querying profiling data. +/// +/// @param stream Stream that was used for executing a primitive that +/// is being profiled. +/// @param data_kind Profiling data kind to query. +/// @param num_entries Number of profiling data entries. +/// @param data Profiling data. +/// +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_query_profiling_data(dnnl_stream_t stream, + dnnl_profiling_data_kind_t data_kind, int *num_entries, uint64_t *data); + +/// @} dnnl_api_profiling +#endif + +/// @addtogroup dnnl_api_blas +/// @{ + +/// Performs single-precision matrix-matrix multiply. 
+/// +/// The operation is defined as: +/// +/// `C := alpha * op( A ) * op( B ) + beta * C` +/// +/// where +/// - `op( X ) = X` or `op( X ) = X**T`, +/// - `alpha` and `beta` are scalars, and +/// - `A`, `B`, and `C` are matrices: +/// - `op( A )` is an `MxK` matrix, +/// - `op( B )` is an `KxN` matrix, +/// - `C` is an `MxN` matrix. +/// +/// The matrices are assumed to be stored in row-major order (the elements in +/// each of the matrix rows are contiguous in memory). +/// +/// @note +/// This API does not support XERBLA. Instead, unlike the standard BLAS +/// functions, this one returns a dnnl_status_t value to allow error +/// handling. +/// +/// @param transa Transposition flag for matrix A: 'N' or 'n' means A is not +/// transposed, and 'T' or 't' means that A is transposed. +/// @param transb Transposition flag for matrix B: 'N' or 'n' means B is not +/// transposed, and 'T' or 't' means that B is transposed. +/// @param M The M dimension. +/// @param N The N dimension. +/// @param K The K dimension. +/// @param alpha The alpha parameter that is used to scale the product of +/// matrices A and B. +/// @param A A pointer to the A matrix data. +/// @param lda The leading dimension for the matrix A. +/// @param B A pointer to the B matrix data. +/// @param ldb The leading dimension for the matrix B. +/// @param beta The beta parameter that is used to scale the matrix C. +/// @param C A pointer to the C matrix data. +/// @param ldc The leading dimension for the matrix C. +/// @returns #dnnl_success/#dnnl::status::success on success and a status +/// describing the error otherwise. +dnnl_status_t DNNL_API dnnl_sgemm(char transa, char transb, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda, + const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc); + +/// Performs integer matrix-matrix multiply on 8-bit unsigned matrix A, 8-bit +/// signed matrix B, and 32-bit signed resulting matrix C. 
+/// +/// The operation is defined as: +/// +/// `C := alpha * (op(A) - A_offset) * (op(B) - B_offset) + beta * C + C_offset` +/// +/// where +/// - `op( X ) = X` or `op( X ) = X**T`, +/// - `alpha` and `beta` are scalars, and +/// - `A`, `B`, and `C` are matrices: +/// - `op( A )` is an `MxK` matrix, +/// - `op( B )` is an `KxN` matrix, +/// - `C` is an `MxN` matrix. +/// - `A_offset` is an `MxK` matrix with every element equal to the `ao` value, +/// - `B_offset` is an `KxN` matrix with every element equal to the `bo` value, +/// - `C_offset` is an `MxN` matrix which is defined by the `co` array of size `len`: +/// - if `offsetc = F`: the `len` must be at least `1`, +/// - if `offsetc = C`: the `len` must be at least `max(1, M)`, +/// - if `offsetc = R`: the `len` must be at least `max(1, N)`, +/// +/// The matrices are assumed to be stored in row-major order (the elements in +/// each of the matrix rows are contiguous in memory). +/// +/// @note +/// This API does not support XERBLA. Instead, unlike the standard BLAS +/// functions, this one returns a dnnl_status_t value to allow error +/// handling. +/// +/// @warning +/// On some architectures saturation may happen during intermediate +/// computations, which would lead to unexpected results. For more +/// details, refer to @ref dev_guide_int8_computations. +/// +/// @param transa Transposition flag for matrix A: 'N' or 'n' means A is not +/// transposed, and 'T' or 't' means that A is transposed. +/// @param transb Transposition flag for matrix B: 'N' or 'n' means B is not +/// transposed, and 'T' or 't' means that B is transposed. +/// @param offsetc Flag specifying how offsets should be applied to matrix C: +/// - 'F' means that the same offset will be applied to each element of +/// the matrix C, +/// - 'C' means that individual offset will be applied to each element +/// within each column, +/// - 'R' means that individual offset will be applied to each element +/// within each row. 
+/// @param M The M dimension. +/// @param N The N dimension. +/// @param K The K dimension. +/// @param alpha The alpha parameter that is used to scale the product of +/// matrices A and B. +/// @param A A pointer to the A matrix data. +/// @param lda The leading dimension for the matrix A. +/// @param ao The offset value for the matrix A. +/// @param B A pointer to the B matrix data. +/// @param ldb The leading dimension for the matrix B. +/// @param bo The offset value for the matrix B. +/// @param beta The beta parameter that is used to scale the matrix C. +/// @param C A pointer to the C matrix data. +/// @param ldc The leading dimension for the matrix C. +/// @param co An array of offset values for the matrix C. The number of +/// elements in the array depends on the value of @p offsetc. +/// @returns #dnnl_success/#dnnl::status::success on success and a status +/// describing the error otherwise. +dnnl_status_t DNNL_API dnnl_gemm_u8s8s32(char transa, char transb, char offsetc, + dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A, + dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo, + float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co); + +/// Performs integer matrix-matrix multiply on 8-bit signed matrix A, 8-bit +/// signed matrix B, and 32-bit signed resulting matrix C. +/// +/// The operation is defined as: +/// +/// `C := alpha * (op(A) - A_offset) * (op(B) - B_offset) + beta * C + C_offset` +/// +/// where +/// - `op( X ) = X` or `op( X ) = X**T`, +/// - `alpha` and `beta` are scalars, and +/// - `A`, `B`, and `C` are matrices: +/// - `op( A )` is an `MxK` matrix, +/// - `op( B )` is an `KxN` matrix, +/// - `C` is an `MxN` matrix. 
+/// - `A_offset` is an `MxK` matrix with every element equal to the `ao` value, +/// - `B_offset` is an `KxN` matrix with every element equal to the `bo` value, +/// - `C_offset` is an `MxN` matrix which is defined by the `co` array of size `len`: +/// - if `offsetc = F`: the `len` must be at least `1`, +/// - if `offsetc = C`: the `len` must be at least `max(1, M)`, +/// - if `offsetc = R`: the `len` must be at least `max(1, N)`, +/// +/// The matrices are assumed to be stored in row-major order (the elements in +/// each of the matrix rows are contiguous in memory). +/// +/// @note +/// This API does not support XERBLA. Instead, unlike the standard BLAS +/// functions, this one returns a dnnl_status_t value to allow error +/// handling. +/// +/// @warning +/// On some architectures saturation may happen during intermediate +/// computations, which would lead to unexpected results. For more +/// details, refer to @ref dev_guide_int8_computations. +/// +/// @param transa Transposition flag for matrix A: 'N' or 'n' means A is not +/// transposed, and 'T' or 't' means that A is transposed. +/// @param transb Transposition flag for matrix B: 'N' or 'n' means B is not +/// transposed, and 'T' or 't' means that B is transposed. +/// @param offsetc Flag specifying how offsets should be applied to matrix C: +/// - 'F' means that the same offset will be applied to each element of +/// the matrix C, +/// - 'C' means that individual offset will be applied to each element +/// within each column, +/// - 'R' means that individual offset will be applied to each element +/// within each row. +/// @param M The M dimension. +/// @param N The N dimension. +/// @param K The K dimension. +/// @param alpha The alpha parameter that is used to scale the product of +/// matrices A and B. +/// @param A A pointer to the A matrix data. +/// @param lda The leading dimension for the matrix A. +/// @param ao The offset value for the matrix A. +/// @param B A pointer to the B matrix data. 
+/// @param ldb The leading dimension for the matrix B. +/// @param bo The offset value for the matrix B. +/// @param beta The beta parameter that is used to scale the matrix C. +/// @param C A pointer to the C matrix data. +/// @param ldc The leading dimension for the matrix C. +/// @param co An array of offset values for the matrix C. The number of +/// elements in the array depends on the value of @p offsetc. +/// @returns #dnnl_success/#dnnl::status::success on success and a status +/// describing the error otherwise. +dnnl_status_t DNNL_API dnnl_gemm_s8s8s32(char transa, char transb, char offsetc, + dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A, + dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo, + float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co); + +/// @} dnnl_api_blas + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_H */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..92dd01c09049bec24644e16d78861ee9c829b875 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.hpp @@ -0,0 +1,14071 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2016-2025 Intel Corporation +* Copyright 2024 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// C++ API + +#ifndef ONEAPI_DNNL_DNNL_HPP +#define ONEAPI_DNNL_DNNL_HPP + +#include "oneapi/dnnl/dnnl_config.h" + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include +#include +#include +#include +#include +#include + +#include "oneapi/dnnl/dnnl.h" +#include "oneapi/dnnl/dnnl_common.hpp" + +/// @endcond + +/// @addtogroup dnnl_api oneDNN API +/// @{ + +/// oneDNN namespace +namespace dnnl { + +/// @addtogroup dnnl_api_utils Utilities +/// Utility types and definitions. +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS +template +void validate_container_size(const T &v, const char *error_message, + int min_size = 1, int max_size = -1) { + const int size = (int)v.size(); + if (size < min_size || (max_size >= 0 && size > max_size)) + DNNL_THROW_ERROR(dnnl_invalid_arguments, error_message); +} +/// @endcond + +/// @cond DO_NOT_DOCUMENT_THIS +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_memory_desc_t p) { + return dnnl_memory_desc_destroy(p); + } +}; + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_memory_t p) { + return dnnl_memory_destroy(p); + } +}; + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_primitive_desc_t p) { + return dnnl_primitive_desc_destroy(p); + } +}; + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_primitive_t p) { + return dnnl_primitive_destroy(p); + } +}; + +/// @endcond + +/// @} dnnl_api_utils + +struct stream; +struct memory; +struct 
primitive_desc; + +/// @addtogroup dnnl_api_primitives Primitives +/// Compute primitives +/// @sa @ref dev_guide_basic_concepts +/// @{ + +/// @addtogroup dnnl_api_primitives_common Common +/// Common operations to create, destroy and inspect primitives +/// @{ + +/// Base class for all computational primitives. +struct primitive : public handle { + /// Kinds of primitives supported by the library. + enum class kind { + /// Undefined primitive + undef = dnnl_undefined_primitive, + /// A reorder primitive. + reorder = dnnl_reorder, + /// A shuffle primitive. + shuffle = dnnl_shuffle, + /// A (out-of-place) tensor concatenation primitive. + concat = dnnl_concat, + /// A summation primitive. + sum = dnnl_sum, + /// A convolution primitive. + convolution = dnnl_convolution, + /// A deconvolution primitive. + deconvolution = dnnl_deconvolution, + /// An element-wise primitive. + eltwise = dnnl_eltwise, + /// An LRN primitive. + lrn = dnnl_lrn, + /// A batch normalization primitive. + batch_normalization = dnnl_batch_normalization, + /// An inner product primitive. + inner_product = dnnl_inner_product, + /// An RNN primitive. + rnn = dnnl_rnn, + /// A binary primitive. + binary = dnnl_binary, + /// A matmul (matrix multiplication) primitive. + matmul = dnnl_matmul, + /// A resampling primitive. + resampling = dnnl_resampling, + /// A pooling primitive. + pooling = dnnl_pooling, + /// A reduction primitive. + reduction = dnnl_reduction, + /// A PReLU primitive. + prelu = dnnl_prelu, + /// A softmax primitive. + softmax = dnnl_softmax, + /// A layer normalization primitive. + layer_normalization = dnnl_layer_normalization, + /// A group normalization primitive + group_normalization = dnnl_group_normalization, + }; + + using handle::handle; + + /// Default constructor. Constructs an empty object. + primitive() = default; + + /// Constructs a primitive from a C API primitive descriptor. + /// + /// @param c_pd C API primitive descriptor. 
+ primitive(const_dnnl_primitive_desc_t c_pd); + + /// Constructs a primitive from a C API primitive descriptor and a cache blob. + /// + /// @param c_pd C API primitive descriptor. + /// @param cache_blob Cache blob. + primitive(const_dnnl_primitive_desc_t c_pd, + const std::vector &cache_blob); + + /// Constructs a primitive from a primitive descriptor. + /// + /// @param pd Primitive descriptor. + primitive(const primitive_desc &pd); + + /// Constructs a primitive from a primitive descriptor and a cache blob. + /// + /// @param pd Primitive descriptor. + /// @param cache_blob Cache blob. + primitive(const primitive_desc &pd, const std::vector &cache_blob); + + /// Returns the C API primitive descriptor of the underlying C API + /// primitive. + /// + /// @returns The underlying C API primitive descriptor. + inline const_dnnl_primitive_desc_t get_primitive_desc() const; + + /// Returns the kind of the primitive. + /// + /// @returns The primitive kind. + inline kind get_kind() const; + + /// Returns a cache blob for the primitive. + /// + /// @returns Vector containing the cache blob. + /// + /// @note The cache blob can be empty. It's the user's responsibility to + /// check whether it's empty prior to passing it to the primitive + /// constructor. + inline std::vector get_cache_blob() const; + + /// Executes computations specified by the primitive in a specified stream. + /// + /// Arguments are passed via an arguments map containing (index, memory object) pairs. The index must be one of the `DNNL_ARG_*` values + /// such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor + /// matching the one returned by + /// primitive_desc::query_md(#query::exec_arg_md, index) unless using + /// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL). + /// + /// @param astream Stream object. The stream must belong to the same engine + /// as the primitive. + /// @param args Arguments map. 
+ void execute(const stream &astream, + const std::unordered_map &args) const; +}; + +/// Converts primitive kind enum value from C++ API to C API type. +/// +/// @param akind C++ API primitive kind enum value. +/// @returns Corresponding C API primitive kind enum value. +inline dnnl_primitive_kind_t convert_to_c(primitive::kind akind) { + return static_cast(akind); +} + +const_dnnl_primitive_desc_t primitive::get_primitive_desc() const { + const_dnnl_primitive_desc_t pd; + error::wrap_c_api(dnnl_primitive_get_primitive_desc(get(), &pd), + "could not get a primitive descriptor from a primitive"); + return pd; +} + +dnnl::primitive::kind primitive::get_kind() const { + const_dnnl_primitive_desc_t pd = get_primitive_desc(); + // TODO (Roma): the code below is only needed because get_primitive_desc + // returns a C type. + dnnl_primitive_kind_t kind; + error::wrap_c_api(dnnl_primitive_desc_query( + pd, dnnl_query_primitive_kind, 0, (void *)&kind), + "could not get a primitive kind from a primitive descriptor"); + return static_cast(kind); +} + +std::vector primitive::get_cache_blob() const { + size_t size; + error::wrap_c_api(dnnl_primitive_get_cache_blob(get(), &size, nullptr), + "could not get cache blob size from a primitive"); + + std::vector cache_blob(size); + error::wrap_c_api( + dnnl_primitive_get_cache_blob(get(), &size, cache_blob.data()), + "could not get a cache blob from a primitive"); + return cache_blob; +} + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_attributes +/// +/// A container for parameters that extend primitives behavior. +/// +/// Attributes can also contain Post-ops, which are computations executed +/// after the primitive. 
+/// +/// @sa @ref dev_guide_attributes +/// @sa @ref dev_guide_attributes_post_ops +/// +/// @{ + +/// Scratchpad mode +enum class scratchpad_mode { + /// The library manages the scratchpad allocation according to the policy + /// specified by the `DNNL_ENABLE_CONCURRENT_EXEC` + /// [build option](@ref dev_guide_build_options) (default). + /// + /// When `DNNL_ENABLE_CONCURRENT_EXEC=OFF` (default), the library + /// scratchpad is common to all primitives to reduce the memory footprint. + /// This configuration comes with limited thread-safety properties, namely + /// primitives can be created and executed in parallel but cannot migrate + /// between threads (in other words, each primitive should be executed in + /// the same thread it was created in). + /// + /// When `DNNL_ENABLE_CONCURRENT_EXEC=ON`, the library scratchpad is + /// private to each primitive. The memory footprint is larger than when + /// using `DNNL_ENABLE_CONCURRENT_EXEC=OFF` but different primitives can be + /// created and run concurrently (the same primitive cannot be run + /// concurrently from two different threads though). + library = dnnl_scratchpad_mode_library, + /// The user manages the scratchpad allocation by querying and providing + /// the scratchpad memory to primitives. This mode is thread-safe as long + /// as the scratchpad buffers are not used concurrently by two primitive + /// executions. + user = dnnl_scratchpad_mode_user, +}; + +/// Converts a scratchpad mode enum value from C++ API to C API type. +/// +/// @param mode C++ API scratchpad mode enum value. +/// @returns Corresponding C API scratchpad mode enum value. 
+inline dnnl_scratchpad_mode_t convert_to_c(scratchpad_mode mode) { + return static_cast(mode); +} + +/// Rounding mode +enum class rounding_mode { + /// rounding mode dictated by the floating-point environment + environment = dnnl_rounding_mode_environment, + /// stochastic rounding mode where a random bias is added to the + /// trailing mantissa bits before conversion. + stochastic = dnnl_rounding_mode_stochastic +}; + +/// Converts a rounding mode enum value from C++ API to C API type. +/// +/// @param mode C++ API rounding mode enum value. +/// @returns Corresponding C API rounding mode enum value. +inline dnnl_rounding_mode_t convert_to_c(rounding_mode mode) { + return static_cast(mode); +} + +/// Propagation kind. +enum class prop_kind { + /// Undefined propagation kind. + undef = dnnl_prop_kind_undef, + /// Forward data propagation (training mode). In this mode, primitives + /// perform computations necessary for subsequent backward propagation. + forward_training = dnnl_forward_training, + /// Forward data propagation (inference mode). In this mode, primitives + /// perform only computations that are necessary for inference and omit + /// computations that are necessary only for backward propagation. + forward_inference = dnnl_forward_inference, + /// Forward data propagation, + /// alias for #dnnl::prop_kind::forward_training. + forward = dnnl_forward, + /// Backward propagation (with respect to all parameters). + backward = dnnl_backward, + /// Backward data propagation. + backward_data = dnnl_backward_data, + /// Backward weights propagation. + backward_weights = dnnl_backward_weights, + /// Backward bias propagation. + backward_bias = dnnl_backward_bias +}; + +/// Converts propagation kind enum value from C++ API to C API type. +/// +/// @param akind C++ API propagation kind enum value. +/// @returns Corresponding C API propagation kind enum value. 
+inline dnnl_prop_kind_t convert_to_c(prop_kind akind) { + return static_cast(akind); +} + +/// Kinds of algorithms. +enum class algorithm { + /// Undefined algorithm + undef = dnnl_alg_kind_undef, + /// Convolution algorithm that is chosen to be either direct or Winograd + /// automatically + convolution_auto = dnnl_convolution_auto, + /// Direct convolution + convolution_direct = dnnl_convolution_direct, + /// Winograd convolution + convolution_winograd = dnnl_convolution_winograd, + /// Direct deconvolution + deconvolution_direct = dnnl_deconvolution_direct, + /// Winograd deconvolution + deconvolution_winograd = dnnl_deconvolution_winograd, + /// Elementwise: rectified linear unit (ReLU) + eltwise_relu = dnnl_eltwise_relu, + /// Elementwise: hyperbolic tangent non-linearity (tanh) + eltwise_tanh = dnnl_eltwise_tanh, + /// Elementwise: exponential linear unit (ELU) + eltwise_elu = dnnl_eltwise_elu, + /// Elementwise: square + eltwise_square = dnnl_eltwise_square, + /// Elementwise: abs + eltwise_abs = dnnl_eltwise_abs, + /// Elementwise: square root + eltwise_sqrt = dnnl_eltwise_sqrt, + /// Elementwise: swish (\f$x \cdot sigmoid(a \cdot x)\f$) + eltwise_swish = dnnl_eltwise_swish, + /// Elementwise: linear + eltwise_linear = dnnl_eltwise_linear, + /// Elementwise: soft_relu + eltwise_soft_relu = dnnl_eltwise_soft_relu, + /// Elementwise: mish + eltwise_mish = dnnl_eltwise_mish, + /// Elementwise: logistic + eltwise_logistic = dnnl_eltwise_logistic, + /// Elementwise: exponent + eltwise_exp = dnnl_eltwise_exp, + /// Elementwise: tanh-based gelu + eltwise_gelu_tanh = dnnl_eltwise_gelu_tanh, + /// Elementwise: erf-based gelu + eltwise_gelu_erf = dnnl_eltwise_gelu_erf, + /// Elementwise: natural logarithm + eltwise_log = dnnl_eltwise_log, + /// Elementwise: clip + eltwise_clip = dnnl_eltwise_clip, + /// Eltwise: clip version 2 + eltwise_clip_v2 = dnnl_eltwise_clip_v2, + /// Elementwise: pow + eltwise_pow = dnnl_eltwise_pow, + /// Elementwise: round + eltwise_round = 
dnnl_eltwise_round, + /// Elementwise: hardswish + eltwise_hardswish = dnnl_eltwise_hardswish, + /// Elementwise: hardsigmoid + eltwise_hardsigmoid = dnnl_eltwise_hardsigmoid, + /// Elementwise: rectified linar unit (ReLU) (dst for backward) + eltwise_relu_use_dst_for_bwd = dnnl_eltwise_relu_use_dst_for_bwd, + /// Elementwise: hyperbolic tangent non-linearity (tanh) (dst for backward) + eltwise_tanh_use_dst_for_bwd = dnnl_eltwise_tanh_use_dst_for_bwd, + /// Elementwise: exponential linear unit (ELU) (dst for backward) + eltwise_elu_use_dst_for_bwd = dnnl_eltwise_elu_use_dst_for_bwd, + /// Elementwise: square root (dst for backward) + eltwise_sqrt_use_dst_for_bwd = dnnl_eltwise_sqrt_use_dst_for_bwd, + /// Elementwise: logistic (dst for backward) + eltwise_logistic_use_dst_for_bwd = dnnl_eltwise_logistic_use_dst_for_bwd, + /// Elementwise: exponent (dst for backward) + eltwise_exp_use_dst_for_bwd = dnnl_eltwise_exp_use_dst_for_bwd, + /// Elementwise: clip version 2 (dst for backward) + eltwise_clip_v2_use_dst_for_bwd = dnnl_eltwise_clip_v2_use_dst_for_bwd, + /// Local response normalization (LRN) across multiple channels + lrn_across_channels = dnnl_lrn_across_channels, + /// LRN within a single channel + lrn_within_channel = dnnl_lrn_within_channel, + /// Max pooling + pooling_max = dnnl_pooling_max, + /// Average pooling include padding + pooling_avg_include_padding = dnnl_pooling_avg_include_padding, + /// Average pooling exclude padding + pooling_avg_exclude_padding = dnnl_pooling_avg_exclude_padding, + /// RNN cell + vanilla_rnn = dnnl_vanilla_rnn, + /// LSTM cell + vanilla_lstm = dnnl_vanilla_lstm, + /// GRU cell + vanilla_gru = dnnl_vanilla_gru, + /// GRU cell with linear before reset. 
Differs from the vanilla GRU + /// in how the new memory gate is calculated: + /// \f$c_t = tanh(W_c*x_t + b_{c_x} + r_t*(U_c*h_{t-1}+b_{c_h})) \f$ + /// LRB GRU expects 4 bias tensors on input: + /// \f$[b_{u}, b_{r}, b_{c_x}, b_{c_h}]\f$ + lbr_gru = dnnl_lbr_gru, + /// AUGRU cell + vanilla_augru = dnnl_vanilla_augru, + /// AUGRU cell with linear before reset + lbr_augru = dnnl_lbr_augru, + /// Binary add + binary_add = dnnl_binary_add, + /// Binary mul + binary_mul = dnnl_binary_mul, + /// Binary max + binary_max = dnnl_binary_max, + /// Binary min + binary_min = dnnl_binary_min, + /// Binary div + binary_div = dnnl_binary_div, + /// Binary sub + binary_sub = dnnl_binary_sub, + /// Binary greater than or equal + binary_ge = dnnl_binary_ge, + /// Binary greater than + binary_gt = dnnl_binary_gt, + /// Binary less than or equal + binary_le = dnnl_binary_le, + /// Binary less than + binary_lt = dnnl_binary_lt, + /// Binary equal + binary_eq = dnnl_binary_eq, + /// Binary not equal + binary_ne = dnnl_binary_ne, + /// Binary select + binary_select = dnnl_binary_select, + /// Nearest Neighbor resampling method + resampling_nearest = dnnl_resampling_nearest, + /// Linear (Bilinear, Trilinear) resampling method + resampling_linear = dnnl_resampling_linear, + /// Reduction using max operation + reduction_max = dnnl_reduction_max, + /// Reduction using min operation + reduction_min = dnnl_reduction_min, + /// Reduction using sum operation + reduction_sum = dnnl_reduction_sum, + /// Reduction using mul operation + reduction_mul = dnnl_reduction_mul, + /// Reduction using mean operation + reduction_mean = dnnl_reduction_mean, + /// Reduction using norm_lp_max operation + reduction_norm_lp_max = dnnl_reduction_norm_lp_max, + /// Reduction using norm_lp_sum operation + reduction_norm_lp_sum = dnnl_reduction_norm_lp_sum, + /// Reduction using norm_lp_power_p_max operation + reduction_norm_lp_power_p_max = dnnl_reduction_norm_lp_power_p_max, + /// Reduction using 
norm_lp_power_p_sum operation + reduction_norm_lp_power_p_sum = dnnl_reduction_norm_lp_power_p_sum, + /// Softmax, numerically stable + softmax_accurate = dnnl_softmax_accurate, + /// LogSoftmax, numerically stable + softmax_log = dnnl_softmax_log, +}; + +/// Converts algorithm kind enum value from C++ API to C API type. +/// @param aalgorithm C++ API algorithm kind enum value. +/// @returns Corresponding C API algorithm kind enum value. +inline dnnl_alg_kind_t convert_to_c(algorithm aalgorithm) { + return static_cast(aalgorithm); +} + +/// @} dnnl_api_attributes + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Flags for normalization primitives. +enum class normalization_flags : unsigned { + /// Use no normalization flags. If specified, the library computes mean and + /// variance on forward propagation for training and inference, outputs them + /// on forward propagation for training, and computes the respective + /// derivatives on backward propagation. + none = dnnl_normalization_flags_none, + + /// Use global statistics. If specified, the library uses mean and + /// variance provided by the user as an input on forward propagation and + /// does not compute their derivatives on backward propagation. Otherwise, + /// the library computes mean and variance on forward propagation for + /// training and inference, outputs them on forward propagation for + /// training, and computes the respective derivatives on backward + /// propagation. + use_global_stats = dnnl_use_global_stats, + + /// Use scale parameter. If specified, the user is expected to pass scale as + /// input on forward propagation. On backward propagation of type + /// #dnnl::prop_kind::backward, the library computes its derivative. + use_scale = dnnl_use_scale, + + /// Use shift parameter. If specified, the user is expected to pass shift as + /// input on forward propagation. On backward propagation of type + /// #dnnl::prop_kind::backward, the library computes its derivative. 
+ use_shift = dnnl_use_shift, + + /// Fuse normalization with ReLU. On training, normalization will require + /// the workspace to implement backward propagation. On inference, the + /// workspace is not required and behavior is the same as when normalization + /// is fused with ReLU using the post-ops API. + fuse_norm_relu = dnnl_fuse_norm_relu, + + /// Fuse normalization with elementwise binary Add and then fuse with ReLU. + /// On training, normalization will require the workspace to implement + /// backward propagation. On inference, the workspace is not required. + fuse_norm_add_relu = dnnl_fuse_norm_add_relu, +}; + +/// Converts normalization flags enum value from C++ API to C API type. +/// @param flags C++ API normalization flags enum value. +/// @returns Corresponding C API normalization flags enum value. +inline dnnl_normalization_flags_t convert_to_c(normalization_flags flags) { + return static_cast(flags); +} + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_rnn +/// @{ + +/// RNN cell flags. +enum class rnn_flags : unsigned { + /// Undefined RNN flags + undef = dnnl_rnn_flags_undef, + /// Do not add weights gradient to existing diff_weights memory + diff_weights_overwrite = dnnl_rnn_flags_diff_weights_overwrite, +}; + +/// Converts RNN cell flags enum value from C++ API to C API type. +/// @param flags C++ API RNN cell flags enum value. +/// @returns Corresponding C API RNN cell flags enum value. +inline dnnl_rnn_flags_t convert_to_c(rnn_flags flags) { + return static_cast(flags); +} + +DNNL_DEFINE_BITMASK_OPS(normalization_flags) +DNNL_DEFINE_BITMASK_OPS(rnn_flags) + +/// A direction of RNN primitive execution +enum class rnn_direction { + /// Undefined RNN direction. + undef = dnnl_rnn_direction_undef, + /// Unidirectional execution of RNN primitive from left to right. + unidirectional_left2right = dnnl_unidirectional_left2right, + /// Unidirectional execution of RNN primitive from right to left. 
+ unidirectional_right2left = dnnl_unidirectional_right2left, + /// Bidirectional execution of RNN primitive with concatenation of the + /// results. + bidirectional_concat = dnnl_bidirectional_concat, + /// Bidirectional execution of RNN primitive with summation of the + /// results. + bidirectional_sum = dnnl_bidirectional_sum, +}; + +/// Converts RNN direction enum value from C++ API to C API type. +/// @param dir C++ API RNN direction enum value. +/// @returns Corresponding C API RNN direction enum value. +inline dnnl_rnn_direction_t convert_to_c(rnn_direction dir) { + return static_cast(dir); +} + +/// @} dnnl_api_rnn + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Primitive descriptor query specification. +/// +/// In general, queries are not used with the C++ API because most queries are +/// implemented as class members. +/// +/// See @ref dnnl_query_t for more information. +enum class query { + /// no query + undef = dnnl_query_undef, + + /// execution engine + engine = dnnl_query_engine, + /// primitive kind + primitive_kind = dnnl_query_primitive_kind, + + /// number of inputs expected + num_of_inputs_s32 = dnnl_query_num_of_inputs_s32, + /// number of outputs expected + num_of_outputs_s32 = dnnl_query_num_of_outputs_s32, + + /// runtime estimation (seconds), unimplemented + time_estimate_f64 = dnnl_query_time_estimate_f64, + /// memory required for scratchpad (bytes) + /// + /// @sa @ref dev_guide_attributes_scratchpad + memory_consumption_s64 = dnnl_query_memory_consumption_s64, + + /// scratchpad engine + /// + /// engine to be used for creating scratchpad memory + scratchpad_engine = dnnl_query_scratchpad_engine, + + /// reorder source engine + reorder_src_engine = dnnl_query_reorder_src_engine, + /// reorder destination engine + reorder_dst_engine = dnnl_query_reorder_dst_engine, + + /// implementation name + impl_info_str = dnnl_query_impl_info_str, + + /// propagation kind + prop_kind = dnnl_query_prop_kind, + + /// size of cache blob 
ID in bytes + cache_blob_id_size_s64 = dnnl_query_cache_blob_id_size_s64, + + /// cache blob ID (pointer to array) + cache_blob_id = dnnl_query_cache_blob_id, + + /// strides + strides = dnnl_query_strides, + /// dilations + dilations = dnnl_query_dilations, + /// left padding + padding_l = dnnl_query_padding_l, + /// right padding + padding_r = dnnl_query_padding_r, + /// epsilon + epsilon_f32 = dnnl_query_epsilon_f32, + /// flags + flags = dnnl_query_flags, + /// algorithm kind + alg_kind = dnnl_query_alg_kind, + /// alpha + alpha_f32 = dnnl_query_alpha_f32, + /// beta + beta_f32 = dnnl_query_beta_f32, + /// axis + axis_s32 = dnnl_query_axis_s32, + /// LRN parameter local size + local_size_s64 = dnnl_query_local_size_s64, + /// LRN parameter K + k_f32 = dnnl_query_k_f32, + /// Reduction parameter P + p_f32 = dnnl_query_p_f32, + /// Resampling parameter factors + factors = dnnl_query_factors, + /// RNN parameter cell kind + cell_kind = dnnl_query_cell_kind, + /// RNN parameter direction + direction = dnnl_query_direction, + /// RNN parameter activation kind + activation_kind = dnnl_query_activation_kind, + /// Pooling parameter kernel + kernel = dnnl_query_kernel, + /// Shuffle parameter group size + group_size_s64 = dnnl_query_group_size_s64, + + /// source memory desc + src_md = dnnl_query_src_md, + /// source gradient (diff) memory desc + diff_src_md = dnnl_query_diff_src_md, + /// weights memory descriptor desc + weights_md = dnnl_query_weights_md, + /// weights gradient (diff) memory desc + diff_weights_md = dnnl_query_diff_weights_md, + /// destination memory desc + dst_md = dnnl_query_dst_md, + /// destination gradient (diff) memory desc + diff_dst_md = dnnl_query_diff_dst_md, + /// workspace memory desc + workspace_md = dnnl_query_workspace_md, + /// scratchpad memory desc + scratchpad_md = dnnl_query_scratchpad_md, + /// memory desc of an execute argument + exec_arg_md = dnnl_query_exec_arg_md, + + /// number of dimensions + ndims_s32 = 
dnnl_query_ndims_s32, + /// vector of dimensions + dims = dnnl_query_dims, + /// data type + data_type = dnnl_query_data_type, + /// submemory offset + submemory_offset_s64 = dnnl_query_submemory_offset_s64, + /// vector of padded dimensions + padded_dims = dnnl_query_padded_dims, + /// vector of padded offsets + padded_offsets = dnnl_query_padded_offsets, + /// format kind + format_kind = dnnl_query_format_kind, + /// number of innermost blocks + inner_nblks_s32 = dnnl_query_inner_nblks_s32, + /// vector of sizes of the innermost blocks + inner_blks = dnnl_query_inner_blks, + /// vector of logical indices of the blocks + inner_idxs = dnnl_query_inner_idxs, +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Sparse encoding + sparse_encoding = dnnl_query_sparse_encoding, + /// Number of non-zero entries + nnz_s64 = dnnl_query_nnz_s64, + /// Number of buffers required for a memory descriptor + num_handles_s32 = dnnl_query_num_handles_s32, +#endif +}; + +/// Converts query enum value from C++ API to C API type. +/// @param aquery C++ API query enum value. +/// @returns Corresponding C API query enum value. +inline dnnl_query_t convert_to_c(query aquery) { + return static_cast(aquery); +} + +/// @} dnnl_api_primitives_common + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_memory Memory +/// +/// A container that describes and stores data. Memory objects can contain +/// data of various types and formats. There are two levels of abstraction: +/// +/// 1. **Memory descriptor** -- engine-agnostic logical description of data +/// (number of dimensions, dimension sizes, and data type), and, +/// optionally, the information about the physical format of data in +/// memory. If this information is not known yet, a memory descriptor can +/// be created with #dnnl::memory::format_tag::any. This allows +/// compute-intensive primitives to choose the best format for +/// computation. 
The user is responsible for reordering the data into the +/// chosen format when formats do not match. +/// +/// A memory descriptor can be initialized either by specifying dimensions +/// and a memory format tag or strides for each of them, or by +/// manipulating the dnnl_memory_desc_t structure directly. +/// +/// @warning +/// The latter approach requires understanding how the physical data +/// representation is mapped to the structure and is discouraged. This +/// topic is discussed in @ref dev_guide_understanding_memory_formats. +/// +/// The user can query the amount of memory required by a memory +/// descriptor using the #dnnl::memory::desc::get_size() function. The +/// size of data in general cannot be computed as the product of +/// dimensions multiplied by the size of the data type. So users are +/// required to use this function for better code portability. +/// +/// Two memory descriptors can be compared using the equality and +/// inequality operators. The comparison is especially useful when +/// checking whether it is necessary to reorder data from the user's data +/// format to a primitive's format. +/// +/// 2. **Memory object** -- an engine-specific object that handles the memory +/// buffer and its description (a memory descriptor). For the CPU engine or +/// with USM, the memory buffer handle is simply a pointer to @c void. The +/// memory buffer can be queried using #dnnl::memory::get_data_handle() and +/// set using #dnnl::memory::set_data_handle(). The underlying SYCL buffer, +/// when used, can be queried using #dnnl::sycl_interop::get_buffer and set +/// using #dnnl::sycl_interop::set_buffer. A memory object can also be +/// queried for the underlying memory descriptor and for its engine using +/// #dnnl::memory::get_desc() and dnnl::memory::get_engine(). 
+/// +/// Along with ordinary memory descriptors with all dimensions being positive, +/// the library supports *zero-volume* memory descriptors with one or more +/// dimensions set to zero. This is used to support the NumPy\* convention. +/// If a zero-volume memory is passed to a primitive, the primitive typically +/// does not perform any computations with this memory. For example: +/// +/// - A concatenation primitive would ignore all memory object with zeroes in +/// the concat dimension / axis. +/// +/// - A forward convolution with a source memory object with zero in the +/// minibatch dimension would always produce a destination memory object +/// with a zero in the minibatch dimension and perform no computations. +/// +/// - However, a forward convolution with a zero in one of the weights +/// dimensions is ill-defined and is considered to be an error by the +/// library because there is no clear definition of what the output values +/// should be. +/// +/// Memory buffer of a zero-volume memory is never accessed. +/// +/// @{ + +/// Memory object. +/// +/// A memory object encapsulates a handle to a memory buffer allocated on a +/// specific engine, tensor dimensions, data type, and memory format, which is +/// the way tensor indices map to offsets in linear memory space. Memory +/// objects are passed to primitives during execution. +struct memory : public handle { + using handle::handle; + + /// Integer type for representing dimension sizes and indices. + typedef dnnl_dim_t dim; + /// Vector of dimensions. Implementations are free to force a limit on the + /// vector's length. + typedef std::vector dims; + + /// Helper function that validates that an `std::vector` of dimensions can + /// be safely converted to the C API array ::dnnl_dims_t. Throws if + /// validation fails. + /// + /// @param v Vector of dimensions. + /// @param min_size Minimum expected size of the vector. 
+ template + static void validate_dims(const std::vector &v, int min_size = 0) { + validate_container_size( + v, "dimensions are invalid", min_size, DNNL_MAX_NDIMS); + } + + /// Data type specification. + enum class data_type { + /// Undefined data type (used for empty memory descriptors). + undef = dnnl_data_type_undef, + /// 4-bit float data type with 3-bit exponent and 0 bit mantissa. + f4_e3m0 = dnnl_f4_e3m0, + /// [MX-compliant 4-bit float data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 2-bit exponent and 1 bit mantissa. + f4_e2m1 = dnnl_f4_e2m1, + /// [MX-compliant 8-bit compliant scale data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 8-bit exponent. + e8m0 = dnnl_e8m0, + /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) + /// with a 5-bit exponent and a 2-bit mantissa. + f8_e5m2 = dnnl_f8_e5m2, + /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) + /// with a 4-bit exponent and a 3-bit mantissa. + f8_e4m3 = dnnl_f8_e4m3, + /// [16-bit/half-precision floating point](https://en.wikipedia.org/wiki/Half-precision_floating-point_format). + f16 = dnnl_f16, + /// non-standard + /// [16-bit floating point with 7-bit mantissa](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format). + bf16 = dnnl_bf16, + /// [32-bit/single-precision floating point](https://en.wikipedia.org/wiki/Single-precision_floating-point_format). + f32 = dnnl_f32, + //// [64-bit/double-precision floating point](https://en.wikipedia.org/wiki/Double-precision_floating-point_format). + f64 = dnnl_f64, + /// 32-bit signed integer. + s32 = dnnl_s32, + /// 8-bit signed integer. + s8 = dnnl_s8, + /// 8-bit unsigned integer. + u8 = dnnl_u8, + /// 4-bit signed integer. 
+ s4 = dnnl_s4, + /// 4-bit unsigned integer. + u4 = dnnl_u4, + }; + + /// Returns size of data type in bytes. + /// @returns The number of bytes occupied by data type. + static size_t data_type_size(data_type adata_type) { + return dnnl_data_type_size(convert_to_c(adata_type)); + } + + /// Memory format kind + enum class format_kind { + /// Undefined memory format kind, used for empty memory descriptors. + undef = dnnl_format_kind_undef, + /// A special format kind that indicates that the actual format will be + /// selected by a primitive automatically. + any = dnnl_format_kind_any, + /// A tensor in a generic format described by the stride and blocking + /// values in each dimension. + blocked = dnnl_blocked, +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Format kind for sparse tensors. + sparse = dnnl_format_kind_sparse, +#endif + /// A special format kind that indicates that tensor format is opaque. + opaque = dnnl_format_kind_opaque, + }; + +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Sparse encodings. + enum class sparse_encoding { + /// Undefined sparse encoding kind, used for empty memory descriptors. + undef = dnnl_sparse_encoding_undef, + /// Compressed Sparse Row (CSR) encoding. + csr = dnnl_csr, + /// An encoding that is used for an opaque storage schema for + /// tensors with unstructured sparsity. A memory descriptor with the + /// packed encoding cannot be used to create a memory object. It can + /// only be used to create a primitive descriptor to query the + /// actual memory descriptor (similar to the format tag `any`). + packed = dnnl_packed, + /// Coordinate Sparse (COO) encoding. + coo = dnnl_coo, + }; +#endif + + /// Memory format tag specification. + /// + /// Memory format tags can be further divided into two categories: + /// + /// - Domain-agnostic names, i.e. names that do not depend on the tensor + /// usage in the specific primitive. 
These names use letters from `a` + /// to `f` to denote logical dimensions and form the order in which the + /// dimensions are laid in memory. For example, + /// #dnnl::memory::format_tag::ab is used to denote a 2D tensor where the + /// second logical dimension (denoted as `b`) is the innermost, i.e. + /// has stride = 1, and the first logical dimension (`a`) is laid out in + /// memory with stride equal to the size of the second dimension. On the + /// other hand, #dnnl::memory::format_tag::ba is the transposed version + /// of the same tensor: the outermost dimension (`a`) becomes the + /// innermost one. + /// + /// - Domain-specific names, i.e. names that make sense only in the + /// context of a certain domain, such as CNN. These names are + /// aliases to the corresponding domain-agnostic tags and used mostly + /// for convenience. For example, #dnnl::memory::format_tag::nc + /// is used to denote 2D CNN activations tensor memory format, where + /// the channels dimension is the innermost one and the batch dimension + /// is the outermost one. Moreover, #dnnl::memory::format_tag::nc is + /// an alias for #dnnl::memory::format_tag::ab, because for + /// CNN primitives the logical dimensions of activations tensors come + /// in order: batch, channels, spatial. In other words, batch + /// corresponds to the first logical dimension (`a`), and channels + /// correspond to the second one (`b`). + /// + /// The following domain-specific notation applies to memory format tags: + /// - @c 'n' denotes the mini-batch dimension + /// - @c 'c' denotes a channels dimension + /// - When there are multiple channel dimensions (for example, + /// in convolution weights tensor), @c 'i' and @c 'o' denote dimensions + /// of input and output channels + /// - @c 'g' denotes a groups dimension for convolution weights + /// - @c 'd', @c 'h', and @c 'w' denote spatial depth, height, and width + /// respectively + /// + /// See @ref dnnl_format_tag_t for a detailed description. 
+ enum class format_tag { + /// Undefined memory format tag + undef = dnnl_format_tag_undef, + /// Placeholder memory format tag. Used to instruct the primitive to + /// select a format automatically. + any = dnnl_format_tag_any, + + /// plain 1D tensor + a = dnnl_a, + + /// plain 2D tensor + ab = dnnl_ab, + /// permuted 2D tensor + ba = dnnl_ba, + + /// plain 3D tensor + abc = dnnl_abc, + /// permuted 3D tensor + acb = dnnl_acb, + /// permuted 3D tensor + bac = dnnl_bac, + /// permuted 3D tensor + bca = dnnl_bca, + /// permuted 3D tensor + cba = dnnl_cba, + + /// plain 4D tensor + abcd = dnnl_abcd, + /// permuted 4D tensor + abdc = dnnl_abdc, + /// permuted 4D tensor + acbd = dnnl_acbd, + /// permuted 4D tensor + acdb = dnnl_acdb, + /// permuted 4D tensor + adbc = dnnl_adbc, + /// permuted 4D tensor + bacd = dnnl_bacd, + /// permuted 4D tensor + bcda = dnnl_bcda, + /// permuted 4D tensor + cdba = dnnl_cdba, + /// permuted 4D tensor + dcab = dnnl_dcab, + + /// plain 5D tensor + abcde = dnnl_abcde, + /// permuted 5D tensor + abdec = dnnl_abdec, + /// permuted 5D tensor + acbde = dnnl_acbde, + /// permuted 5D tensor + acdeb = dnnl_acdeb, + /// permuted 5D tensor + bacde = dnnl_bacde, + /// permuted 5D tensor + bcdea = dnnl_bcdea, + /// permuted 5D tensor + cdeba = dnnl_cdeba, + /// permuted 5D tensor + decab = dnnl_decab, + /// permuted 5D tensor + abced = dnnl_abced, + + /// plain 6D tensor + abcdef = dnnl_abcdef, + /// permuted 6D tensor + abdfce = dnnl_abdfce, + /// permuted 6D tensor + acbdef = dnnl_acbdef, + /// permuted 6D tensor + abdefc = dnnl_abdefc, + /// permuted 6D tensor + defcab = dnnl_defcab, + /// permuted 6D tensor + abcdfe = dnnl_abcdfe, + + /// plain 7D tensor + abcdefg = dnnl_abcdefg, + /// permuted 7D tensor + abcdegf = dnnl_abcdegf, + + /// plain 8D tensor + abcdefgh = dnnl_abcdefgh, + /// permuted 8D tensor + abcdefhg = dnnl_abcdefhg, + + /// plain 9D tensor + abcdefghi = dnnl_abcdefghi, + /// permuted 9D tensor + abcdefgih = dnnl_abcdefgih, + 
+ /// plain 10D tensor + abcdefghij = dnnl_abcdefghij, + /// permuted 10D tensor + abcdefghji = dnnl_abcdefghji, + + /// plain 11D tensor + abcdefghijk = dnnl_abcdefghijk, + /// permuted 11D tensor + abcdefghikj = dnnl_abcdefghikj, + + /// plain 12D tensor + abcdefghijkl = dnnl_abcdefghijkl, + /// permuted 12D tensor + abcdefghijlk = dnnl_abcdefghijlk, + + /// 1D tensor; an alias for #dnnl::memory::format_tag::a + x = a, + /// 2D CNN activations tensor; an alias for #dnnl::memory::format_tag::ab + nc = ab, + /// 2D CNN activations tensor; an alias for #dnnl::memory::format_tag::ba + cn = ba, + /// 2D RNN statistics tensor; an alias for #dnnl::memory::format_tag::ab + tn = ab, + /// 2D RNN statistics tensor; an alias for #dnnl::memory::format_tag::ba + nt = ba, + /// 3D CNN activations tensor; an alias for #dnnl::memory::format_tag::abc + ncw = abc, + /// 3D CNN activations tensor; an alias for #dnnl::memory::format_tag::acb + nwc = acb, + /// 4D CNN activations tensor; an alias for #dnnl::memory::format_tag::abcd + nchw = abcd, + /// 4D CNN activations tensor; an alias for #dnnl::memory::format_tag::acdb + nhwc = acdb, + /// 4D CNN activations tensor; an alias for #dnnl::memory::format_tag::bcda + chwn = bcda, + /// 5D CNN activations tensor; an alias for #dnnl::memory::format_tag::abcde + ncdhw = abcde, + /// 5D CNN activations tensor; an alias for #dnnl::memory::format_tag::acdeb + ndhwc = acdeb, + + /// 2D CNN weights tensor; an alias for #dnnl::memory::format_tag::ab + oi = ab, + /// 2D CNN weights tensor; an alias for #dnnl::memory::format_tag::ba + io = ba, + /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::abc + oiw = abc, + /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::acb + owi = acb, + /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::cba + wio = cba, + /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::bca + iwo = bca, + /// 4D CNN weights tensor; an alias for 
#dnnl::memory::format_tag::abcd + oihw = abcd, + /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::cdba + hwio = cdba, + /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::acdb + ohwi = acdb, + /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::bcda + ihwo = bcda, + /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::bacd + iohw = bacd, + /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::abcde + oidhw = abcde, + /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::cdeba + dhwio = cdeba, + /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::acdeb + odhwi = acdeb, + /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::bacde + iodhw = bacde, + /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::bcdea + idhwo = bcdea, + + /// 4D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abcd + goiw = abcd, + /// 4D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abdc + gowi = abdc, + /// 4D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::dcab + wigo = dcab, + /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abdec + gohwi = abdec, + /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abcde + goihw = abcde, + /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::decab + hwigo = decab, + /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::acbde + giohw = acbde, + /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abcdef + goidhw = abcdef, + /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::acbdef + giodhw = acbdef, + /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abdefc + godhwi = abdefc, + /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::defcab + 
dhwigo = defcab, + + /// 3D RNN data tensor in the format (seq_length, batch, input + /// channels); an alias for #dnnl::memory::format_tag::abc. + tnc = abc, + /// 3D RNN data tensor in the format (batch, seq_length, input + /// channels); an alias for #dnnl::memory::format_tag::bac. + ntc = bac, + /// 4D RNN states tensor in the format (num_layers, num_directions, + /// batch, state channels); an alias for #dnnl::memory::format_tag::abcd. + ldnc = abcd, + /// 5D RNN weights tensor in the format (num_layers, num_directions, + /// input_channels, num_gates, output_channels); + /// an alias for #dnnl::memory::format_tag::abcde. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. + ldigo = abcde, + /// 5D RNN weights tensor in the format (num_layers, num_directions, + /// num_gates, output_channels, input_channels); + /// an alias for #dnnl::memory::format_tag::abdec. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. + ldgoi = abdec, + /// 4D LSTM projection tensor in the format (num_layers, num_directions, + /// num_channels_in_hidden_state, num_channels_in_recurrent_projection); + /// an alias for #dnnl::memory::format_tag::abcd. + ldio = abcd, + /// 4D LSTM projection tensor in the format (num_layers, num_directions, + /// num_channels_in_recurrent_projection, num_channels_in_hidden_state); + /// an alias for #dnnl::memory::format_tag::abdc. + ldoi = abdc, + /// 4D RNN bias tensor in the format (num_layers, num_directions, + /// num_gates, output_channels); + /// an alias for #dnnl::memory::format_tag::abcd. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. 
+ ldgo = abcd, + + // Opaque blocked formats + + AB16b16a = dnnl_AB16b16a, + AB16b32a = dnnl_AB16b32a, + AB16b48a = dnnl_AB16b48a, + AB16b64a = dnnl_AB16b64a, + AB8b16a2b = dnnl_AB8b16a2b, + AB8b32a2b = dnnl_AB8b32a2b, + AB8b64a2b = dnnl_AB8b64a2b, + AB4b16a4b = dnnl_AB4b16a4b, + AB4b32a4b = dnnl_AB4b32a4b, + AB4b64a4b = dnnl_AB4b64a4b, + AB16b16a4b = dnnl_AB16b16a4b, + AB16b32a4b = dnnl_AB16b32a4b, + AB16b48a4b = dnnl_AB16b48a4b, + AB16b64a4b = dnnl_AB16b64a4b, + AB16b16a2b = dnnl_AB16b16a2b, + AB16b32a2b = dnnl_AB16b32a2b, + AB16b48a2b = dnnl_AB16b48a2b, + AB16b64a2b = dnnl_AB16b64a2b, + Ab4a = dnnl_Ab4a, + Ab8a = dnnl_Ab8a, + Ab32a = dnnl_Ab32a, + Abc16a = dnnl_Abc16a, + ABc16a16b = dnnl_ABc16a16b, + ABc4a4b = dnnl_ABc4a4b, + aBc16b = dnnl_aBc16b, + aBc32b = dnnl_aBc32b, + ABc16b16a = dnnl_ABc16b16a, + AcB16b16a = dnnl_AcB16b16a, + ABc16b32a = dnnl_ABc16b32a, + AcB16b32a = dnnl_AcB16b32a, + ABc16b48a = dnnl_ABc16b48a, + AcB16b48a = dnnl_AcB16b48a, + ABc16b64a = dnnl_ABc16b64a, + AcB16b64a = dnnl_AcB16b64a, + Abc4a = dnnl_Abc4a, + aBc4b = dnnl_aBc4b, + ABc4b16a4b = dnnl_ABc4b16a4b, + AcB4b16a4b = dnnl_AcB4b16a4b, + ABc4b32a4b = dnnl_ABc4b32a4b, + AcB4b32a4b = dnnl_AcB4b32a4b, + ABc4b64a4b = dnnl_ABc4b64a4b, + AcB4b64a4b = dnnl_AcB4b64a4b, + ABc2b8a4b = dnnl_ABc2b8a4b, + ABc16a16b2a = dnnl_ABc16a16b2a, + ABc16b16a4b = dnnl_ABc16b16a4b, + ABc16b32a4b = dnnl_ABc16b32a4b, + ABc16b48a4b = dnnl_ABc16b48a4b, + ABc16b64a4b = dnnl_ABc16b64a4b, + ABc16b16a2b = dnnl_ABc16b16a2b, + ABc16b32a2b = dnnl_ABc16b32a2b, + ABc16b48a2b = dnnl_ABc16b48a2b, + ABc16b64a2b = dnnl_ABc16b64a2b, + ABc4b4a = dnnl_ABc4b4a, + ABc8a16b2a = dnnl_ABc8a16b2a, + ABc8a8b = dnnl_ABc8a8b, + ABc8a4b = dnnl_ABc8a4b, + aBc8b = dnnl_aBc8b, + ABc8b16a2b = dnnl_ABc8b16a2b, + AcB8b16a2b = dnnl_AcB8b16a2b, + ABc8b32a2b = dnnl_ABc8b32a2b, + AcB8b32a2b = dnnl_AcB8b32a2b, + ABc8b64a2b = dnnl_ABc8b64a2b, + AcB8b64a2b = dnnl_AcB8b64a2b, + ABc8b8a = dnnl_ABc8b8a, + AcB8b8a = dnnl_AcB8b8a, + Abcd8a = dnnl_Abcd8a, + 
Abcd16a = dnnl_Abcd16a, + Abcd32a = dnnl_Abcd32a, + ABcd16a16b = dnnl_ABcd16a16b, + aBcd16b = dnnl_aBcd16b, + aBcd32b = dnnl_aBcd32b, + ABcd16b16a = dnnl_ABcd16b16a, + AcdB16b16a = dnnl_AcdB16b16a, + ABcd16b32a = dnnl_ABcd16b32a, + AcdB16b32a = dnnl_AcdB16b32a, + ABcd16b48a = dnnl_ABcd16b48a, + AcdB16b48a = dnnl_AcdB16b48a, + ABcd16b64a = dnnl_ABcd16b64a, + AcdB16b64a = dnnl_AcdB16b64a, + aBCd16b16c = dnnl_aBCd16b16c, + aBCd16c16b = dnnl_aBCd16c16b, + Abcd4a = dnnl_Abcd4a, + aBcd4b = dnnl_aBcd4b, + ABcd4b16a4b = dnnl_ABcd4b16a4b, + AcdB4b16a4b = dnnl_AcdB4b16a4b, + ABcd4b32a4b = dnnl_ABcd4b32a4b, + AcdB4b32a4b = dnnl_AcdB4b32a4b, + ABcd4b64a4b = dnnl_ABcd4b64a4b, + AcdB4b64a4b = dnnl_AcdB4b64a4b, + ABcd2b8a4b = dnnl_ABcd2b8a4b, + ABcd4b4a = dnnl_ABcd4b4a, + ABcd4a4b = dnnl_ABcd4a4b, + aBCd4c16b4c = dnnl_aBCd4c16b4c, + aBCd2c8b4c = dnnl_aBCd2c8b4c, + ABcd16a16b2a = dnnl_ABcd16a16b2a, + ABcd16b16a4b = dnnl_ABcd16b16a4b, + ABcd16b32a4b = dnnl_ABcd16b32a4b, + ABcd16b48a4b = dnnl_ABcd16b48a4b, + ABcd16b64a4b = dnnl_ABcd16b64a4b, + ABcd16b16a2b = dnnl_ABcd16b16a2b, + ABcd16b32a2b = dnnl_ABcd16b32a2b, + ABcd16b48a2b = dnnl_ABcd16b48a2b, + ABcd16b64a2b = dnnl_ABcd16b64a2b, + aBCd16b16c2b = dnnl_aBCd16b16c2b, + aBCd16c16b4c = dnnl_aBCd16c16b4c, + aBCd16c16b2c = dnnl_aBCd16c16b2c, + aBCd4c4b = dnnl_aBCd4c4b, + aBCd4b4c = dnnl_aBCd4b4c, + ABcd8a16b2a = dnnl_ABcd8a16b2a, + ABcd8a8b = dnnl_ABcd8a8b, + ABcd8a4b = dnnl_ABcd8a4b, + ABcd8a2b = dnnl_ABcd8a2b, + /// 4D tensor blocked by 2nd dimension with block size 8 + aBcd8b = dnnl_aBcd8b, + ABcd8b16a2b = dnnl_ABcd8b16a2b, + AcdB8b16a2b = dnnl_AcdB8b16a2b, + ABcd8b32a2b = dnnl_ABcd8b32a2b, + AcdB8b32a2b = dnnl_AcdB8b32a2b, + ABcd8b64a2b = dnnl_ABcd8b64a2b, + AcdB8b64a2b = dnnl_AcdB8b64a2b, + aBCd8b16c2b = dnnl_aBCd8b16c2b, + /// 4D tensor blocked by 1st and 2nd dimension with block size 8 + ABcd8b8a = dnnl_ABcd8b8a, + AcdB8b8a = dnnl_AcdB8b8a, + aBCd8b8c = dnnl_aBCd8b8c, + aBCd8b4c = dnnl_aBCd8b4c, + aBCd8c16b2c = dnnl_aBCd8c16b2c, 
+ aBCd8c8b = dnnl_aBCd8c8b, + Abcde16a = dnnl_Abcde16a, + Abcde32a = dnnl_Abcde32a, + ABcde16a16b = dnnl_ABcde16a16b, + aBcde16b = dnnl_aBcde16b, + aBcde32b = dnnl_aBcde32b, + ABcde16b16a = dnnl_ABcde16b16a, + AcdeB16b16a = dnnl_AcdeB16b16a, + ABcde16b32a = dnnl_ABcde16b32a, + AcdeB16b32a = dnnl_AcdeB16b32a, + ABcde16b48a = dnnl_ABcde16b48a, + AcdeB16b48a = dnnl_AcdeB16b48a, + ABcde16b64a = dnnl_ABcde16b64a, + AcdeB16b64a = dnnl_AcdeB16b64a, + aBCde16b16c = dnnl_aBCde16b16c, + aBCde16c16b = dnnl_aBCde16c16b, + aBCde2c8b4c = dnnl_aBCde2c8b4c, + Abcde4a = dnnl_Abcde4a, + aBcde4b = dnnl_aBcde4b, + ABcde4b4a = dnnl_ABcde4b4a, + ABcde4a4b = dnnl_ABcde4a4b, + aBCde4b4c = dnnl_aBCde4b4c, + aBCde4c16b4c = dnnl_aBCde4c16b4c, + aBCde16b16c2b = dnnl_aBCde16b16c2b, + aBCde16c16b4c = dnnl_aBCde16c16b4c, + aBCde16c16b2c = dnnl_aBCde16c16b2c, + aBCdef16c16b2c = dnnl_aBCdef16c16b2c, + aBCde4c4b = dnnl_aBCde4c4b, + Abcde8a = dnnl_Abcde8a, + ABcde8a8b = dnnl_ABcde8a8b, + ABcde8a4b = dnnl_ABcde8a4b, + aBcde8b = dnnl_aBcde8b, + ABcde8b16a2b = dnnl_ABcde8b16a2b, + AcdeB8b16a2b = dnnl_AcdeB8b16a2b, + ABcde8b32a2b = dnnl_ABcde8b32a2b, + AcdeB8b32a2b = dnnl_AcdeB8b32a2b, + ABcde8b64a2b = dnnl_ABcde8b64a2b, + AcdeB8b64a2b = dnnl_AcdeB8b64a2b, + ABcde4b16a4b = dnnl_ABcde4b16a4b, + AcdeB4b16a4b = dnnl_AcdeB4b16a4b, + ABcde4b32a4b = dnnl_ABcde4b32a4b, + AcdeB4b32a4b = dnnl_AcdeB4b32a4b, + ABcde4b64a4b = dnnl_ABcde4b64a4b, + AcdeB4b64a4b = dnnl_AcdeB4b64a4b, + ABcde16b16a4b = dnnl_ABcde16b16a4b, + ABcde16b32a4b = dnnl_ABcde16b32a4b, + ABcde16b48a4b = dnnl_ABcde16b48a4b, + ABcde16b64a4b = dnnl_ABcde16b64a4b, + ABcde16b16a2b = dnnl_ABcde16b16a2b, + ABcde16b32a2b = dnnl_ABcde16b32a2b, + ABcde16b48a2b = dnnl_ABcde16b48a2b, + ABcde16b64a2b = dnnl_ABcde16b64a2b, + ABcde2b8a4b = dnnl_ABcde2b8a4b, + aBCde8b16c2b = dnnl_aBCde8b16c2b, + ABcde8b8a = dnnl_ABcde8b8a, + AcdeB8b8a = dnnl_AcdeB8b8a, + aBCde8b8c = dnnl_aBCde8b8c, + aBCde8b4c = dnnl_aBCde8b4c, + ABcd4a8b8a4b = dnnl_ABcd4a8b8a4b, + ABcd2a8b8a2b 
= dnnl_ABcd2a8b8a2b, + aBCde4b8c8b4c = dnnl_aBCde4b8c8b4c, + aBCde2b8c8b2c = dnnl_aBCde2b8c8b2c, + aBCde8c16b2c = dnnl_aBCde8c16b2c, + aBCde8c8b = dnnl_aBCde8c8b, + aBcdef16b = dnnl_aBcdef16b, + aBCdef16b16c = dnnl_aBCdef16b16c, + aBCdef16c16b = dnnl_aBCdef16c16b, + aBcdef4b = dnnl_aBcdef4b, + aBCdef2c8b4c = dnnl_aBCdef2c8b4c, + aBCdef4c4b = dnnl_aBCdef4c4b, + aBCdef4b4c = dnnl_aBCdef4b4c, + aBCdef8b8c = dnnl_aBCdef8b8c, + aBCdef8b4c = dnnl_aBCdef8b4c, + aBCdef8c16b2c = dnnl_aBCdef8c16b2c, + aBCdef4c16b4c = dnnl_aBCdef4c16b4c, + aBCdef8c8b = dnnl_aBCdef8c8b, + aBdc16b = dnnl_aBdc16b, + aBdc4b = dnnl_aBdc4b, + aBdc8b = dnnl_aBdc8b, + aBdC8b2c = dnnl_aBdC8b2c, + aBdC8b4c = dnnl_aBdC8b4c, + aBdec16b = dnnl_aBdec16b, + aBdec4b = dnnl_aBdec4b, + aBdec8b = dnnl_aBdec8b, + aBdeC8b2c = dnnl_aBdeC8b2c, + aBdeC8b4c = dnnl_aBdeC8b4c, + aBdefc16b = dnnl_aBdefc16b, + aCBdef16c16b = dnnl_aCBdef16c16b, + aCBdef8b8c = dnnl_aCBdef8b8c, + aCBdef16b16c = dnnl_aCBdef16b16c, + aBdefc4b = dnnl_aBdefc4b, + aBdefc8b = dnnl_aBdefc8b, + aBdefC8b2c = dnnl_aBdefC8b2c, + aBdefC8b4c = dnnl_aBdefC8b4c, + Acb16a = dnnl_Acb16a, + Acb4a = dnnl_Acb4a, + Acb8a = dnnl_Acb8a, + AcB8a2b = dnnl_AcB8a2b, + AcB8a4b = dnnl_AcB8a4b, + aCBd8b8c = dnnl_aCBd8b8c, + aCBd16b16c = dnnl_aCBd16b16c, + aCBd16c16b = dnnl_aCBd16c16b, + aCBde8b8c = dnnl_aCBde8b8c, + aCBde16b16c = dnnl_aCBde16b16c, + aCBde16c16b = dnnl_aCBde16c16b, + Acdb16a = dnnl_Acdb16a, + Acdb4a = dnnl_Acdb4a, + Acdb8a = dnnl_Acdb8a, + AcdB8a2b = dnnl_AcdB8a2b, + AcdB8a4b = dnnl_AcdB8a4b, + Acdeb16a = dnnl_Acdeb16a, + Acdeb4a = dnnl_Acdeb4a, + Acdeb8a = dnnl_Acdeb8a, + AcdeB8a2b = dnnl_AcdeB8a2b, + AcdeB8a4b = dnnl_AcdeB8a4b, + BAc8a8b = dnnl_BAc8a8b, + BAc16a16b = dnnl_BAc16a16b, + BAc16b16a = dnnl_BAc16b16a, + BAcd8a8b = dnnl_BAcd8a8b, + BAcd16a16b = dnnl_BAcd16a16b, + BAcd16b16a = dnnl_BAcd16b16a, + ABcd32a32b = dnnl_ABcd32a32b, + BAcde16b16a = dnnl_BAcde16b16a, + BAcde8a8b = dnnl_BAcde8a8b, + BAcde16a16b = dnnl_BAcde16a16b, + aBdec32b = 
dnnl_aBdec32b, + Abcdef16a = dnnl_Abcdef16a, + Abcdef32a = dnnl_Abcdef32a, + Acdb32a = dnnl_Acdb32a, + aBCd2b4c2b = dnnl_aBCd2b4c2b, + aBCde2b4c2b = dnnl_aBCde2b4c2b, + aBCdef2b4c2b = dnnl_aBCdef2b4c2b, + aBCd2c4b2c = dnnl_aBCd2c4b2c, + aBCde2c4b2c = dnnl_aBCde2c4b2c, + aBCdef2c4b2c = dnnl_aBCdef2c4b2c, + aBCd4b8c2b = dnnl_aBCd4b8c2b, + aBCde4b8c2b = dnnl_aBCde4b8c2b, + aBCdef4b8c2b = dnnl_aBCdef4b8c2b, + aBCd4c8b2c = dnnl_aBCd4c8b2c, + aBCde4c8b2c = dnnl_aBCde4c8b2c, + aBCdef4c8b2c = dnnl_aBCdef4c8b2c, + AB32a32b8a4b = dnnl_AB32a32b8a4b, + AB32a32b8a2b = dnnl_AB32a32b8a2b, + AB8a4b = dnnl_AB8a4b, + AB8a2b = dnnl_AB8a2b, + abDc16d = dnnl_abDc16d, + abDc32d = dnnl_abDc32d, + abDC16d4c = dnnl_abDC16d4c, + abDC32d4c = dnnl_abDC32d4c, + abCd32c = dnnl_abCd32c, + abdEc16e = dnnl_abdEc16e, + abdEc32e = dnnl_abdEc32e, + abdEC16e4c = dnnl_abdEC16e4c, + abdEC32e2c = dnnl_abdEC32e2c, + abdEC32e4c = dnnl_abdEC32e4c, + abdCe16c = dnnl_abdCe16c, + abdCe32c = dnnl_abdCe32c, + abdCE32c2e = dnnl_abdCE32c2e, + aBCdef16c16b4c = dnnl_aBCdef16c16b4c, + aBdC16b4c = dnnl_aBdC16b4c, + aBdeC16b4c = dnnl_aBdeC16b4c, + AcB16a4b = dnnl_AcB16a4b, + AcdB16a2b = dnnl_AcdB16a2b, + aBdefC16b4c = dnnl_aBdefC16b4c, + AcdeB16a4b = dnnl_AcdeB16a4b, + + Acb32a = dnnl_Acb32a, + AcB32a2b = dnnl_AcB32a2b, + AcB32a4b = dnnl_AcB32a4b, + Acb48a = dnnl_Acb48a, + AcB48a2b = dnnl_AcB48a2b, + AcB48a4b = dnnl_AcB48a4b, + Acb64a = dnnl_Acb64a, + AcB64a2b = dnnl_AcB64a2b, + AcB64a4b = dnnl_AcB64a4b, + cBa2b = dnnl_cBa2b, + cBa4b = dnnl_cBa4b, + aBdc32b = dnnl_aBdc32b, + aBdC32b2c = dnnl_aBdC32b2c, + aBdC32b4c = dnnl_aBdC32b4c, + aBdc48b = dnnl_aBdc48b, + aBdC48b2c = dnnl_aBdC48b2c, + aBdC48b4c = dnnl_aBdC48b4c, + aBdc64b = dnnl_aBdc64b, + aBdC64b2c = dnnl_aBdC64b2c, + aBdC64b4c = dnnl_aBdC64b4c, + adcb = dnnl_adcb, + adCb2c = dnnl_adCb2c, + adCb4c = dnnl_adCb4c, + AcdB32a2b = dnnl_AcdB32a2b, + AcdB32a4b = dnnl_AcdB32a4b, + Acdb48a = dnnl_Acdb48a, + AcdB48a2b = dnnl_AcdB48a2b, + AcdB48a4b = dnnl_AcdB48a4b, + 
Acdb64a = dnnl_Acdb64a, + AcdB64a2b = dnnl_AcdB64a2b, + AcdB64a4b = dnnl_AcdB64a4b, + cdBa2b = dnnl_cdBa2b, + cdBa4b = dnnl_cdBa4b, + aBdeC32b2c = dnnl_aBdeC32b2c, + aBdeC32b4c = dnnl_aBdeC32b4c, + aBdec48b = dnnl_aBdec48b, + aBdeC48b2c = dnnl_aBdeC48b2c, + aBdeC48b4c = dnnl_aBdeC48b4c, + aBdec64b = dnnl_aBdec64b, + aBdeC64b2c = dnnl_aBdeC64b2c, + aBdeC64b4c = dnnl_aBdeC64b4c, + adecb = dnnl_adecb, + adeCb2c = dnnl_adeCb2c, + adeCb4c = dnnl_adeCb4c, + Acdeb32a = dnnl_Acdeb32a, + AcdeB32a2b = dnnl_AcdeB32a2b, + AcdeB32a4b = dnnl_AcdeB32a4b, + Acdeb48a = dnnl_Acdeb48a, + AcdeB48a2b = dnnl_AcdeB48a2b, + AcdeB48a4b = dnnl_AcdeB48a4b, + Acdeb64a = dnnl_Acdeb64a, + AcdeB64a2b = dnnl_AcdeB64a2b, + AcdeB64a4b = dnnl_AcdeB64a4b, + cdeBa2b = dnnl_cdeBa2b, + cdeBa4b = dnnl_cdeBa4b, + aBdefc32b = dnnl_aBdefc32b, + aBdefC32b2c = dnnl_aBdefC32b2c, + aBdefC32b4c = dnnl_aBdefC32b4c, + aBdefc48b = dnnl_aBdefc48b, + aBdefC48b2c = dnnl_aBdefC48b2c, + aBdefC48b4c = dnnl_aBdefC48b4c, + aBdefc64b = dnnl_aBdefc64b, + aBdefC64b2c = dnnl_aBdefC64b2c, + aBdefC64b4c = dnnl_aBdefC64b4c, + adefcb = dnnl_adefcb, + adefCb2c = dnnl_adefCb2c, + adefCb4c = dnnl_adefCb4c, + ABc32a32b = dnnl_ABc32a32b, + BAc8a16b2a = dnnl_BAc8a16b2a, + BAcd8a16b2a = dnnl_BAcd8a16b2a, + ABcde8a16b2a = dnnl_ABcde8a16b2a, + aCBd8b16c2b = dnnl_aCBd8b16c2b, + BAcde8a16b2a = dnnl_BAcde8a16b2a, + aCBde8b16c2b = dnnl_aCBde8b16c2b, + ABcde32a32b = dnnl_ABcde32a32b, + ABc4a8b8a4b = dnnl_ABc4a8b8a4b, + ABcde4a8b8a4b = dnnl_ABcde4a8b8a4b, + BAc4b8a8b4a = dnnl_BAc4b8a8b4a, + BAcd4b8a8b4a = dnnl_BAcd4b8a8b4a, + BAcde4b8a8b4a = dnnl_BAcde4b8a8b4a, + aBCd4b8c8b4c = dnnl_aBCd4b8c8b4c, + aBCdef4b8c8b4c = dnnl_aBCdef4b8c8b4c, + aBCdef8b16c2b = dnnl_aBCdef8b16c2b, + aCBdef8b16c2b = dnnl_aCBdef8b16c2b, + aBdC16b2c = dnnl_aBdC16b2c, + aBdeC16b2c = dnnl_aBdeC16b2c, + aBdefC16b2c = dnnl_aBdefC16b2c, + aBedc16b = dnnl_aBedc16b, + AcB16a2b = dnnl_AcB16a2b, + AcdB16a4b = dnnl_AcdB16a4b, + AcdeB16a2b = dnnl_AcdeB16a2b, + Adcb16a = dnnl_Adcb16a, 
+ aCBd4c8b8c4b = dnnl_aCBd4c8b8c4b, + aCBde4c8b8c4b = dnnl_aCBde4c8b8c4b, + aCBdef4c8b8c4b = dnnl_aCBdef4c8b8c4b, + ABc32a16b = dnnl_ABc32a16b, + ABcd16a32b = dnnl_ABcd16a32b, + ABcd32a16b = dnnl_ABcd32a16b, + ABcde32a16b = dnnl_ABcde32a16b, + AB48a16b = dnnl_AB48a16b, + AB48a32b = dnnl_AB48a32b, + ABc40a16b = dnnl_ABc40a16b, + ABc40a32b = dnnl_ABc40a32b, + aBC48b16c = dnnl_aBC48b16c, + aBC48b32c = dnnl_aBC48b32c, + ABcd40a16b = dnnl_ABcd40a16b, + ABcd40a32b = dnnl_ABcd40a32b, + BA16a16b = dnnl_BA16a16b, + BA16a32b = dnnl_BA16a32b, + BA16a48b = dnnl_BA16a48b, + BA16a64b = dnnl_BA16a64b, + BA16a16b2a = dnnl_BA16a16b2a, + BA16a32b2a = dnnl_BA16a32b2a, + BA16a48b2a = dnnl_BA16a48b2a, + BA16a64b2a = dnnl_BA16a64b2a, + BA16a16b4a = dnnl_BA16a16b4a, + BA16a32b4a = dnnl_BA16a32b4a, + BA16a48b4a = dnnl_BA16a48b4a, + BA16a64b4a = dnnl_BA16a64b4a, + decbA16a = dnnl_decbA16a, + decbA8a = dnnl_decbA8a, + defcbA16a = dnnl_defcbA16a, + defcbA8a = dnnl_defcbA8a, + aCB16b16c = dnnl_aCB16b16c, + aCB16b32c = dnnl_aCB16b32c, + aCB16b48c = dnnl_aCB16b48c, + aCB16b64c = dnnl_aCB16b64c, + aCB16b16c2b = dnnl_aCB16b16c2b, + aCB16b32c2b = dnnl_aCB16b32c2b, + aCB16b48c2b = dnnl_aCB16b48c2b, + aCB16b64c2b = dnnl_aCB16b64c2b, + aCB16b16c4b = dnnl_aCB16b16c4b, + aCB16b32c4b = dnnl_aCB16b32c4b, + aCB16b48c4b = dnnl_aCB16b48c4b, + aCB16b64c4b = dnnl_aCB16b64c4b, + Acb24a = dnnl_Acb24a, + Acdb24a = dnnl_Acdb24a, + Acdeb24a = dnnl_Acdeb24a, + aBdc24b = dnnl_aBdc24b, + aBdec24b = dnnl_aBdec24b, + aBdefc24b = dnnl_aBdefc24b, + AcB24a2b = dnnl_AcB24a2b, + AcdB24a2b = dnnl_AcdB24a2b, + AcdeB24a2b = dnnl_AcdeB24a2b, + aBdC24b2c = dnnl_aBdC24b2c, + aBdeC24b2c = dnnl_aBdeC24b2c, + aBdefC24b2c = dnnl_aBdefC24b2c, + AcB24a4b = dnnl_AcB24a4b, + AcdB24a4b = dnnl_AcdB24a4b, + AcdeB24a4b = dnnl_AcdeB24a4b, + aBdC24b4c = dnnl_aBdC24b4c, + aBdeC24b4c = dnnl_aBdeC24b4c, + aBdefC24b4c = dnnl_aBdefC24b4c, + AB8b32a = dnnl_AB8b32a, + ABc8b32a = dnnl_ABc8b32a, + AcB8b32a = dnnl_AcB8b32a, + ABcd8b32a = dnnl_ABcd8b32a, 
+ AcdB8b32a = dnnl_AcdB8b32a, + ABcde8b32a = dnnl_ABcde8b32a, + AcdeB8b32a = dnnl_AcdeB8b32a, + AB8b24a = dnnl_AB8b24a, + ABc8b24a = dnnl_ABc8b24a, + AcB8b24a = dnnl_AcB8b24a, + ABcd8b24a = dnnl_ABcd8b24a, + AcdB8b24a = dnnl_AcdB8b24a, + ABcde8b24a = dnnl_ABcde8b24a, + AcdeB8b24a = dnnl_AcdeB8b24a, + AB8b16a = dnnl_AB8b16a, + ABc8b16a = dnnl_ABc8b16a, + AcB8b16a = dnnl_AcB8b16a, + ABcd8b16a = dnnl_ABcd8b16a, + AcdB8b16a = dnnl_AcdB8b16a, + ABcde8b16a = dnnl_ABcde8b16a, + AcdeB8b16a = dnnl_AcdeB8b16a, + AB8b8a = dnnl_AB8b8a, + + format_tag_last = dnnl_format_tag_last, + + nCdhw16c = dnnl_nCdhw16c, + nCdhw4c = dnnl_nCdhw4c, + nCdhw8c = dnnl_nCdhw8c, + nChw16c = dnnl_nChw16c, + nChw4c = dnnl_nChw4c, + nChw8c = dnnl_nChw8c, + nCw16c = dnnl_nCw16c, + nCw4c = dnnl_nCw4c, + nCw8c = dnnl_nCw8c, + NCw16n16c = dnnl_NCw16n16c, + NChw16n16c = dnnl_NChw16n16c, + NCdhw16n16c = dnnl_NCdhw16n16c, + NCdhw32n32c = dnnl_NCdhw32n32c, + NChw32n32c = dnnl_NChw32n32c, + IOhw16i16o = dnnl_IOhw16i16o, + OI16i16o = dnnl_OI16i16o, + OI16i32o = dnnl_OI16i32o, + OI16i48o = dnnl_OI16i48o, + OI16i64o = dnnl_OI16i64o, + OI8i16o2i = dnnl_OI8i16o2i, + OI8i32o2i = dnnl_OI8i32o2i, + OI8i64o2i = dnnl_OI8i64o2i, + OI4i8o4i = dnnl_OI4i8o4i, + OI4i16o4i = dnnl_OI4i16o4i, + OI4i24o4i = dnnl_OI4i24o4i, + OI4i32o4i = dnnl_OI4i32o4i, + OI4i64o4i = dnnl_OI4i64o4i, + Ohwi32o = dnnl_Ohwi32o, + IOdhw16i16o = dnnl_IOdhw16i16o, + gIOhw16i16o = dnnl_gIOhw16i16o, + gOhwi32o = dnnl_gOhwi32o, + Goidhw16g = dnnl_Goidhw16g, + IOw8o8i = dnnl_IOw8o8i, + IOw16o16i = dnnl_IOw16o16i, + OIw16i16o = dnnl_OIw16i16o, + OwI16i16o = dnnl_OwI16i16o, + OIw16i32o = dnnl_OIw16i32o, + OwI16i32o = dnnl_OwI16i32o, + OIw16i48o = dnnl_OIw16i48o, + OwI16i48o = dnnl_OwI16i48o, + OIw16i64o = dnnl_OIw16i64o, + OwI16i64o = dnnl_OwI16i64o, + IOw16i16o = dnnl_IOw16i16o, + gIOw16i16o = dnnl_gIOw16i16o, + OIw16o16i = dnnl_OIw16o16i, + Oiw16o = dnnl_Oiw16o, + OIw4i8o4i = dnnl_OIw4i8o4i, + OwI4i8o4i = dnnl_OwI4i8o4i, + OIw4i16o4i = dnnl_OIw4i16o4i, + 
OwI4i16o4i = dnnl_OwI4i16o4i, + OIw4i24o4i = dnnl_OIw4i24o4i, + OwI4i24o4i = dnnl_OwI4i24o4i, + OIw4i32o4i = dnnl_OIw4i32o4i, + OwI4i32o4i = dnnl_OwI4i32o4i, + OIw4i64o4i = dnnl_OIw4i64o4i, + OwI4i64o4i = dnnl_OwI4i64o4i, + OIw2i8o4i = dnnl_OIw2i8o4i, + OIw4i4o = dnnl_OIw4i4o, + OIw4o4i = dnnl_OIw4o4i, + Oiw4o = dnnl_Oiw4o, + OIw8i16o2i = dnnl_OIw8i16o2i, + OwI8i16o2i = dnnl_OwI8i16o2i, + OIw8i32o2i = dnnl_OIw8i32o2i, + OwI8i32o2i = dnnl_OwI8i32o2i, + OIw8i64o2i = dnnl_OIw8i64o2i, + OwI8i64o2i = dnnl_OwI8i64o2i, + OIw8i8o = dnnl_OIw8i8o, + OwI8i8o = dnnl_OwI8i8o, + OIw8o16i2o = dnnl_OIw8o16i2o, + OIw8o8i = dnnl_OIw8o8i, + OIw8o4i = dnnl_OIw8o4i, + OIw16i16o4i = dnnl_OIw16i16o4i, + OIw16i32o4i = dnnl_OIw16i32o4i, + OIw16i48o4i = dnnl_OIw16i48o4i, + OIw16i64o4i = dnnl_OIw16i64o4i, + OIw16i16o2i = dnnl_OIw16i16o2i, + OIw16i32o2i = dnnl_OIw16i32o2i, + OIw16i48o2i = dnnl_OIw16i48o2i, + OIw16i64o2i = dnnl_OIw16i64o2i, + OIw16o16i2o = dnnl_OIw16o16i2o, + Owi16o = dnnl_Owi16o, + OwI16o2i = dnnl_OwI16o2i, + Iwo16i = dnnl_Iwo16i, + IwO16i2o = dnnl_IwO16i2o, + IwO16i4o = dnnl_IwO16i4o, + Owi4o = dnnl_Owi4o, + Owi8o = dnnl_Owi8o, + OwI8o2i = dnnl_OwI8o2i, + OwI8o4i = dnnl_OwI8o4i, + IOhw8o8i = dnnl_IOhw8o8i, + IOhw16o16i = dnnl_IOhw16o16i, + Ohwi16o = dnnl_Ohwi16o, + OhwI16o2i = dnnl_OhwI16o2i, + Ihwo16i = dnnl_Ihwo16i, + IhwO16i2o = dnnl_IhwO16i2o, + IhwO16i4o = dnnl_IhwO16i4o, + Ohwi4o = dnnl_Ohwi4o, + Ohwi8o = dnnl_Ohwi8o, + OhwI8o2i = dnnl_OhwI8o2i, + OhwI8o4i = dnnl_OhwI8o4i, + OIhw16i16o = dnnl_OIhw16i16o, + OhwI16i16o = dnnl_OhwI16i16o, + OIhw16i32o = dnnl_OIhw16i32o, + OhwI16i32o = dnnl_OhwI16i32o, + OIhw16i48o = dnnl_OIhw16i48o, + OhwI16i48o = dnnl_OhwI16i48o, + OIhw16i64o = dnnl_OIhw16i64o, + OhwI16i64o = dnnl_OhwI16i64o, + OIhw16o16i = dnnl_OIhw16o16i, + Oihw16o = dnnl_Oihw16o, + OIhw4i8o4i = dnnl_OIhw4i8o4i, + OhwI4i8o4i = dnnl_OhwI4i8o4i, + OIhw4i16o4i = dnnl_OIhw4i16o4i, + OhwI4i16o4i = dnnl_OhwI4i16o4i, + OIhw4i24o4i = dnnl_OIhw4i24o4i, + OhwI4i24o4i = 
dnnl_OhwI4i24o4i, + OIhw4i32o4i = dnnl_OIhw4i32o4i, + OhwI4i32o4i = dnnl_OhwI4i32o4i, + OIhw4i64o4i = dnnl_OIhw4i64o4i, + OhwI4i64o4i = dnnl_OhwI4i64o4i, + OIhw4i4o = dnnl_OIhw4i4o, + OIhw4o4i = dnnl_OIhw4o4i, + Oihw4o = dnnl_Oihw4o, + OIhw8i16o2i = dnnl_OIhw8i16o2i, + OhwI8i16o2i = dnnl_OhwI8i16o2i, + OIhw8i32o2i = dnnl_OIhw8i32o2i, + OhwI8i32o2i = dnnl_OhwI8i32o2i, + OIhw8i64o2i = dnnl_OIhw8i64o2i, + OhwI8i64o2i = dnnl_OhwI8i64o2i, + OIhw8i8o = dnnl_OIhw8i8o, + OhwI8i8o = dnnl_OhwI8i8o, + OIhw8o16i2o = dnnl_OIhw8o16i2o, + OIhw8o8i = dnnl_OIhw8o8i, + OIhw8o4i = dnnl_OIhw8o4i, + OIhw2i8o4i = dnnl_OIhw2i8o4i, + IOdhw8o8i = dnnl_IOdhw8o8i, + IOdhw16o16i = dnnl_IOdhw16o16i, + Odhwi16o = dnnl_Odhwi16o, + OdhwI16o2i = dnnl_OdhwI16o2i, + Idhwo16i = dnnl_Idhwo16i, + IdhwO16i2o = dnnl_IdhwO16i2o, + IdhwO16i4o = dnnl_IdhwO16i4o, + Odhwi4o = dnnl_Odhwi4o, + Odhwi8o = dnnl_Odhwi8o, + OdhwI8o2i = dnnl_OdhwI8o2i, + OdhwI8o4i = dnnl_OdhwI8o4i, + OIdhw16i16o = dnnl_OIdhw16i16o, + OdhwI16i16o = dnnl_OdhwI16i16o, + OIdhw16i32o = dnnl_OIdhw16i32o, + OdhwI16i32o = dnnl_OdhwI16i32o, + OIdhw16i48o = dnnl_OIdhw16i48o, + OdhwI16i48o = dnnl_OdhwI16i48o, + OIdhw16i64o = dnnl_OIdhw16i64o, + OdhwI16i64o = dnnl_OdhwI16i64o, + OIdhw16o16i = dnnl_OIdhw16o16i, + OIdhw16o16i2o = dnnl_OIdhw16o16i2o, + Oidhw16o = dnnl_Oidhw16o, + OIdhw4i4o = dnnl_OIdhw4i4o, + OIdhw4o4i = dnnl_OIdhw4o4i, + Oidhw4o = dnnl_Oidhw4o, + OIdhw8i16o2i = dnnl_OIdhw8i16o2i, + OdhwI8i16o2i = dnnl_OdhwI8i16o2i, + OIdhw8i32o2i = dnnl_OIdhw8i32o2i, + OdhwI8i32o2i = dnnl_OdhwI8i32o2i, + OIdhw8i64o2i = dnnl_OIdhw8i64o2i, + OdhwI8i64o2i = dnnl_OdhwI8i64o2i, + OIdhw4i8o4i = dnnl_OIdhw4i8o4i, + OdhwI4i8o4i = dnnl_OdhwI4i8o4i, + OIdhw4i16o4i = dnnl_OIdhw4i16o4i, + OdhwI4i16o4i = dnnl_OdhwI4i16o4i, + OIdhw16i16o4i = dnnl_OIdhw16i16o4i, + OIdhw16i32o4i = dnnl_OIdhw16i32o4i, + OIdhw16i48o4i = dnnl_OIdhw16i48o4i, + OIdhw16i64o4i = dnnl_OIdhw16i64o4i, + OIdhw16i16o2i = dnnl_OIdhw16i16o2i, + OIdhw16i32o2i = dnnl_OIdhw16i32o2i, + 
OIdhw16i48o2i = dnnl_OIdhw16i48o2i, + OIdhw16i64o2i = dnnl_OIdhw16i64o2i, + OIdhw4i24o4i = dnnl_OIdhw4i24o4i, + OdhwI4i24o4i = dnnl_OdhwI4i24o4i, + OIdhw4i32o4i = dnnl_OIdhw4i32o4i, + OdhwI4i32o4i = dnnl_OdhwI4i32o4i, + OIdhw4i64o4i = dnnl_OIdhw4i64o4i, + OdhwI4i64o4i = dnnl_OdhwI4i64o4i, + OIdhw2i8o4i = dnnl_OIdhw2i8o4i, + OIdhw8i8o = dnnl_OIdhw8i8o, + OdhwI8i8o = dnnl_OdhwI8i8o, + OIdhw8o8i = dnnl_OIdhw8o8i, + OIdhw8o4i = dnnl_OIdhw8o4i, + gIOw8o8i = dnnl_gIOw8o8i, + gIOw16o16i = dnnl_gIOw16o16i, + gOIw16i16o = dnnl_gOIw16i16o, + gOIw16o16i = dnnl_gOIw16o16i, + gOiw16o = dnnl_gOiw16o, + gOIw4i16o4i = dnnl_gOIw4i16o4i, + gOIw2i8o4i = dnnl_gOIw2i8o4i, + gOIw4i4o = dnnl_gOIw4i4o, + gOIw4o4i = dnnl_gOIw4o4i, + gOiw4o = dnnl_gOiw4o, + gOIw8i16o2i = dnnl_gOIw8i16o2i, + gOIw8i8o = dnnl_gOIw8i8o, + gOIw8o16i2o = dnnl_gOIw8o16i2o, + gOIw8o8i = dnnl_gOIw8o8i, + gOIw8o4i = dnnl_gOIw8o4i, + gOIw16i16o4i = dnnl_gOIw16i16o4i, + gOIw16i16o2i = dnnl_gOIw16i16o2i, + gOIw16o16i2o = dnnl_gOIw16o16i2o, + gOwi16o = dnnl_gOwi16o, + gOwI16o2i = dnnl_gOwI16o2i, + gIwo16i = dnnl_gIwo16i, + gIwO16i2o = dnnl_gIwO16i2o, + gIwO16i4o = dnnl_gIwO16i4o, + gOwi4o = dnnl_gOwi4o, + gOwi8o = dnnl_gOwi8o, + gOwI8o2i = dnnl_gOwI8o2i, + gOwI8o4i = dnnl_gOwI8o4i, + Goiw8g = dnnl_Goiw8g, + Goiw16g = dnnl_Goiw16g, + gIOhw8o8i = dnnl_gIOhw8o8i, + gIOhw16o16i = dnnl_gIOhw16o16i, + gOhwi16o = dnnl_gOhwi16o, + gOhwI16o2i = dnnl_gOhwI16o2i, + gIhwo16i = dnnl_gIhwo16i, + gIhwO16i2o = dnnl_gIhwO16i2o, + gIhwO16i4o = dnnl_gIhwO16i4o, + gOhwi4o = dnnl_gOhwi4o, + gOhwi8o = dnnl_gOhwi8o, + gOhwI8o2i = dnnl_gOhwI8o2i, + gOhwI8o4i = dnnl_gOhwI8o4i, + Goihw16g = dnnl_Goihw16g, + gOIhw16i16o = dnnl_gOIhw16i16o, + gOIhw16o16i = dnnl_gOIhw16o16i, + gOihw16o = dnnl_gOihw16o, + gOIhw4i16o4i = dnnl_gOIhw4i16o4i, + gOIhw2i8o4i = dnnl_gOIhw2i8o4i, + gOIhw4i4o = dnnl_gOIhw4i4o, + gOIhw4o4i = dnnl_gOIhw4o4i, + gOihw4o = dnnl_gOihw4o, + Goihw8g = dnnl_Goihw8g, + gOIhw8i16o2i = dnnl_gOIhw8i16o2i, + gOIhw8i8o = dnnl_gOIhw8i8o, + 
gOIhw8o16i2o = dnnl_gOIhw8o16i2o, + OIw4o8i8o4i = dnnl_OIw4o8i8o4i, + OIdhw4o8i8o4i = dnnl_OIdhw4o8i8o4i, + OIhw4o8i8o4i = dnnl_OIhw4o8i8o4i, + OIhw2o8i8o2i = dnnl_OIhw2o8i8o2i, + gOIw4o8i8o4i = dnnl_gOIw4o8i8o4i, + gOIdhw4o8i8o4i = dnnl_gOIdhw4o8i8o4i, + gOIhw4o8i8o4i = dnnl_gOIhw4o8i8o4i, + gOIhw2o8i8o2i = dnnl_gOIhw2o8i8o2i, + OIhw16i16o4i = dnnl_OIhw16i16o4i, + OIhw16i32o4i = dnnl_OIhw16i32o4i, + OIhw16i48o4i = dnnl_OIhw16i48o4i, + OIhw16i64o4i = dnnl_OIhw16i64o4i, + OIhw16i16o2i = dnnl_OIhw16i16o2i, + OIhw16i32o2i = dnnl_OIhw16i32o2i, + OIhw16i48o2i = dnnl_OIhw16i48o2i, + OIhw16i64o2i = dnnl_OIhw16i64o2i, + OIhw16o16i2o = dnnl_OIhw16o16i2o, + gOIhw16i16o4i = dnnl_gOIhw16i16o4i, + gOIhw16i16o2i = dnnl_gOIhw16i16o2i, + gOIhw16o16i2o = dnnl_gOIhw16o16i2o, + gOIhw8o8i = dnnl_gOIhw8o8i, + gOIhw8o4i = dnnl_gOIhw8o4i, + gIOdhw16i16o = dnnl_gIOdhw16i16o, + gIOdhw8o8i = dnnl_gIOdhw8o8i, + gIOdhw16o16i = dnnl_gIOdhw16o16i, + gOdhwi16o = dnnl_gOdhwi16o, + gOdhwI16o2i = dnnl_gOdhwI16o2i, + gIdhwo16i = dnnl_gIdhwo16i, + gIdhwO16i2o = dnnl_gIdhwO16i2o, + gIdhwO16i4o = dnnl_gIdhwO16i4o, + gOdhwi4o = dnnl_gOdhwi4o, + gOdhwi8o = dnnl_gOdhwi8o, + gOdhwI8o2i = dnnl_gOdhwI8o2i, + gOdhwI8o4i = dnnl_gOdhwI8o4i, + gOIdhw16i16o = dnnl_gOIdhw16i16o, + gOIdhw16o16i = dnnl_gOIdhw16o16i, + gOIdhw16o16i2o = dnnl_gOIdhw16o16i2o, + gOidhw16o = dnnl_gOidhw16o, + gOIdhw4i4o = dnnl_gOIdhw4i4o, + gOIdhw4o4i = dnnl_gOIdhw4o4i, + gOidhw4o = dnnl_gOidhw4o, + gOIdhw8i16o2i = dnnl_gOIdhw8i16o2i, + gOIdhw4i16o4i = dnnl_gOIdhw4i16o4i, + gOIdhw16i16o4i = dnnl_gOIdhw16i16o4i, + gOIdhw16i16o2i = dnnl_gOIdhw16i16o2i, + gOIdhw2i8o4i = dnnl_gOIdhw2i8o4i, + gOIdhw8i8o = dnnl_gOIdhw8i8o, + gOIdhw8o8i = dnnl_gOIdhw8o8i, + gOIdhw8o4i = dnnl_gOIdhw8o4i, + gOIw2i4o2i = dnnl_gOIw2i4o2i, + gOIhw2i4o2i = dnnl_gOIhw2i4o2i, + gOIdhw2i4o2i = dnnl_gOIdhw2i4o2i, + gOIw2o4i2o = dnnl_gOIw2o4i2o, + gOIhw2o4i2o = dnnl_gOIhw2o4i2o, + gOIdhw2o4i2o = dnnl_gOIdhw2o4i2o, + gOIw4i8o2i = dnnl_gOIw4i8o2i, + gOIhw4i8o2i = 
dnnl_gOIhw4i8o2i, + gOIdhw4i8o2i = dnnl_gOIdhw4i8o2i, + gOIw4o8i2o = dnnl_gOIw4o8i2o, + gOIhw4o8i2o = dnnl_gOIhw4o8i2o, + gOIdhw4o8i2o = dnnl_gOIdhw4o8i2o, + + ldOi16o = abDc16d, + ldOi32o = abDc32d, + ldOI16o4i = abDC16d4c, + ldOI32o4i = abDC32d4c, + ldgOi16o = abdEc16e, + ldgOI16o4i = abdEC16e4c, + ldgOi32o = abdEc32e, + ldgOI32o2i = abdEC32e2c, + ldgOI32o4i = abdEC32e4c, + OwI16o4i = dnnl_OwI16o4i, + OhwI16o4i = dnnl_OhwI16o4i, + gOwI16o4i = dnnl_gOwI16o4i, + gOhwI16o4i = dnnl_gOhwI16o4i, + OdhwI16o4i = dnnl_OdhwI16o4i, + gOdhwI16o4i = dnnl_gOdhwI16o4i, + + Owi32o = dnnl_Owi32o, + OwI32o2i = dnnl_OwI32o2i, + OwI32o4i = dnnl_OwI32o4i, + Owi48o = dnnl_Owi48o, + OwI48o2i = dnnl_OwI48o2i, + OwI48o4i = dnnl_OwI48o4i, + Owi64o = dnnl_Owi64o, + OwI64o2i = dnnl_OwI64o2i, + OwI64o4i = dnnl_OwI64o4i, + Iwo32i = dnnl_Iwo32i, + IwO32i2o = dnnl_IwO32i2o, + IwO32i4o = dnnl_IwO32i4o, + Iwo48i = dnnl_Iwo48i, + IwO48i2o = dnnl_IwO48i2o, + IwO48i4o = dnnl_IwO48i4o, + Iwo64i = dnnl_Iwo64i, + IwO64i2o = dnnl_IwO64i2o, + IwO64i4o = dnnl_IwO64i4o, + wIo2i = dnnl_wIo2i, + wIo4i = dnnl_wIo4i, + gOwi32o = dnnl_gOwi32o, + gOwI32o2i = dnnl_gOwI32o2i, + gOwI32o4i = dnnl_gOwI32o4i, + gOwi48o = dnnl_gOwi48o, + gOwI48o2i = dnnl_gOwI48o2i, + gOwI48o4i = dnnl_gOwI48o4i, + gOwi64o = dnnl_gOwi64o, + gOwI64o2i = dnnl_gOwI64o2i, + gOwI64o4i = dnnl_gOwI64o4i, + gIwo32i = dnnl_gIwo32i, + gIwO32i2o = dnnl_gIwO32i2o, + gIwO32i4o = dnnl_gIwO32i4o, + gIwo48i = dnnl_gIwo48i, + gIwO48i2o = dnnl_gIwO48i2o, + gIwO48i4o = dnnl_gIwO48i4o, + gIwo64i = dnnl_gIwo64i, + gIwO64i2o = dnnl_gIwO64i2o, + gIwO64i4o = dnnl_gIwO64i4o, + gwio = dnnl_gwio, + gwIo2i = dnnl_gwIo2i, + gwIo4i = dnnl_gwIo4i, + OhwI32o = dnnl_OhwI32o, + OhwI32o2i = dnnl_OhwI32o2i, + OhwI32o4i = dnnl_OhwI32o4i, + Ohwi48o = dnnl_Ohwi48o, + OhwI48o2i = dnnl_OhwI48o2i, + OhwI48o4i = dnnl_OhwI48o4i, + Ohwi64o = dnnl_Ohwi64o, + OhwI64o2i = dnnl_OhwI64o2i, + OhwI64o4i = dnnl_OhwI64o4i, + Ihwo32i = dnnl_Ihwo32i, + IhwO32i2o = dnnl_IhwO32i2o, + IhwO32i4o 
= dnnl_IhwO32i4o, + Ihwo48i = dnnl_Ihwo48i, + IhwO48i2o = dnnl_IhwO48i2o, + IhwO48i4o = dnnl_IhwO48i4o, + Ihwo64i = dnnl_Ihwo64i, + IhwO64i2o = dnnl_IhwO64i2o, + IhwO64i4o = dnnl_IhwO64i4o, + hwIo2i = dnnl_hwIo2i, + hwIo4i = dnnl_hwIo4i, + gOhwI32o = dnnl_gOhwI32o, + gOhwI32o2i = dnnl_gOhwI32o2i, + gOhwI32o4i = dnnl_gOhwI32o4i, + gOhwi48o = dnnl_gOhwi48o, + gOhwI48o2i = dnnl_gOhwI48o2i, + gOhwI48o4i = dnnl_gOhwI48o4i, + gOhwi64o = dnnl_gOhwi64o, + gOhwI64o2i = dnnl_gOhwI64o2i, + gOhwI64o4i = dnnl_gOhwI64o4i, + gIhwo32i = dnnl_gIhwo32i, + gIhwO32i2o = dnnl_gIhwO32i2o, + gIhwO32i4o = dnnl_gIhwO32i4o, + gIhwo48i = dnnl_gIhwo48i, + gIhwO48i2o = dnnl_gIhwO48i2o, + gIhwO48i4o = dnnl_gIhwO48i4o, + gIhwo64i = dnnl_gIhwo64i, + gIhwO64i2o = dnnl_gIhwO64i2o, + gIhwO64i4o = dnnl_gIhwO64i4o, + ghwio = dnnl_ghwio, + ghwIo2i = dnnl_ghwIo2i, + ghwIo4i = dnnl_ghwIo4i, + Odhwi32o = dnnl_Odhwi32o, + OdhwI32o2i = dnnl_OdhwI32o2i, + OdhwI32o4i = dnnl_OdhwI32o4i, + Odhwi48o = dnnl_Odhwi48o, + OdhwI48o2i = dnnl_OdhwI48o2i, + OdhwI48o4i = dnnl_OdhwI48o4i, + Odhwi64o = dnnl_Odhwi64o, + OdhwI64o2i = dnnl_OdhwI64o2i, + OdhwI64o4i = dnnl_OdhwI64o4i, + Idhwo32i = dnnl_Idhwo32i, + IdhwO32i2o = dnnl_IdhwO32i2o, + IdhwO32i4o = dnnl_IdhwO32i4o, + Idhwo48i = dnnl_Idhwo48i, + IdhwO48i2o = dnnl_IdhwO48i2o, + IdhwO48i4o = dnnl_IdhwO48i4o, + Idhwo64i = dnnl_Idhwo64i, + IdhwO64i2o = dnnl_IdhwO64i2o, + IdhwO64i4o = dnnl_IdhwO64i4o, + dhwIo2i = dnnl_dhwIo2i, + dhwIo4i = dnnl_dhwIo4i, + gOdhwi32o = dnnl_gOdhwi32o, + gOdhwI32o2i = dnnl_gOdhwI32o2i, + gOdhwI32o4i = dnnl_gOdhwI32o4i, + gOdhwi48o = dnnl_gOdhwi48o, + gOdhwI48o2i = dnnl_gOdhwI48o2i, + gOdhwI48o4i = dnnl_gOdhwI48o4i, + gOdhwi64o = dnnl_gOdhwi64o, + gOdhwI64o2i = dnnl_gOdhwI64o2i, + gOdhwI64o4i = dnnl_gOdhwI64o4i, + gIdhwo32i = dnnl_gIdhwo32i, + gIdhwO32i2o = dnnl_gIdhwO32i2o, + gIdhwO32i4o = dnnl_gIdhwO32i4o, + gIdhwo48i = dnnl_gIdhwo48i, + gIdhwO48i2o = dnnl_gIdhwO48i2o, + gIdhwO48i4o = dnnl_gIdhwO48i4o, + gIdhwo64i = dnnl_gIdhwo64i, + 
gIdhwO64i2o = dnnl_gIdhwO64i2o, + gIdhwO64i4o = dnnl_gIdhwO64i4o, + gdhwio = dnnl_gdhwio, + gdhwIo2i = dnnl_gdhwIo2i, + gdhwIo4i = dnnl_gdhwIo4i, + ldIo32i = dnnl_ldIo32i, + ldgIo16i = dnnl_ldgIo16i, + ldgIo32i = dnnl_ldgIo32i, + ldgIO32i2o = dnnl_ldgIO32i2o, + nCdhw32c = dnnl_nCdhw32c, + nChw32c = dnnl_nChw32c, + nCw32c = dnnl_nCw32c, + NCw32n16c = dnnl_NCw32n16c, + NChw32n16c = dnnl_NChw32n16c, + NCdhw32n16c = dnnl_NCdhw32n16c, + NCw32n32c = dnnl_NCw32n32c, + OI16i16o4i = dnnl_OI16i16o4i, + IOw8o16i2o = dnnl_IOw8o16i2o, + IOhw8o16i2o = dnnl_IOhw8o16i2o, + Owhi16o = dnnl_Owhi16o, + OIdhw8o16i2o = dnnl_OIdhw8o16i2o, + IOdhw8o16i2o = dnnl_IOdhw8o16i2o, + Goiw4g = dnnl_Goiw4g, + gIOw8o16i2o = dnnl_gIOw8o16i2o, + Goiw32g = dnnl_Goiw32g, + Goihw4g = dnnl_Goihw4g, + gIOhw8o16i2o = dnnl_gIOhw8o16i2o, + Goihw32g = dnnl_Goihw32g, + gOwhi16o = dnnl_gOwhi16o, + IOw4i8o8i4o = dnnl_IOw4i8o8i4o, + IOhw4i8o8i4o = dnnl_IOhw4i8o8i4o, + IOdhw4i8o8i4o = dnnl_IOdhw4i8o8i4o, + gIOw4i8o8i4o = dnnl_gIOw4i8o8i4o, + gIOhw4i8o8i4o = dnnl_gIOhw4i8o8i4o, + gIOdhw4i8o8i4o = dnnl_gIOdhw4i8o8i4o, + gOIdhw8o16i2o = dnnl_gOIdhw8o16i2o, + gIOdhw8o16i2o = dnnl_gIOdhw8o16i2o, + Goidhw32g = dnnl_Goidhw32g, + OI16i32o4i = dnnl_OI16i32o4i, + OI16i48o4i = dnnl_OI16i48o4i, + OI16i64o4i = dnnl_OI16i64o4i, + OI16i16o2i = dnnl_OI16i16o2i, + OI16i32o2i = dnnl_OI16i32o2i, + OI16i48o2i = dnnl_OI16i48o2i, + OI16i64o2i = dnnl_OI16i64o2i, + aBdeC16c16b4c = dnnl_aBdeC16c16b4c, + AcB16b16a2b = dnnl_AcB16b16a2b, + aBdC16c16b2c = dnnl_aBdC16c16b2c, + AcB16b16a4b = dnnl_AcB16b16a4b, + aBdC16c16b4c = dnnl_aBdC16c16b4c, + AcdB16b16a2b = dnnl_AcdB16b16a2b, + aBdefC16c16b4c = dnnl_aBdefC16c16b4c, + AcdeB16b16a4b = dnnl_AcdeB16b16a4b, + AcB16b32a2b = dnnl_AcB16b32a2b, + AcB16b32a4b = dnnl_AcB16b32a4b, + AcB16b48a2b = dnnl_AcB16b48a2b, + AcB16b48a4b = dnnl_AcB16b48a4b, + AcB16b64a2b = dnnl_AcB16b64a2b, + AcB16b64a4b = dnnl_AcB16b64a4b, + aBdC16c32b2c = dnnl_aBdC16c32b2c, + aBdC16c32b4c = dnnl_aBdC16c32b4c, + aBdC16c48b2c = 
dnnl_aBdC16c48b2c, + aBdC16c48b4c = dnnl_aBdC16c48b4c, + aBdC16c64b2c = dnnl_aBdC16c64b2c, + aBdC16c64b4c = dnnl_aBdC16c64b4c, + AcdB16b32a2b = dnnl_AcdB16b32a2b, + AcdB16b32a4b = dnnl_AcdB16b32a4b, + AcdB16b48a2b = dnnl_AcdB16b48a2b, + AcdB16b48a4b = dnnl_AcdB16b48a4b, + AcdB16b64a2b = dnnl_AcdB16b64a2b, + AcdB16b64a4b = dnnl_AcdB16b64a4b, + aBdeC16c32b2c = dnnl_aBdeC16c32b2c, + aBdeC16c32b4c = dnnl_aBdeC16c32b4c, + aBdeC16c48b2c = dnnl_aBdeC16c48b2c, + aBdeC16c48b4c = dnnl_aBdeC16c48b4c, + aBdeC16c64b2c = dnnl_aBdeC16c64b2c, + aBdeC16c64b4c = dnnl_aBdeC16c64b4c, + AcdeB16b32a2b = dnnl_AcdeB16b32a2b, + AcdeB16b32a4b = dnnl_AcdeB16b32a4b, + AcdeB16b48a2b = dnnl_AcdeB16b48a2b, + AcdeB16b48a4b = dnnl_AcdeB16b48a4b, + AcdeB16b64a2b = dnnl_AcdeB16b64a2b, + AcdeB16b64a4b = dnnl_AcdeB16b64a4b, + aBdefC16c32b2c = dnnl_aBdefC16c32b2c, + aBdefC16c32b4c = dnnl_aBdefC16c32b4c, + aBdefC16c48b2c = dnnl_aBdefC16c48b2c, + aBdefC16c48b4c = dnnl_aBdefC16c48b4c, + aBdefC16c64b2c = dnnl_aBdefC16c64b2c, + aBdefC16c64b4c = dnnl_aBdefC16c64b4c, + OwI16i16o2i = dnnl_OwI16i16o2i, + gOwI16i16o2i = dnnl_gOwI16i16o2i, + OhwI16i16o2i = dnnl_OhwI16i16o2i, + gOhwI16i16o2i = dnnl_gOhwI16i16o2i, + OdhwI16i16o2i = dnnl_OdhwI16i16o2i, + gOdhwI16i16o2i = dnnl_gOdhwI16i16o2i, + OwI16i16o4i = dnnl_OwI16i16o4i, + gOwI16i16o4i = dnnl_gOwI16i16o4i, + OhwI16i16o4i = dnnl_OhwI16i16o4i, + gOhwI16i16o4i = dnnl_gOhwI16i16o4i, + OdhwI16i16o4i = dnnl_OdhwI16i16o4i, + gOdhwI16i16o4i = dnnl_gOdhwI16i16o4i, + OwI16i32o2i = dnnl_OwI16i32o2i, + OwI16i32o4i = dnnl_OwI16i32o4i, + OwI16i48o2i = dnnl_OwI16i48o2i, + OwI16i48o4i = dnnl_OwI16i48o4i, + OwI16i64o2i = dnnl_OwI16i64o2i, + OwI16i64o4i = dnnl_OwI16i64o4i, + gOwI16i32o2i = dnnl_gOwI16i32o2i, + gOwI16i32o4i = dnnl_gOwI16i32o4i, + gOwI16i48o2i = dnnl_gOwI16i48o2i, + gOwI16i48o4i = dnnl_gOwI16i48o4i, + gOwI16i64o2i = dnnl_gOwI16i64o2i, + gOwI16i64o4i = dnnl_gOwI16i64o4i, + OhwI16i32o2i = dnnl_OhwI16i32o2i, + OhwI16i32o4i = dnnl_OhwI16i32o4i, + OhwI16i48o2i = 
dnnl_OhwI16i48o2i, + OhwI16i48o4i = dnnl_OhwI16i48o4i, + OhwI16i64o2i = dnnl_OhwI16i64o2i, + OhwI16i64o4i = dnnl_OhwI16i64o4i, + gOhwI16i32o2i = dnnl_gOhwI16i32o2i, + gOhwI16i32o4i = dnnl_gOhwI16i32o4i, + gOhwI16i48o2i = dnnl_gOhwI16i48o2i, + gOhwI16i48o4i = dnnl_gOhwI16i48o4i, + gOhwI16i64o2i = dnnl_gOhwI16i64o2i, + gOhwI16i64o4i = dnnl_gOhwI16i64o4i, + OdhwI16i32o2i = dnnl_OdhwI16i32o2i, + OdhwI16i32o4i = dnnl_OdhwI16i32o4i, + OdhwI16i48o2i = dnnl_OdhwI16i48o2i, + OdhwI16i48o4i = dnnl_OdhwI16i48o4i, + OdhwI16i64o2i = dnnl_OdhwI16i64o2i, + OdhwI16i64o4i = dnnl_OdhwI16i64o4i, + IdhwO16o32i2o = dnnl_IdhwO16o32i2o, + IdhwO16o32i4o = dnnl_IdhwO16o32i4o, + IdhwO16o48i2o = dnnl_IdhwO16o48i2o, + IdhwO16o48i4o = dnnl_IdhwO16o48i4o, + IdhwO16o64i2o = dnnl_IdhwO16o64i2o, + IdhwO16o64i4o = dnnl_IdhwO16o64i4o, + gOdhwI16i32o2i = dnnl_gOdhwI16i32o2i, + gOdhwI16i32o4i = dnnl_gOdhwI16i32o4i, + gOdhwI16i48o2i = dnnl_gOdhwI16i48o2i, + gOdhwI16i48o4i = dnnl_gOdhwI16i48o4i, + gOdhwI16i64o2i = dnnl_gOdhwI16i64o2i, + gOdhwI16i64o4i = dnnl_gOdhwI16i64o4i, + gIdhwO16o32i2o = dnnl_gIdhwO16o32i2o, + gIdhwO16o32i4o = dnnl_gIdhwO16o32i4o, + gIdhwO16o48i2o = dnnl_gIdhwO16o48i2o, + gIdhwO16o48i4o = dnnl_gIdhwO16o48i4o, + gIdhwO16o64i2o = dnnl_gIdhwO16o64i2o, + gIdhwO16o64i4o = dnnl_gIdhwO16o64i4o, + IwO16o16i2o = dnnl_IwO16o16i2o, + IwO16o16i4o = dnnl_IwO16o16i4o, + IhwO16o16i2o = dnnl_IhwO16o16i2o, + IhwO16o16i4o = dnnl_IhwO16o16i4o, + IdhwO16o16i2o = dnnl_IdhwO16o16i2o, + IdhwO16o16i4o = dnnl_IdhwO16o16i4o, + gIwO16o16i2o = dnnl_gIwO16o16i2o, + gIwO16o16i4o = dnnl_gIwO16o16i4o, + gIhwO16o16i2o = dnnl_gIhwO16o16i2o, + gIhwO16o16i4o = dnnl_gIhwO16o16i4o, + gIdhwO16o16i2o = dnnl_gIdhwO16o16i2o, + gIdhwO16o16i4o = dnnl_gIdhwO16o16i4o, + IwO16o32i2o = dnnl_IwO16o32i2o, + IwO16o32i4o = dnnl_IwO16o32i4o, + IwO16o48i2o = dnnl_IwO16o48i2o, + IwO16o48i4o = dnnl_IwO16o48i4o, + IwO16o64i2o = dnnl_IwO16o64i2o, + IwO16o64i4o = dnnl_IwO16o64i4o, + gIwO16o32i2o = dnnl_gIwO16o32i2o, + gIwO16o32i4o = 
dnnl_gIwO16o32i4o, + gIwO16o48i2o = dnnl_gIwO16o48i2o, + gIwO16o48i4o = dnnl_gIwO16o48i4o, + gIwO16o64i2o = dnnl_gIwO16o64i2o, + gIwO16o64i4o = dnnl_gIwO16o64i4o, + IhwO16o32i2o = dnnl_IhwO16o32i2o, + IhwO16o32i4o = dnnl_IhwO16o32i4o, + IhwO16o48i2o = dnnl_IhwO16o48i2o, + IhwO16o48i4o = dnnl_IhwO16o48i4o, + IhwO16o64i2o = dnnl_IhwO16o64i2o, + IhwO16o64i4o = dnnl_IhwO16o64i4o, + gIhwO16o32i2o = dnnl_gIhwO16o32i2o, + gIhwO16o32i4o = dnnl_gIhwO16o32i4o, + gIhwO16o48i2o = dnnl_gIhwO16o48i2o, + gIhwO16o48i4o = dnnl_gIhwO16o48i4o, + gIhwO16o64i2o = dnnl_gIhwO16o64i2o, + gIhwO16o64i4o = dnnl_gIhwO16o64i4o, + aBdeC16c16b2c = dnnl_aBdeC16c16b2c, + aBdefC16c16b2c = dnnl_aBdefC16c16b2c, + AcdB16b16a4b = dnnl_AcdB16b16a4b, + AcdeB16b16a2b = dnnl_AcdeB16b16a2b, + hwioG16g = dnnl_hwioG16g, + hwioG8g = dnnl_hwioG8g, + dhwioG16g = dnnl_dhwioG16g, + dhwioG8g = dnnl_dhwioG8g, + ABc4a2b = dnnl_ABc4a2b, + ABc8a2b = dnnl_ABc8a2b, + ABcd4a2b = dnnl_ABcd4a2b, + ABcde4a2b = dnnl_ABcde4a2b, + ABcde8a2b = dnnl_ABcde8a2b, + ABcd4a8b8a2b = dnnl_ABcd4a8b8a2b, + NCdhw40n32c = dnnl_NCdhw40n32c, + NChw40n32c = dnnl_NChw40n32c, + NCw40n32c = dnnl_NCw40n32c, + OIdhw4o8i8o2i = dnnl_OIdhw4o8i8o2i, + OIhw4o8i8o2i = dnnl_OIhw4o8i8o2i, + OIw4o8i8o2i = dnnl_OIw4o8i8o2i, + gOIdhw4o8i8o2i = dnnl_gOIdhw4o8i8o2i, + gOIhw4o8i8o2i = dnnl_gOIhw4o8i8o2i, + gOIw4o8i8o2i = dnnl_gOIw4o8i8o2i, + IOdhw4i8o8i2o = dnnl_IOdhw4i8o8i2o, + IOhw4i8o8i2o = dnnl_IOhw4i8o8i2o, + IOw4i8o8i2o = dnnl_IOw4i8o8i2o, + gIOdhw4i8o8i2o = dnnl_gIOdhw4i8o8i2o, + gIOhw4i8o8i2o = dnnl_gIOhw4i8o8i2o, + gIOw4i8o8i2o = dnnl_gIOw4i8o8i2o, + aBCd8b2c = dnnl_aBCd8b2c, + ABcde40a16b = dnnl_ABcde40a16b, + ABcde40a32b = dnnl_ABcde40a32b, + aBCde8b2c = dnnl_aBCde8b2c, + ABcde4a8b8a2b = dnnl_ABcde4a8b8a2b, + ABc4a8b8a2b = dnnl_ABc4a8b8a2b, + aBCdef4b8c8b2c = dnnl_aBCdef4b8c8b2c, + aBCde4b8c8b2c = dnnl_aBCde4b8c8b2c, + aBCd4b8c8b2c = dnnl_aBCd4b8c8b2c, + BAcde4b8a8b2a = dnnl_BAcde4b8a8b2a, + BAcd4b8a8b2a = dnnl_BAcd4b8a8b2a, + BAc4b8a8b2a = 
dnnl_BAc4b8a8b2a, + aCBdef4c8b8c2b = dnnl_aCBdef4c8b8c2b, + aCBde4c8b8c2b = dnnl_aCBde4c8b8c2b, + aCBd4c8b8c2b = dnnl_aCBd4c8b8c2b, + aBCdef8b2c = dnnl_aBCdef8b2c, + AB32a16b = dnnl_AB32a16b, + AB32a32b = dnnl_AB32a32b, + BA4b8a8b2a = dnnl_BA4b8a8b2a, + BA4b8a8b4a = dnnl_BA4b8a8b4a, + aBC32b16c = dnnl_aBC32b16c, + aBC32b32c = dnnl_aBC32b32c, + aCB4c8b8c2b = dnnl_aCB4c8b8c2b, + aCB4c8b8c4b = dnnl_aCB4c8b8c4b, + ABc2b8a16b4a = dnnl_ABc2b8a16b4a, + ABcd2b8a16b4a = dnnl_ABcd2b8a16b4a, + ABcde2b8a16b4a = dnnl_ABcde2b8a16b4a, + ABc2a8b16a4b = dnnl_ABc2a8b16a4b, + ABc2a8b16a2b = dnnl_ABc2a8b16a2b, + ABc2b32a8b = dnnl_ABc2b32a8b, + ABcd2a8b16a4b = dnnl_ABcd2a8b16a4b, + ABcd2a8b16a2b = dnnl_ABcd2a8b16a2b, + aCBd2c8b16c2b = dnnl_aCBd2c8b16c2b, + ABcd2b32a8b = dnnl_ABcd2b32a8b, + aBCd2c8b16c2b = dnnl_aBCd2c8b16c2b, + ABcde2a8b16a4b = dnnl_ABcde2a8b16a4b, + ABcde2a8b16a2b = dnnl_ABcde2a8b16a2b, + aCBde2c8b16c2b = dnnl_aCBde2c8b16c2b, + ABcde2b32a8b = dnnl_ABcde2b32a8b, + aBC2b8c16b2c = dnnl_aBC2b8c16b2c, + aBCd2b8c16b2c = dnnl_aBCd2b8c16b2c, + aBCde2b8c16b2c = dnnl_aBCde2b8c16b2c, + aBCdef2b8c16b2c = dnnl_aBCdef2b8c16b2c, + BAcde2b8a16b4a = dnnl_BAcde2b8a16b4a, + BAcd2b8a16b4a = dnnl_BAcd2b8a16b4a, + BAc2b8a16b4a = dnnl_BAc2b8a16b4a, + BAcde2b8a16b2a = dnnl_BAcde2b8a16b2a, + BAcd2b8a16b2a = dnnl_BAcd2b8a16b2a, + BAc2b8a16b2a = dnnl_BAc2b8a16b2a, + aBCde2c8b16c2b = dnnl_aBCde2c8b16c2b, + aBCdef2c8b16c2b = dnnl_aBCdef2c8b16c2b, + aCBdef2c8b16c2b = dnnl_aCBdef2c8b16c2b, + aBCd2b8c16b4c = dnnl_aBCd2b8c16b4c, + aBCde2b8c16b4c = dnnl_aBCde2b8c16b4c, + NCdhw40n16c = dnnl_NCdhw40n16c, + NCw40n16c = dnnl_NCw40n16c, + NChw40n16c = dnnl_NChw40n16c, + NCw2c32n8c = dnnl_NCw2c32n8c, + NChw2c32n8c = dnnl_NChw2c32n8c, + NCdhw2c32n8c = dnnl_NCdhw2c32n8c, + OIw2i8o16i4o = dnnl_OIw2i8o16i4o, + OIhw2i8o16i4o = dnnl_OIhw2i8o16i4o, + OIdhw2i8o16i4o = dnnl_OIdhw2i8o16i4o, + OIw2o8i16o4i = dnnl_OIw2o8i16o4i, + OIw2o8i16o2i = dnnl_OIw2o8i16o2i, + IOw2i8o16i4o = dnnl_IOw2i8o16i4o, + IOw2i8o16i2o = 
dnnl_IOw2i8o16i2o, + OIhw2o8i16o4i = dnnl_OIhw2o8i16o4i, + OIhw2o8i16o2i = dnnl_OIhw2o8i16o2i, + IOhw2i8o16i4o = dnnl_IOhw2i8o16i4o, + IOhw2i8o16i2o = dnnl_IOhw2i8o16i2o, + OIdhw2o8i16o4i = dnnl_OIdhw2o8i16o4i, + OIdhw2o8i16o2i = dnnl_OIdhw2o8i16o2i, + IOdhw2i8o16i4o = dnnl_IOdhw2i8o16i4o, + IOdhw2i8o16i2o = dnnl_IOdhw2i8o16i2o, + gOIw2o8i16o2i = dnnl_gOIw2o8i16o2i, + gIOw2i8o16i2o = dnnl_gIOw2i8o16i2o, + gIOhw2i8o16i2o = dnnl_gIOhw2i8o16i2o, + gIOdhw2i8o16i2o = dnnl_gIOdhw2i8o16i2o, + gOIhw2o8i16o2i = dnnl_gOIhw2o8i16o2i, + gOIdhw2o8i16o2i = dnnl_gOIdhw2o8i16o2i, + gOIw2o8i16o4i = dnnl_gOIw2o8i16o4i, + gOIhw2o8i16o4i = dnnl_gOIhw2o8i16o4i, + BA4b8a16b2a = dnnl_BA4b8a16b2a, + BA4b8a16b4a = dnnl_BA4b8a16b4a, + aCB4c8b16c2b = dnnl_aCB4c8b16c2b, + aCB4c8b16c4b = dnnl_aCB4c8b16c4b, + aCB16c2b = dnnl_aCB16c2b, + aCB16c4b = dnnl_aCB16c4b, + BA16b2a = dnnl_BA16b2a, + BA16b4a = dnnl_BA16b4a, + BA4b4a = dnnl_BA4b4a, + BA8b4a = dnnl_BA8b4a, + aBC16b16c = dnnl_aBC16b16c, + aBC16b32c = dnnl_aBC16b32c, + AB16a16b = dnnl_AB16a16b, + AB16a32b = dnnl_AB16a32b, + ABcde16a16b2a = dnnl_ABcde16a16b2a, + aBCdef16b16c2b = dnnl_aBCdef16b16c2b, + Acedb16a = dnnl_Acedb16a, + aBdfec16b = dnnl_aBdfec16b, + Odwhi16o = dnnl_Odwhi16o, + gOdwhi16o = dnnl_gOdwhi16o, + abdEC64e2c = dnnl_abdEC64e2c, + abdEC64e4c = dnnl_abdEC64e4c, + ldgOI64o2i = abdEC64e2c, + ldgOI64o4i = abdEC64e4c, + abCd4c = dnnl_abCd4c, + abCde4c = dnnl_abCde4c, + abCdef4c = dnnl_abCdef4c, + abCde32c = dnnl_abCde32c, + abCdef32c = dnnl_abCdef32c, + aCdefB16b32c2b = dnnl_aCdefB16b32c2b, + aCdefB16b32c4b = dnnl_aCdefB16b32c4b, + aCdefB16b48c2b = dnnl_aCdefB16b48c2b, + aCdefB16b48c4b = dnnl_aCdefB16b48c4b, + aCdefB16b64c2b = dnnl_aCdefB16b64c2b, + aCdefB16b64c4b = dnnl_aCdefB16b64c4b, + BcdeA16a32b2a = dnnl_BcdeA16a32b2a, + BcdeA16a32b4a = dnnl_BcdeA16a32b4a, + BcdeA16a48b2a = dnnl_BcdeA16a48b2a, + BcdeA16a48b4a = dnnl_BcdeA16a48b4a, + BcdeA16a64b2a = dnnl_BcdeA16a64b2a, + BcdeA16a64b4a = dnnl_BcdeA16a64b4a, + aCdefb32c = 
dnnl_aCdefb32c, + aCdefB32c2b = dnnl_aCdefB32c2b, + aCdefB32c4b = dnnl_aCdefB32c4b, + aCdefb48c = dnnl_aCdefb48c, + aCdefB48c2b = dnnl_aCdefB48c2b, + aCdefB48c4b = dnnl_aCdefB48c4b, + aCdefb64c = dnnl_aCdefb64c, + aCdefB64c2b = dnnl_aCdefB64c2b, + aCdefB64c4b = dnnl_aCdefB64c4b, + Bcdea32b = dnnl_Bcdea32b, + BcdeA32b2a = dnnl_BcdeA32b2a, + BcdeA32b4a = dnnl_BcdeA32b4a, + Bcdea48b = dnnl_Bcdea48b, + BcdeA48b2a = dnnl_BcdeA48b2a, + BcdeA48b4a = dnnl_BcdeA48b4a, + Bcdea64b = dnnl_Bcdea64b, + BcdeA64b2a = dnnl_BcdeA64b2a, + BcdeA64b4a = dnnl_BcdeA64b4a, + Bca32b = dnnl_Bca32b, + BcA32b2a = dnnl_BcA32b2a, + BcA32b4a = dnnl_BcA32b4a, + Bca48b = dnnl_Bca48b, + BcA48b2a = dnnl_BcA48b2a, + BcA48b4a = dnnl_BcA48b4a, + Bca64b = dnnl_Bca64b, + BcA64b2a = dnnl_BcA64b2a, + BcA64b4a = dnnl_BcA64b4a, + aCdb32c = dnnl_aCdb32c, + aCdB32c2b = dnnl_aCdB32c2b, + aCdB32c4b = dnnl_aCdB32c4b, + aCdb48c = dnnl_aCdb48c, + aCdB48c2b = dnnl_aCdB48c2b, + aCdB48c4b = dnnl_aCdB48c4b, + aCdb64c = dnnl_aCdb64c, + aCdB64c2b = dnnl_aCdB64c2b, + aCdB64c4b = dnnl_aCdB64c4b, + BcA16a16b2a = dnnl_BcA16a16b2a, + BcA16a16b4a = dnnl_BcA16a16b4a, + BcdA16a16b2a = dnnl_BcdA16a16b2a, + BcdA16a16b4a = dnnl_BcdA16a16b4a, + BcdeA16a16b2a = dnnl_BcdeA16a16b2a, + BcdeA16a16b4a = dnnl_BcdeA16a16b4a, + aCdB16b16c2b = dnnl_aCdB16b16c2b, + aCdB16b16c4b = dnnl_aCdB16b16c4b, + aCdeB16b16c2b = dnnl_aCdeB16b16c2b, + aCdeB16b16c4b = dnnl_aCdeB16b16c4b, + aCdefB16b16c2b = dnnl_aCdefB16b16c2b, + aCdefB16b16c4b = dnnl_aCdefB16b16c4b, + BcA16a32b2a = dnnl_BcA16a32b2a, + BcA16a32b4a = dnnl_BcA16a32b4a, + BcA16a48b2a = dnnl_BcA16a48b2a, + BcA16a48b4a = dnnl_BcA16a48b4a, + BcA16a64b2a = dnnl_BcA16a64b2a, + BcA16a64b4a = dnnl_BcA16a64b4a, + aCdB16b32c2b = dnnl_aCdB16b32c2b, + aCdB16b32c4b = dnnl_aCdB16b32c4b, + aCdB16b48c2b = dnnl_aCdB16b48c2b, + aCdB16b48c4b = dnnl_aCdB16b48c4b, + aCdB16b64c2b = dnnl_aCdB16b64c2b, + aCdB16b64c4b = dnnl_aCdB16b64c4b, + BcdA16a32b2a = dnnl_BcdA16a32b2a, + BcdA16a32b4a = dnnl_BcdA16a32b4a, + 
BcdA16a48b2a = dnnl_BcdA16a48b2a, + BcdA16a48b4a = dnnl_BcdA16a48b4a, + BcdA16a64b2a = dnnl_BcdA16a64b2a, + BcdA16a64b4a = dnnl_BcdA16a64b4a, + aCdeB16b32c2b = dnnl_aCdeB16b32c2b, + aCdeB16b32c4b = dnnl_aCdeB16b32c4b, + aCdeB16b48c2b = dnnl_aCdeB16b48c2b, + aCdeB16b48c4b = dnnl_aCdeB16b48c4b, + aCdeB16b64c2b = dnnl_aCdeB16b64c2b, + aCdeB16b64c4b = dnnl_aCdeB16b64c4b, + Bca16b = dnnl_Bca16b, + BcA16b2a = dnnl_BcA16b2a, + BcA16b4a = dnnl_BcA16b4a, + Bcda16b = dnnl_Bcda16b, + BcdA16b2a = dnnl_BcdA16b2a, + BcdA16b4a = dnnl_BcdA16b4a, + Bcdea16b = dnnl_Bcdea16b, + BcdeA16b2a = dnnl_BcdeA16b2a, + BcdeA16b4a = dnnl_BcdeA16b4a, + aCdb16c = dnnl_aCdb16c, + aCdB16c2b = dnnl_aCdB16c2b, + aCdB16c4b = dnnl_aCdB16c4b, + aCdeb16c = dnnl_aCdeb16c, + aCdeB16c2b = dnnl_aCdeB16c2b, + aCdeB16c4b = dnnl_aCdeB16c4b, + aCdefb16c = dnnl_aCdefb16c, + aCdefB16c2b = dnnl_aCdefB16c2b, + aCdefB16c4b = dnnl_aCdefB16c4b, + Bcda32b = dnnl_Bcda32b, + BcdA32b2a = dnnl_BcdA32b2a, + BcdA32b4a = dnnl_BcdA32b4a, + Bcda48b = dnnl_Bcda48b, + BcdA48b2a = dnnl_BcdA48b2a, + BcdA48b4a = dnnl_BcdA48b4a, + Bcda64b = dnnl_Bcda64b, + BcdA64b2a = dnnl_BcdA64b2a, + BcdA64b4a = dnnl_BcdA64b4a, + aCdeb32c = dnnl_aCdeb32c, + aCdeB32c2b = dnnl_aCdeB32c2b, + aCdeB32c4b = dnnl_aCdeB32c4b, + aCdeb48c = dnnl_aCdeb48c, + aCdeB48c2b = dnnl_aCdeB48c2b, + aCdeB48c4b = dnnl_aCdeB48c4b, + aCdeb64c = dnnl_aCdeb64c, + aCdeB64c2b = dnnl_aCdeB64c2b, + aCdeB64c4b = dnnl_aCdeB64c4b, + NChw16n32c = dnnl_NChw16n32c, + goIw4i = dnnl_goIw4i, + goIw32i = dnnl_goIw32i, + goIhw4i = dnnl_goIhw4i, + goIhw32i = dnnl_goIhw32i, + goIdhw4i = dnnl_goIdhw4i, + goIdhw32i = dnnl_goIdhw32i, + cab = dnnl_cab, + cdab = dnnl_cdab, + cdeab = dnnl_cdeab, + woi = dnnl_woi, + hwoi = dnnl_hwoi, + dhwoi = dnnl_dhwoi, + Owi24o = dnnl_Owi24o, + Ohwi24o = dnnl_Ohwi24o, + Odhwi24o = dnnl_Odhwi24o, + gOwi24o = dnnl_gOwi24o, + gOhwi24o = dnnl_gOhwi24o, + gOdhwi24o = dnnl_gOdhwi24o, + OwI24o2i = dnnl_OwI24o2i, + OhwI24o2i = dnnl_OhwI24o2i, + OdhwI24o2i = 
dnnl_OdhwI24o2i, + gOwI24o2i = dnnl_gOwI24o2i, + gOhwI24o2i = dnnl_gOhwI24o2i, + gOdhwI24o2i = dnnl_gOdhwI24o2i, + OwI24o4i = dnnl_OwI24o4i, + OhwI24o4i = dnnl_OhwI24o4i, + OdhwI24o4i = dnnl_OdhwI24o4i, + gOwI24o4i = dnnl_gOwI24o4i, + gOhwI24o4i = dnnl_gOhwI24o4i, + gOdhwI24o4i = dnnl_gOdhwI24o4i, + OI8i32o = dnnl_OI8i32o, + OIw8i32o = dnnl_OIw8i32o, + OwI8i32o = dnnl_OwI8i32o, + OIhw8i32o = dnnl_OIhw8i32o, + OhwI8i32o = dnnl_OhwI8i32o, + OIdhw8i32o = dnnl_OIdhw8i32o, + OdhwI8i32o = dnnl_OdhwI8i32o, + OI8i24o = dnnl_OI8i24o, + OIw8i24o = dnnl_OIw8i24o, + OwI8i24o = dnnl_OwI8i24o, + OIhw8i24o = dnnl_OIhw8i24o, + OhwI8i24o = dnnl_OhwI8i24o, + OIdhw8i24o = dnnl_OIdhw8i24o, + OdhwI8i24o = dnnl_OdhwI8i24o, + OI8i16o = dnnl_OI8i16o, + OIw8i16o = dnnl_OIw8i16o, + OwI8i16o = dnnl_OwI8i16o, + OIhw8i16o = dnnl_OIhw8i16o, + OhwI8i16o = dnnl_OhwI8i16o, + OIdhw8i16o = dnnl_OIdhw8i16o, + OdhwI8i16o = dnnl_OdhwI8i16o, + OI8i8o = dnnl_OI8i8o, + AB4b8a4b = dnnl_AB4b8a4b, + AB4b24a4b = dnnl_AB4b24a4b, + ABc4b8a4b = dnnl_ABc4b8a4b, + AcB4b8a4b = dnnl_AcB4b8a4b, + ABc4b24a4b = dnnl_ABc4b24a4b, + AcB4b24a4b = dnnl_AcB4b24a4b, + ABcd4b8a4b = dnnl_ABcd4b8a4b, + AcdB4b8a4b = dnnl_AcdB4b8a4b, + ABcd4b24a4b = dnnl_ABcd4b24a4b, + AcdB4b24a4b = dnnl_AcdB4b24a4b, + ABcde4b8a4b = dnnl_ABcde4b8a4b, + AcdeB4b8a4b = dnnl_AcdeB4b8a4b, + ABcde4b24a4b = dnnl_ABcde4b24a4b, + AcdeB4b24a4b = dnnl_AcdeB4b24a4b, + Bca8b = dnnl_Bca8b, + BcA8b2a = dnnl_BcA8b2a, + Bcda8b = dnnl_Bcda8b, + BcdA8b2a = dnnl_BcdA8b2a, + Bcdea8b = dnnl_Bcdea8b, + BcdeA8b2a = dnnl_BcdeA8b2a, + aCdb8c = dnnl_aCdb8c, + aCdB8c2b = dnnl_aCdB8c2b, + aCdeb8c = dnnl_aCdeb8c, + aCdeB8c2b = dnnl_aCdeB8c2b, + aCdefb8c = dnnl_aCdefb8c, + aCdefB8c2b = dnnl_aCdefB8c2b, + Bca24b = dnnl_Bca24b, + BcA24b2a = dnnl_BcA24b2a, + Bcda24b = dnnl_Bcda24b, + BcdA24b2a = dnnl_BcdA24b2a, + Bcdea24b = dnnl_Bcdea24b, + BcdeA24b2a = dnnl_BcdeA24b2a, + aCdb24c = dnnl_aCdb24c, + aCdB24c2b = dnnl_aCdB24c2b, + aCdeb24c = dnnl_aCdeb24c, + aCdeB24c2b = 
dnnl_aCdeB24c2b, + aCdefb24c = dnnl_aCdefb24c, + aCdefB24c2b = dnnl_aCdefB24c2b, + Iwo8i = dnnl_Iwo8i, + IwO8i2o = dnnl_IwO8i2o, + Iwo24i = dnnl_Iwo24i, + IwO24i2o = dnnl_IwO24i2o, + Ihwo8i = dnnl_Ihwo8i, + IhwO8i2o = dnnl_IhwO8i2o, + Ihwo24i = dnnl_Ihwo24i, + IhwO24i2o = dnnl_IhwO24i2o, + Idhwo8i = dnnl_Idhwo8i, + IdhwO8i2o = dnnl_IdhwO8i2o, + Idhwo24i = dnnl_Idhwo24i, + IdhwO24i2o = dnnl_IdhwO24i2o, + gIwo8i = dnnl_gIwo8i, + gIwO8i2o = dnnl_gIwO8i2o, + gIwo24i = dnnl_gIwo24i, + gIwO24i2o = dnnl_gIwO24i2o, + gIhwo8i = dnnl_gIhwo8i, + gIhwO8i2o = dnnl_gIhwO8i2o, + gIhwo24i = dnnl_gIhwo24i, + gIhwO24i2o = dnnl_gIhwO24i2o, + gIdhwo8i = dnnl_gIdhwo8i, + gIdhwO8i2o = dnnl_gIdhwO8i2o, + gIdhwo24i = dnnl_gIdhwo24i, + gIdhwO24i2o = dnnl_gIdhwO24i2o, + OhwI24o = dnnl_OhwI24o, + gOhwI24o = dnnl_gOhwI24o, + AB8b24a2b = dnnl_AB8b24a2b, + ABc8b24a2b = dnnl_ABc8b24a2b, + AcB8b24a2b = dnnl_AcB8b24a2b, + ABcd8b24a2b = dnnl_ABcd8b24a2b, + AcdB8b24a2b = dnnl_AcdB8b24a2b, + ABcde8b24a2b = dnnl_ABcde8b24a2b, + AcdeB8b24a2b = dnnl_AcdeB8b24a2b, + AB8b8a2b = dnnl_AB8b8a2b, + ABc8b8a2b = dnnl_ABc8b8a2b, + AcB8b8a2b = dnnl_AcB8b8a2b, + ABcd8b8a2b = dnnl_ABcd8b8a2b, + AcdB8b8a2b = dnnl_AcdB8b8a2b, + ABcde8b8a2b = dnnl_ABcde8b8a2b, + AcdeB8b8a2b = dnnl_AcdeB8b8a2b, + OI8i8o2i = dnnl_OI8i8o2i, + OI8i24o2i = dnnl_OI8i24o2i, + OIw8i8o2i = dnnl_OIw8i8o2i, + OwI8i8o2i = dnnl_OwI8i8o2i, + OIw8i24o2i = dnnl_OIw8i24o2i, + OwI8i24o2i = dnnl_OwI8i24o2i, + OIhw8i8o2i = dnnl_OIhw8i8o2i, + OhwI8i8o2i = dnnl_OhwI8i8o2i, + OIhw8i24o2i = dnnl_OIhw8i24o2i, + OhwI8i24o2i = dnnl_OhwI8i24o2i, + OIdhw8i8o2i = dnnl_OIdhw8i8o2i, + OdhwI8i8o2i = dnnl_OdhwI8i8o2i, + OIdhw8i24o2i = dnnl_OIdhw8i24o2i, + OdhwI8i24o2i = dnnl_OdhwI8i24o2i, + BcA8b4a = dnnl_BcA8b4a, + BcdA8b4a = dnnl_BcdA8b4a, + BcdeA8b4a = dnnl_BcdeA8b4a, + aCdB8c4b = dnnl_aCdB8c4b, + aCdeB8c4b = dnnl_aCdeB8c4b, + aCdefB8c4b = dnnl_aCdefB8c4b, + BcA24b4a = dnnl_BcA24b4a, + BcdA24b4a = dnnl_BcdA24b4a, + BcdeA24b4a = dnnl_BcdeA24b4a, + aCdB24c4b = 
dnnl_aCdB24c4b, + aCdeB24c4b = dnnl_aCdeB24c4b, + aCdefB24c4b = dnnl_aCdefB24c4b, + ABc16a4b = dnnl_ABc16a4b, + ABcd16a4b = dnnl_ABcd16a4b, + ABcde16a4b = dnnl_ABcde16a4b, + IwO8i4o = dnnl_IwO8i4o, + IwO24i4o = dnnl_IwO24i4o, + IhwO8i4o = dnnl_IhwO8i4o, + IhwO24i4o = dnnl_IhwO24i4o, + IdhwO8i4o = dnnl_IdhwO8i4o, + IdhwO24i4o = dnnl_IdhwO24i4o, + gIwO8i4o = dnnl_gIwO8i4o, + gIwO24i4o = dnnl_gIwO24i4o, + gIhwO8i4o = dnnl_gIhwO8i4o, + gIhwO24i4o = dnnl_gIhwO24i4o, + gIdhwO8i4o = dnnl_gIdhwO8i4o, + gIdhwO24i4o = dnnl_gIdhwO24i4o, + BA2a24b = dnnl_BA2a24b, + aCB2b24c = dnnl_aCB2b24c, + BA2a8b = dnnl_BA2a8b, + aCB2b8c = dnnl_aCB2b8c, + BA8a24b = dnnl_BA8a24b, + aCB8b24c = dnnl_aCB8b24c, + BA8a16b = dnnl_BA8a16b, + aCB8b16c = dnnl_aCB8b16c, + BA8a8b = dnnl_BA8a8b, + aCB8b8c = dnnl_aCB8b8c, + bcad = dnnl_bcad, + cabd = dnnl_cabd, + dabc = dnnl_dabc, + }; + + /// A memory descriptor. + struct desc : public handle { + using handle::handle; + + friend struct memory; + + /// Constructs a zero (empty) memory descriptor. Such a memory + /// descriptor can be used to indicate absence of an argument. + desc() { + dnnl_memory_desc_t zero_md = nullptr; + error::wrap_c_api( + dnnl_memory_desc_create_with_tag(&zero_md, 0, nullptr, + dnnl_data_type_undef, dnnl_format_tag_undef), + "could not create a zero memory descriptor"); + reset(zero_md); + } + + /// Constructs a memory descriptor. + /// + /// @note + /// The logical order of dimensions corresponds to the `abc...` + /// format tag, and the physical meaning of the dimensions depends + /// both on the primitive that would operate on this memory and + /// the operation context. + /// + /// @param adims Tensor dimensions. + /// @param adata_type Data precision/type. + /// @param aformat_tag Memory format tag. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be constructed. 
This flag is + /// optional and defaults to false. + desc(const dims &adims, data_type adata_type, format_tag aformat_tag, + bool allow_empty = false) { + validate_dims(adims); + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_with_tag(&md, + (int)adims.size(), adims.data(), convert_to_c(adata_type), + convert_to_c(aformat_tag)); + // The C API status is checked only when failure is not tolerated. + if (!allow_empty) + error::wrap_c_api(status, + "could not construct a memory descriptor using a " + "format tag"); + reset(md); + } + + /// Constructs a memory descriptor by strides. + /// + /// @note + /// The logical order of dimensions corresponds to the `abc...` + /// format tag, and the physical meaning of the dimensions depends + /// both on the primitive that would operate on this memory and + /// the operation context. + /// + /// @param adims Tensor dimensions. + /// @param adata_type Data precision/type. + /// @param strides Strides for each dimension. An empty vector is + /// forwarded to the C API as a null pointer. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be constructed. This flag is + /// optional and defaults to false. + desc(const dims &adims, data_type adata_type, const dims &strides, + bool allow_empty = false) { + validate_dims(adims); + // Strides are validated against the number of dimensions only + // when they are provided. + if (!strides.empty()) validate_dims(strides, (int)adims.size()); + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_with_strides(&md, + (int)adims.size(), adims.data(), convert_to_c(adata_type), + strides.empty() ? nullptr : &strides[0]); + // The C API status is checked only when failure is not tolerated. + if (!allow_empty) + error::wrap_c_api(status, + "could not construct a memory descriptor using " + "strides"); + reset(md); + } +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Function for creating a memory descriptor for CSR sparse encoding. + /// + /// The created memory descriptor will describe a memory object that + /// contains 3 buffers.
The buffers have the following meaning and + /// assigned numbers (index): + /// - 0: values + /// - 1: indices + /// - 2: pointers + /// + /// @param adims Tensor dimensions. + /// @param adata_type Data precision/type. + /// @param nnz Number of non-zero entries. + /// @param index_dt Data type of indices. + /// @param pointer_dt Data type of pointers. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be constructed. This flag is + /// optional and defaults to false. + /// @returns A memory descriptor constructed from the C API handle. + static desc csr(const dims &adims, data_type adata_type, dim nnz, + data_type index_dt, data_type pointer_dt, + bool allow_empty = false) { + validate_dims(adims); + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_with_csr_encoding( + &md, (int)adims.size(), adims.data(), + convert_to_c(adata_type), nnz, convert_to_c(index_dt), + convert_to_c(pointer_dt)); + // The C API status is checked only when failure is not tolerated. + if (!allow_empty) + error::wrap_c_api(status, + "could not create a memory descriptor for CSR sparse " + "encoding"); + return desc {md}; + } + + /// Function for creating a memory descriptor for COO sparse encoding. + /// + /// The created memory descriptor will describe a memory object that + /// contains n+1 buffers for an n-dimensional tensor. + /// The buffers have the following meaning and assigned numbers (index): + /// - 0: values + /// - 1: indices for dimension 0 + /// - 2: indices for dimension 1 ... + /// - n: indices for dimension n-1 + /// + /// @param adims Tensor dimensions. + /// @param adata_type Data precision/type. + /// @param nnz Number of non-zero entries. + /// @param index_dt Data type of indices. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be constructed. This flag is + /// optional and defaults to false.
+ static desc coo(const dims &adims, data_type adata_type, dim nnz, + data_type index_dt, bool allow_empty = false) { + validate_dims(adims); + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_with_coo_encoding( + &md, (int)adims.size(), adims.data(), + convert_to_c(adata_type), nnz, convert_to_c(index_dt)); + // The C API status is checked only when failure is not tolerated. + if (!allow_empty) + error::wrap_c_api(status, + "could not create a memory descriptor for COO sparse " + "encoding"); + return desc {md}; + } + + /// Function for creating a memory descriptor for packed sparse + /// encoding. + /// + /// The created memory descriptor cannot be used to create a memory + /// object. It can only be used to create a primitive descriptor to + /// query the actual memory descriptor (similar to the format tag + /// `any`). + /// + /// @warning + /// The meaning and content of the handles of the memory object that + /// is created using the queried memory descriptor are unspecified; + /// therefore, using the content is undefined behavior. + /// + /// @param adims Tensor dimensions. + /// @param adata_type Data precision/type. + /// @param nnz Number of non-zero entries. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be constructed. This flag is + /// optional and defaults to false. + /// @returns A memory descriptor constructed from the C API handle. + static desc packed(const dims &adims, data_type adata_type, dim nnz, + bool allow_empty = false) { + validate_dims(adims); + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_with_packed_encoding( + &md, (int)adims.size(), adims.data(), + convert_to_c(adata_type), nnz); + // The C API status is checked only when failure is not tolerated. + if (!allow_empty) + error::wrap_c_api(status, + "could not create a memory descriptor for packed " + "sparse encoding"); + return desc {md}; + } +#endif + /// Construct a memory descriptor from a C API ::dnnl_memory_desc_t + /// handle.
The resulting handle is not weak and the C handle will be + /// destroyed during the destruction of the C++ object. + /// + /// @param md The C API memory descriptor. + desc(dnnl_memory_desc_t md) : handle(md) {} + + /// Construct a memory descriptor from a binary blob. + /// + /// @param blob A binary blob previously queried from a memory descriptor. + desc(const std::vector &blob) { + dnnl_memory_desc_t md = nullptr; + error::wrap_c_api( + dnnl_memory_desc_create_with_blob(&md, blob.data()), + "could not create a memory descriptor from blob"); + reset(md); + } + + /// Constructs a memory descriptor for a region inside an area + /// described by this memory descriptor. + // + /// @param adims Sizes of the region. + /// @param offsets Offsets to the region from the encompassing + /// memory object in each dimension. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be returned. This flag is optional + /// and defaults to false. + /// @returns A memory descriptor for the region. + desc submemory_desc(const dims &adims, const dims &offsets, + bool allow_empty = false) const { + validate_dims(adims, get_ndims()); + validate_dims(offsets, get_ndims()); + dnnl_memory_desc_t sub_md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_submemory( + &sub_md, get(), adims.data(), offsets.data()); + if (!allow_empty) + error::wrap_c_api(status, "could not construct a sub-memory"); + return desc(sub_md); + } + + /// Constructs a memory descriptor by reshaping an existing one. The + /// new memory descriptor inherits the data type. This operation is + /// valid only for memory descriptors that have format_kind set to + /// #dnnl::memory::format_kind::blocked or + /// #dnnl::memory::format_kind::any. + /// + /// The operation ensures that the transformation of the physical memory + /// format corresponds to the transformation of the logical dimensions. 
+ /// If such a transformation is impossible, the function either throws an + /// exception (default) or returns a zero memory descriptor depending on + /// the `allow_empty` flag. + /// + /// The reshape operation can be described as a combination of the + /// following basic operations: + /// 1. Add a dimension of size `1`. This is always possible. + /// 2. Remove a dimension of size `1`. This is possible only if the + /// dimension has no padding (i.e. + /// `padded_dims[dim] == dims[dim] && dims[dim] == 1`). + /// 3. Split a dimension into multiple ones. This is possible only if + /// the product of all tensor dimensions stays constant and the + /// dimension being split does not have padding (i.e. + /// `padded_dims[dim] == dims[dim]`). + /// 4. Join multiple consecutive dimensions into a single one. As in + /// the cases above, this requires that the dimensions do not have + /// padding and that the memory format is such that in physical + /// memory these dimensions are dense and have the same order as + /// their logical counterparts. This also assumes that these + /// dimensions are not blocked. + /// - Here, 'dense' means: + /// `stride for dim[i] == (stride for dim[i + 1]) * dim[i + 1]`; + /// - And 'same order' means: + /// `i < j` if and only if `stride for dim[j] <= stride for dim[i]`. + /// + /// @warning + /// Some combinations of physical memory layout and/or offsets or + /// dimensions may result in a failure to reshape. + /// + /// @param adims New dimensions. The product of dimensions must + /// remain constant. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be returned. This flag is optional + /// and defaults to false. + /// @returns A new memory descriptor with new dimensions.
+ desc reshape(const dims &adims, bool allow_empty = false) const { + // adims is validated only when this descriptor reports a non-zero + // number of dimensions. + if (get_ndims()) validate_dims(adims, 1); + dnnl_memory_desc_t out_md = nullptr; + dnnl_status_t status = dnnl_memory_desc_reshape( + &out_md, get(), (int)adims.size(), adims.data()); + // The C API status is checked only when failure is not tolerated. + if (!allow_empty) + error::wrap_c_api( + status, "could not reshape a memory descriptor"); + return desc(out_md); + } + + /// Constructs a memory descriptor by permuting axes in an existing + /// one. + /// + /// The physical memory layout representation is adjusted accordingly + /// to maintain the consistency between the logical and physical parts + /// of the memory descriptor. The new memory descriptor inherits the + /// data type. + /// + /// This operation is valid only for memory descriptors that have + /// format_kind set to #dnnl::memory::format_kind::blocked or + /// #dnnl::memory::format_kind::any. + /// + /// The logical axes will be permuted in the following manner: + /// @code + /// for (i = 0; i < get_ndims(); i++) + /// new_desc.dims()[permutation[i]] = dims()[i]; + /// @endcode + /// + /// Example: + /// @code + /// std::vector<int> permutation = {1, 0}; // swap the first and + /// // the second axes + /// dnnl::memory::desc in_md( + /// {2, 3}, data_type, memory::format_tag::ab); + /// dnnl::memory::desc expect_out_md( + /// {3, 2}, data_type, memory::format_tag::ba); + /// + /// assert(in_md.permute_axes(permutation) == expect_out_md); + /// @endcode + /// + /// @param permutation Axes permutation. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be returned. This flag is optional + /// and defaults to false. + /// @returns A new memory descriptor with permuted dimensions.
+ desc permute_axes(const std::vector &permutation, + bool allow_empty = false) const { + validate_dims(permutation, get_ndims()); + dnnl_memory_desc_t out_md = nullptr; + dnnl_status_t status = dnnl_memory_desc_permute_axes( + &out_md, get(), permutation.data()); + if (!allow_empty) + error::wrap_c_api(status, + "could not permute axes of a memory descriptor"); + return desc(out_md); + } + + /// Returns a number of dimensions of the memory descriptor. + /// + /// @returns A number of dimensions. + int get_ndims() const { return query_s32(query::ndims_s32); } + + /// Returns padded dimensions of the memory descriptor. + /// + /// @returns A copy of the padded dimensions vector. + memory::dims get_padded_dims() const { + return query_dims(query::padded_dims); + } + + /// Returns padded offsets of the memory descriptor. + /// + /// @returns A copy of the padded offsets vector. + memory::dims get_padded_offsets() const { + return query_dims(query::padded_offsets); + } + + /// Returns a submemory offset of the memory descriptor. + /// + /// @returns A submemory offset. + memory::dim get_submemory_offset() const { + dnnl_dim_t submemory_offset; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl_query_submemory_offset_s64, &submemory_offset); + return status == dnnl_success ? submemory_offset : 0; + } + + /// Returns strides of the memory descriptor. + /// + /// @note + /// This API is only applicable to memory descriptors with format + /// kind #dnnl_blocked. + /// + /// @returns A copy of the strides vector. + /// @returns An empty #dnnl::memory::dims if the memory descriptor + /// does not have strides. + memory::dims get_strides() const { return query_dims(query::strides); } + + /// Returns a number of inner blocks of the memory descriptor. + /// + /// @note + /// This API is only applicable to memory descriptors with format + /// kind #dnnl_blocked. + /// + /// @returns A number of inner blocks. 
+ int get_inner_nblks() const { + return query_s32(query::inner_nblks_s32); + } + + /// Returns inner blocks of the memory descriptor. + /// + /// @note + /// This API is only applicable to memory descriptors with format + /// kind #dnnl_blocked. + /// + /// @returns A copy of the inner blocks vector. + /// @returns An empty #dnnl::memory::dims if the memory descriptor + /// does not have inner blocks. + memory::dims get_inner_blks() const { + return query_dims(query::inner_blks); + } + + /// Returns inner indices of the memory descriptor. + /// + /// @note + /// This API is only applicable to memory descriptors with format + /// kind #dnnl_blocked. + /// + /// @returns A copy of the inner indices vector. + /// @returns An empty #dnnl::memory::dims if the memory descriptor + /// does not have inner indices. + memory::dims get_inner_idxs() const { + return query_dims(query::inner_idxs); + } + +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Returns number of handles. + /// + /// @returns A number of handles. + int get_num_handles() const { + int nhandles; + dnnl_status_t status = dnnl_memory_desc_query_v2( + get(), dnnl_query_num_handles_s32, 0, &nhandles); + return status == dnnl_success ? nhandles : 0; + } + + /// Returns a number of non-zero entries of the memory descriptor. + /// + /// @returns A number non-zero entries. + dim get_nnz() const { + dnnl_dim_t nnz; + dnnl_status_t status = dnnl_memory_desc_query_v2( + get(), dnnl_query_nnz_s64, 0, &nnz); + return status == dnnl_success ? nnz : 0; + } + + /// Returns the sparse encoding of the memory descriptor. + /// + /// @returns the sparse encoding kind. + memory::sparse_encoding get_sparse_encoding() const { + dnnl_sparse_encoding_t sparse_encoding; + dnnl_status_t status = dnnl_memory_desc_query_v2( + get(), dnnl_query_sparse_encoding, 0, &sparse_encoding); + return status == dnnl_success + ? 
static_cast( + sparse_encoding) + : dnnl::memory::sparse_encoding::undef; + } + + /// Returns the data type of the memory descriptor. + /// + /// @returns The data type. + memory::data_type get_data_type(int index = 0) const { + return query_data_type(query::data_type, index); + } +#else + /// Returns the data type of the memory descriptor. + /// + /// @returns The data type. + memory::data_type get_data_type() const { + return query_data_type(query::data_type); + } +#endif + + /// Returns the format kind of the memory descriptor. + /// + /// @returns the format kind. + memory::format_kind get_format_kind() const { + dnnl_format_kind_t format_kind; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl_query_format_kind, &format_kind); + return status == dnnl_success + ? static_cast(format_kind) + : dnnl::memory::format_kind::undef; + } + + /// Returns dimensions of the memory descriptor. + /// + /// Potentially expensive due to the data copy involved. + /// @returns A copy of the dimensions vector. + memory::dims get_dims() const { return query_dims(query::dims); } + +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Returns size of the memory descriptor in bytes. + /// @param index Data index. Defaults to 0. + /// @returns The number of bytes required to allocate a memory buffer + /// for data with a particular @p index described by this memory + /// descriptor including the padding area. + size_t get_size(int index = 0) const { + return dnnl_memory_desc_get_size_v2(get(), index); + } +#else + /// Returns size of the memory descriptor in bytes. + /// @returns The number of bytes required to allocate a memory buffer + /// for the memory object described by this memory descriptor + /// including the padding area. 
+ size_t get_size() const { return dnnl_memory_desc_get_size(get()); } +#endif + + /// Returns a binary blob associated with the given memory descriptor + /// @returns The memory descriptor blob associated with the memory descriptor + std::vector get_blob() { + size_t size; + dnnl_status_t status + = dnnl_memory_desc_get_blob(nullptr, &size, get()); + error::wrap_c_api( + status, "could not get memory descriptor blob size"); + + std::vector out_blob(size); + status = dnnl_memory_desc_get_blob(out_blob.data(), &size, get()); + error::wrap_c_api(status, "could not get memory descriptor blob"); + return out_blob; + } + + /// Checks whether the memory descriptor is zero (empty). + /// @returns @c true if the memory descriptor describes an empty + /// memory and @c false otherwise. + bool is_zero() const { return get_ndims() == 0; } + + /// An equality operator. + /// @param other Another memory descriptor. + /// @returns Whether this and the other memory descriptors have + /// the same format tag, dimensions, strides, blocking, etc. + bool operator==(const desc &other) const { + return dnnl_memory_desc_equal(get(), other.get()) != 0; + } + + /// An inequality operator. + /// @param other Another memory descriptor. + /// @returns Whether this and the other memory descriptors describe + /// different memory. + bool operator!=(const desc &other) const { return !operator==(other); } + + private: +#ifdef DNNL_EXPERIMENTAL_SPARSE + memory::data_type query_data_type(query what, int index) const { + dnnl_data_type_t data_type; + dnnl_status_t status = dnnl_memory_desc_query_v2( + get(), dnnl::convert_to_c(what), index, &data_type); + return status == dnnl_success + ? static_cast(data_type) + : dnnl::memory::data_type::undef; + } +#else + memory::data_type query_data_type(query what) const { + dnnl_data_type_t data_type; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl::convert_to_c(what), &data_type); + return status == dnnl_success + ? 
static_cast(data_type) + : dnnl::memory::data_type::undef; + } +#endif + + int query_s32(query what) const { + int res; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl::convert_to_c(what), &res); + return status == dnnl_success ? res : 0; + } + + memory::dims query_dims(query what) const { + dnnl_dims_t *c_dims; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl::convert_to_c(what), &c_dims); + + const int ndims + = (what == query::inner_idxs || what == query::inner_blks) + ? get_inner_nblks() + : get_ndims(); + + return status == dnnl_success + ? memory::dims(*c_dims, *c_dims + ndims) + : memory::dims {}; + } + }; + + /// Default constructor. + /// + /// Constructs an empty memory object, which can be used to indicate + /// absence of a parameter. + memory() = default; + +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Constructs a memory object. + /// + /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory + /// object will have the underlying buffer set. In this case, the buffer + /// will be initialized as if #dnnl::memory::set_data_handle() had been + /// called. + /// + /// @sa memory::set_data_handle() + /// + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + /// @param handle Handle of the memory buffer to use. + /// - A pointer to the user-allocated buffer. In this case the library + /// doesn't own the buffer. + /// - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to + /// allocate the buffer for the memory object. In this case the + /// library owns the buffer. + /// - #DNNL_MEMORY_NONE to create dnnl::memory without an underlying + /// buffer. + memory(const desc &md, const engine &aengine, void *handle) + : memory(md, aengine, std::vector {handle}) {} + + /// Constructs a memory object with multiple handles. + /// + /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory + /// object will have the underlying buffer set. 
In this case, the buffer + /// will be initialized as if #dnnl::memory::set_data_handle() had been + /// called. + /// + /// @sa memory::set_data_handle() + /// + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + /// @param handles Handles of the memory buffers to use. + /// For each element of the @p handles vector the following applies: + /// - A pointer to the user-allocated buffer. In this case the library + /// doesn't own the buffer. + /// - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to + /// allocate the buffer for the memory object. In this case the + /// library owns the buffer. + /// - #DNNL_MEMORY_NONE Instructs the library to skip allocation of the + /// memory buffer. + memory(const desc &md, const engine &aengine, std::vector handles) { + dnnl_memory_t result; + dnnl_status_t status = dnnl_memory_create_v2(&result, md.get(), + aengine.get(), (int)handles.size(), handles.data()); + error::wrap_c_api(status, "could not create a memory object"); + reset(result); + } + + /// Constructs a memory object. + /// + /// The underlying buffer(s) for the memory will be allocated by the + /// library. + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + memory(const desc &md, const engine &aengine) { + dnnl_status_t status; + dnnl_memory_t result; + const int nhandles = md.get_num_handles(); + + std::vector handles(nhandles, DNNL_MEMORY_ALLOCATE); + status = dnnl_memory_create_v2(&result, md.get(), aengine.get(), + (int)handles.size(), handles.data()); + + error::wrap_c_api(status, "could not create a memory object"); + reset(result); + } +#else + /// Constructs a memory object. + /// + /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory + /// object will have the underlying buffer set. In this case, the buffer + /// will be initialized as if #dnnl::memory::set_data_handle() had been + /// called. 
+ /// + /// @sa memory::set_data_handle() + /// + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + /// @param handle Handle of the memory buffer to use. + /// - A pointer to the user-allocated buffer. In this case the library + /// doesn't own the buffer. + /// - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to + /// allocate the buffer for the memory object. In this case the + /// library owns the buffer. + /// - #DNNL_MEMORY_NONE to create dnnl::memory without an underlying + /// buffer. + memory(const desc &md, const engine &aengine, void *handle) { + dnnl_memory_t result; + error::wrap_c_api( + dnnl_memory_create(&result, md.get(), aengine.get(), handle), + "could not create a memory object"); + reset(result); + } + + /// Constructs a memory object. + /// + /// The underlying buffer for the memory will be allocated by the library. + /// + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + memory(const desc &md, const engine &aengine) + : memory(md, aengine, DNNL_MEMORY_ALLOCATE) {} +#endif + + /// Returns the associated memory descriptor. + desc get_desc() const { + const_dnnl_memory_desc_t cdesc; + error::wrap_c_api(dnnl_memory_get_memory_desc(get(), &cdesc), + "could not get a memory descriptor from a memory object"); + dnnl_memory_desc_t cloned_md = nullptr; + error::wrap_c_api(dnnl_memory_desc_clone(&cloned_md, cdesc), + "could not clone a memory descriptor"); + return desc(cloned_md); + } + + /// Returns the associated engine. + engine get_engine() const { + dnnl_engine_t c_engine; + error::wrap_c_api(dnnl_memory_get_engine(get(), &c_engine), + "could not get an engine from a memory object"); + return engine(c_engine, true); + } + +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Returns an underlying memory buffer that corresponds to the given index. + /// + /// On the CPU engine, or when using USM, this is a pointer to the + /// allocated memory. 
+ void *get_data_handle(int index = 0) const { + void *handle; + error::wrap_c_api(dnnl_memory_get_data_handle_v2(get(), &handle, index), + "could not get a native handle from a memory object"); + return handle; + } + + /// Sets an underlying memory buffer that corresponds to the given index. + /// + /// @param handle Memory buffer to use. On the CPU engine or when USM is + /// used, the memory buffer is a pointer to the actual data. For OpenCL + /// it is a cl_mem. It must have at least + /// #dnnl::memory::desc::get_size() bytes allocated. + /// @param index Memory index to attach the buffer. Defaults to 0. + void set_data_handle(void *handle, int index = 0) const { + error::wrap_c_api(dnnl_memory_set_data_handle_v2(get(), handle, index), + "could not set native handle of a memory object"); + } + + /// Maps a memory object and returns a host-side pointer to a memory + /// buffer with a copy of its contents. The memory buffer corresponds to + /// the given index. + /// + /// Mapping enables read/write directly from/to the memory contents for + /// engines that do not support direct memory access. + /// + /// Mapping is an exclusive operation - a memory object cannot be used in + /// other operations until it is unmapped via #dnnl::memory::unmap_data() + /// call. + /// + /// @note + /// Any primitives working with the memory should be completed before + /// the memory is mapped. Use #dnnl::stream::wait() to synchronize the + /// corresponding execution stream. + /// + /// @note + /// The map_data and unmap_data functions are provided mainly for + /// debug and testing purposes and their performance may be suboptimal. + /// + /// @tparam T Data type to return a pointer to. + /// @param index Index of the buffer. Defaults to 0. + /// @returns Pointer to the mapped memory. 
+ template + T *map_data(int index = 0) const { + void *mapped_ptr; + error::wrap_c_api(dnnl_memory_map_data_v2(get(), &mapped_ptr, index), + "could not map memory object data"); + return static_cast(mapped_ptr); + } + + /// Unmaps a memory object and writes back any changes made to the + /// previously mapped memory buffer. The memory buffer corresponds to + /// the given index. + /// + /// @note + /// The map_data and unmap_data functions are provided mainly for + /// debug and testing purposes and their performance may be + /// suboptimal. + /// + /// @param mapped_ptr A pointer previously returned by + /// #dnnl::memory::map_data(). + /// @param index Index of the buffer. Defaults to 0. + void unmap_data(void *mapped_ptr, int index = 0) const { + error::wrap_c_api(dnnl_memory_unmap_data_v2(get(), mapped_ptr, index), + "could not unmap memory object data"); + } +#else + /// Returns the underlying memory buffer. + /// + /// On the CPU engine, or when using USM, this is a pointer to the + /// allocated memory. + void *get_data_handle() const { + void *handle; + error::wrap_c_api(dnnl_memory_get_data_handle(get(), &handle), + "could not get a native handle from a memory object"); + return handle; + } + + /// Sets the underlying memory buffer. + /// + /// @param handle Memory buffer to use. On the CPU engine or when USM is + /// used, the memory buffer is a pointer to the actual data. For OpenCL + /// it is a cl_mem. It must have at least + /// #dnnl::memory::desc::get_size() bytes allocated. + void set_data_handle(void *handle) const { + error::wrap_c_api(dnnl_memory_set_data_handle(get(), handle), + "could not set native handle of a memory object"); + } + + /// Maps a memory object and returns a host-side pointer to a memory + /// buffer with a copy of its contents. + /// + /// Mapping enables read/write directly from/to the memory contents for + /// engines that do not support direct memory access. 
+ /// + /// Mapping is an exclusive operation - a memory object cannot be used in + /// other operations until it is unmapped via #dnnl::memory::unmap_data() + /// call. + /// + /// @note + /// Any primitives working with the memory should be completed before + /// the memory is mapped. Use #dnnl::stream::wait() to synchronize the + /// corresponding execution stream. + /// + /// @note + /// The map_data and unmap_data functions are provided mainly for + /// debug and testing purposes and their performance may be suboptimal. + /// + /// @tparam T Data type to return a pointer to. + /// @returns Pointer to the mapped memory. + template + T *map_data() const { + void *mapped_ptr; + error::wrap_c_api(dnnl_memory_map_data(get(), &mapped_ptr), + "could not map memory object data"); + return static_cast(mapped_ptr); + } + + /// Unmaps a memory object and writes back any changes made to the + /// previously mapped memory buffer. + /// + /// @note + /// The map_data and unmap_data functions are provided mainly for + /// debug and testing purposes and their performance may be + /// suboptimal. + /// + /// @param mapped_ptr A pointer previously returned by + /// #dnnl::memory::map_data(). 
+ void unmap_data(void *mapped_ptr) const { + error::wrap_c_api(dnnl_memory_unmap_data(get(), mapped_ptr), + "could not unmap memory object data"); + } +#endif + + static dnnl_data_type_t convert_to_c(data_type adata_type) { + return static_cast(adata_type); + } + static dnnl_format_tag_t convert_to_c(format_tag format) { + return static_cast(format); + } +}; + +inline bool operator==(dnnl_data_type_t a, memory::data_type b) { + return a == memory::convert_to_c(b); +} +inline bool operator!=(dnnl_data_type_t a, memory::data_type b) { + return !(a == b); +} +inline bool operator==(memory::data_type a, dnnl_data_type_t b) { + return b == a; +} +inline bool operator!=(memory::data_type a, dnnl_data_type_t b) { + return !(a == b); +} + +inline bool operator==(dnnl_format_tag_t a, memory::format_tag b) { + return a == memory::convert_to_c(b); +} +inline bool operator!=(dnnl_format_tag_t a, memory::format_tag b) { + return !(a == b); +} +inline bool operator==(memory::format_tag a, dnnl_format_tag_t b) { + return b == a; +} +inline bool operator!=(memory::format_tag a, dnnl_format_tag_t b) { + return !(a == b); +} + +/// @} dnnl_api_memory + +/// @addtogroup dnnl_api_primitives +/// @{ +/// @addtogroup dnnl_api_attributes Attributes +/// +/// A container for parameters that extend primitives behavior. +/// +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_post_ops_t p) { + return dnnl_post_ops_destroy(p); + } +}; +/// @endcond + +/// Post-ops. +/// +/// Post-ops are computations executed after the main primitive computations +/// and are attached to the primitive via primitive attributes. +/// +/// @sa @ref dev_guide_attributes_post_ops +/// +struct post_ops : public handle { + using handle::handle; + + /// Constructs an empty sequence of post-ops. 
+ post_ops() { + dnnl_post_ops_t result; + error::wrap_c_api( + dnnl_post_ops_create(&result), "could not create post-ops"); + reset(result); + } + + /// Creates post-ops primitive attribute from a C API ::dnnl_post_ops_t + /// handle. The resulting handle is not weak and the C handle will be + /// destroyed during the destruction of the C++ object. + /// + /// @param post_ops The C API post-ops primitive attribute. + post_ops(dnnl_post_ops_t post_ops) : handle(post_ops) {} + + /// Returns the number of post-ops entries. + int len() const { return dnnl_post_ops_len(get()); } + + /// Returns the primitive kind of post-op at entry with a certain index. + /// + /// @note Only the upper bound of @p index is checked against len() + /// here; a negative index is passed through to the C API unchanged. + /// + /// @param index Index of the post-op to return the kind for. + /// @returns Primitive kind of the post-op at the specified index. + primitive::kind kind(int index) const { + error::wrap_c_api(index < len() ? dnnl_success : dnnl_invalid_arguments, + "post-ops index is out of range"); + return static_cast( + dnnl_post_ops_get_kind(get(), index)); + } + + /// Appends an accumulation (sum) post-op. Prior to accumulating the + /// result, the previous value will be reduced by zero point + /// @p zero_point and multiplied by a scaling factor @p scale. + /// + /// The kind of this post-op is #dnnl::primitive::kind::sum. + /// + /// This feature may improve performance for cases such as dequantizing the + /// asymmetrically quantized sum's src1 tensor to the f32 domain before + /// performing the sum operation by subtracting @p zero_point before the + /// scaling. + /// + /// In the simplest case when the accumulation is the only post-op, + /// the computations will be `dst[:] := scale * (dst[:] - zero_point) + + /// op(...)` instead of `dst[:] := op(...)`. + /// + /// If @p data_type is specified, the original dst tensor will be + /// reinterpreted as a tensor with the provided data type. Because it is a + /// reinterpretation, data_type and dst data type should have the same size.
+ /// As a result, computations will be `dst[:] <- scale * + /// (as_data_type(dst[:]) - zero_point) + op(...)` instead of + /// `dst[:] <- op(...)`. + /// + /// @note + /// This post-op executes in-place and does not change the + /// destination layout. + /// + /// @param scale Scaling factor. + /// @param zero_point Zero point. + /// @param data_type Data type. + void append_sum(float scale = 1.f, int32_t zero_point = 0, + memory::data_type data_type = memory::data_type::undef) { + error::wrap_c_api(dnnl_post_ops_append_sum(get(), scale, zero_point, + memory::convert_to_c(data_type)), + "could not append a sum post-op"); + } + + /// Returns the parameters of an accumulation (sum) post-op. + /// + /// @param index Index of the sum post-op. + /// @param scale Scaling factor of the sum post-op. + void get_params_sum(int index, float &scale) const { + error::wrap_c_api(dnnl_post_ops_get_params_sum( + get(), index, &scale, nullptr, nullptr), + "could not get parameters of a sum post-op"); + } + + /// Returns the parameters of an accumulation (sum) post-op. + /// + /// @param index Index of the sum post-op. + /// @param scale Scaling factor of the sum post-op. + /// @param data_type Data type of the sum post-op. + void get_params_sum( + int index, float &scale, memory::data_type &data_type) const { + dnnl_data_type_t c_data_type; + error::wrap_c_api(dnnl_post_ops_get_params_sum( + get(), index, &scale, nullptr, &c_data_type), + "could not get parameters of a sum post-op"); + data_type = static_cast(c_data_type); + } + + /// Returns the parameters of an accumulation (sum) post-op. + /// + /// @param index Index of the sum post-op. + /// @param scale Scaling factor of the sum post-op. + /// @param zero_point Single scalar int32_t value of zeropoint. + /// @param data_type Data type of the sum post-op. 
+ void get_params_sum(int index, float &scale, int32_t &zero_point, + memory::data_type &data_type) const { + dnnl_data_type_t c_data_type; + error::wrap_c_api(dnnl_post_ops_get_params_sum(get(), index, &scale, + &zero_point, &c_data_type), + "could not get parameters of a sum post-op"); + data_type = static_cast(c_data_type); + } + + /// Appends an elementwise post-op. + /// + /// The kind of this post-op is #dnnl::primitive::kind::eltwise. + /// + /// In the simplest case when the elementwise is the only post-op, the + /// computations would be `dst[:] := eltwise_op (op(...))` instead + /// of `dst[:] <- op(...)`, where eltwise_op is configured with the given + /// parameters. + /// + /// @param aalgorithm Elementwise algorithm. + /// @param alpha Alpha parameter for the elementwise algorithm. + /// @param beta Beta parameter for the elementwise algorithm. + void append_eltwise(algorithm aalgorithm, float alpha, float beta) { + error::wrap_c_api(dnnl_post_ops_append_eltwise( + get(), convert_to_c(aalgorithm), alpha, beta), + "could not append an elementwise post-op"); + } + + /// Returns parameters of an elementwise post-op. + /// + /// @param index Index of the post-op. + /// @param aalgorithm Output elementwise algorithm kind. + /// @param alpha Output alpha parameter for the elementwise algorithm. + /// @param beta Output beta parameter for the elementwise algorithm. + void get_params_eltwise( + int index, algorithm &aalgorithm, float &alpha, float &beta) const { + dnnl_alg_kind_t c_alg; + error::wrap_c_api(dnnl_post_ops_get_params_eltwise( + get(), index, &c_alg, &alpha, &beta), + "could not get parameters of an elementwise post-op"); + aalgorithm = static_cast(c_alg); + } + + /// Appends a depthwise post-op convolution. + /// + /// This post-op can only be fused with a 2D 1x1 convolution (convolution + /// with weights spatial dimension equal to 1 i.e., kh=kw=1). + /// + /// The kind of this post-op is #dnnl_convolution. 
+ /// + /// The number of outputs for the primitive remains the same as before. The output + /// spatial size can be derived as below: + /// + /// output_height = ceil(output_height_1x1_convolution, stride) + /// output_width = ceil(output_width_1x1_convolution, stride) + /// + /// See @ref dev_guide_attributes_post_ops_depthwise and + /// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info. + /// + /// @param weights_data_type Weights data type of depthwise post-op + /// @param bias_data_type Bias data type of depthwise post-op + /// @param dst_data_type Output data type of depthwise post-op + /// @param kernel_size Size of kernel of depthwise post-op + /// @param stride_size Size of stride of depthwise post-op + /// @param padding_l_size Size of left and top paddings of depthwise post-op + void append_dw(memory::data_type weights_data_type, + memory::data_type bias_data_type, memory::data_type dst_data_type, + memory::dim kernel_size, memory::dim stride_size, + memory::dim padding_l_size) { + + error::wrap_c_api(dnnl_post_ops_append_dw(get(), + memory::convert_to_c(weights_data_type), + memory::convert_to_c(bias_data_type), + memory::convert_to_c(dst_data_type), + kernel_size, stride_size, padding_l_size), + "could not append depthwise post-op"); + } + + /// Returns the parameters of a depthwise post-op. + /// + /// @param index Index of the depthwise post-op.
+ /// @param weights_data_type Weights data type of depthwise post-op + /// @param bias_data_type Bias data type of depthwise post-op + /// @param dst_data_type Output data type of depthwise post-op + /// @param kernel_size Size of kernel of depthwise post-op + /// @param stride_size Size of stride of depthwise post-op + /// @param padding_l_size Size of left and top paddings of depthwise post-op + void get_params_dw(int index, memory::data_type &weights_data_type, + memory::data_type &bias_data_type, memory::data_type &dst_data_type, + memory::dim &kernel_size, memory::dim &stride_size, + memory::dim &padding_l_size) const { + + dnnl_data_type_t c_weights_data_type; + dnnl_data_type_t c_bias_data_type; + dnnl_data_type_t c_dst_data_type; + dnnl_dim_t c_kernel_size; + dnnl_dim_t c_stride_size; + dnnl_dim_t c_padding_l_size; + error::wrap_c_api( + dnnl_post_ops_get_params_dw(get(), index, &c_weights_data_type, + &c_bias_data_type, &c_dst_data_type, &c_kernel_size, + &c_stride_size, &c_padding_l_size), + "could not get parameters of depthwise post-op"); + + weights_data_type = static_cast(c_weights_data_type); + bias_data_type = static_cast(c_bias_data_type); + dst_data_type = static_cast(c_dst_data_type); + kernel_size = c_kernel_size; + stride_size = c_stride_size; + padding_l_size = c_padding_l_size; + } + + /// Appends a binary post-op. + /// + /// The kind of this post operation is #dnnl_binary. + /// + /// In the simplest case when the binary is the only post operation, the + /// computations would be: + /// + /// dst[:] <- binary_op (dst[:], another_input[:]) + /// + /// where binary_op is configured with the given parameters. binary_op + /// supports broadcast semantics for a second operand. + /// + /// @param aalgorithm Binary algorithm for the post-op. + /// @param src1_desc Memory descriptor of a second operand. 
+ void append_binary(algorithm aalgorithm, const memory::desc &src1_desc) { + error::wrap_c_api(dnnl_post_ops_append_binary(get(), + convert_to_c(aalgorithm), src1_desc.get()), + "could not append a binary post-op"); + } + + /// Returns the parameters of a binary post-op. + /// + /// @param index Index of the binary post-op. + /// @param aalgorithm Output binary algorithm kind. + /// @param src1_desc Output memory descriptor of a second operand. + void get_params_binary( + int index, algorithm &aalgorithm, memory::desc &src1_desc) const { + dnnl_alg_kind_t c_alg; + const_dnnl_memory_desc_t cdesc; + error::wrap_c_api( + dnnl_post_ops_get_params_binary(get(), index, &c_alg, &cdesc), + "could not get parameters of a binary post-op"); + aalgorithm = static_cast(c_alg); + dnnl_memory_desc_t cloned_md = nullptr; + error::wrap_c_api(dnnl_memory_desc_clone(&cloned_md, cdesc), + "could not clone a memory descriptor"); + src1_desc = memory::desc(cloned_md); + } + + /// Appends a prelu forward post-op. + /// + /// The kind of this post-op is #dnnl::primitive::kind::prelu. + /// + /// The post-op can be defined as: + /// + /// dst[:] <- prelu(dst[:], weights[:]) + /// prelu: + /// dst[:] <- dst[:] if dst[:] > 0 + /// dst[:] <- dst[:] * weights[:] if dst[:] <= 0 + /// + /// + /// Example usage: + /// @code + /// int mb = 32, oc = 32, + /// oh = 14, ow = 14; // convolution output params + /// // unique weights per output channel + /// vector weights = { ... }; + /// int oc_dim = 1; // mb_dim = 0, channel_dim = 1, height_dim = 2, ... 
+ /// + /// // construct a convolution descriptor + /// dnnl::convolution::desc conv_d; + /// + /// dnnl::primitive_attr attr; + /// attr.append_prelu(1 << oc_dim); + /// + /// dnnl::primitive_desc conv_pd(conv_d, attr, engine); + /// memory prelu_weights({{1}, dt::f32, {1}}, eng, weights.data()); + /// + /// std::unordered_map conv_args; + /// + /// conv_args.insert( + /// {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_WEIGHTS, prelu_weights}) + /// @endcode + /// + /// @note + /// The order of dimensions does not depend on how elements are laid + /// out in memory. For example: + /// - for a 2D CNN activations tensor the order is always (n, c) + /// - for a 4D CNN activations tensor the order is always (n, c, h, w) + /// - for a 5D CNN weights tensor the order is always + /// (g, oc, ic, kh, kw) + /// + /// Prelu weights tensor is passed in runtime execution phase. Prelu + /// weights tensor data type is implicitly assumed as f32 using plain + /// layout (a, ab, acb, acdb, acdeb). + /// + /// @param mask Defines the correspondence between the output tensor + /// dimensions and the prelu weights tensor. The set i-th bit indicates + /// that a dedicated weights value is used for each index along that + /// dimension. Set the mask to 0 to use a common weights value + /// for the whole output tensor. + void append_prelu(int mask) { + error::wrap_c_api(dnnl_post_ops_append_prelu(get(), mask), + "could not append a prelu post-op"); + } + + /// Returns the parameters of a prelu post-op. + /// + /// @param index Index of the prelu post-op. + /// @param mask Weights mask of prelu post-op. 
+ void get_params_prelu(int index, int &mask) const { + error::wrap_c_api(dnnl_post_ops_get_params_prelu(get(), index, &mask), + "could not get parameters of a binary post-op"); + } +}; + +/// @cond DO_NOT_DOCUMENT_THIS +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_primitive_attr_t p) { + return dnnl_primitive_attr_destroy(p); + } +}; +/// @endcond + +/// Primitive attributes. +/// +/// @sa @ref dev_guide_attributes +struct primitive_attr : public handle { + using handle::handle; + + /// Constructs default (empty) primitive attributes. + primitive_attr() { + dnnl_primitive_attr_t result; + error::wrap_c_api(dnnl_primitive_attr_create(&result), + "could not create primitive attribute"); + reset(result); + } + + /// Creates primitive attributes from a C API ::dnnl_primitive_attr_t + /// handle. The resulting handle is not weak and the C handle will be + /// destroyed during the destruction of the C++ object. + /// + /// @param attr The C API primitive attributes. + primitive_attr(dnnl_primitive_attr_t attr) + : handle(attr) {} + + /// Returns the parameters of a dropout attribute. + /// + /// @param mask_desc Output memory descriptor of a dropout mask. + void get_dropout(memory::desc &mask_desc) const { + const_dnnl_memory_desc_t cdesc; + error::wrap_c_api(dnnl_primitive_attr_get_dropout(get(), &cdesc), + "could not get parameters of a dropout attribute"); + dnnl_memory_desc_t cloned_md = nullptr; + error::wrap_c_api(dnnl_memory_desc_clone(&cloned_md, cdesc), + "could not clone a memory descriptor"); + mask_desc = memory::desc(cloned_md); + } + + /// Sets dropout probability. + /// + /// @param mask_desc Output memory descriptor of a dropout mask. 
+    void set_dropout(const memory::desc &mask_desc) {
+        error::wrap_c_api(
+                dnnl_primitive_attr_set_dropout(get(), mask_desc.get()),
+                "could not set dropout primitive attribute");
+    }
+
+    /// Returns the fpmath mode
+    fpmath_mode get_fpmath_mode() const {
+        dnnl_fpmath_mode_t result;
+        error::wrap_c_api(dnnl_primitive_attr_get_fpmath_mode(get(), &result),
+                "could not get fpmath mode primitive attribute");
+        return fpmath_mode(result);
+    }
+
+    /// Returns the fpmath mode
+    ///
+    /// @param mode Specified fpmath mode.
+    /// @param apply_to_int Use floating-point arithmetic for integer primitives.
+    void get_fpmath_mode(fpmath_mode &mode, bool &apply_to_int) const {
+        dnnl_fpmath_mode_t c_mode;
+        int c_apply_to_int;
+        error::wrap_c_api(dnnl_primitive_attr_get_fpmath_mode_v2(
+                                  get(), &c_mode, &c_apply_to_int),
+                "could not get fpmath mode primitive attribute");
+        mode = fpmath_mode(c_mode);
+        // The C API reports the flag as an int; convert it explicitly.
+        apply_to_int = static_cast<bool>(c_apply_to_int);
+    }
+
+    /// Sets fpmath mode.
+    ///
+    /// @param mode Specified fpmath mode.
+    /// @param apply_to_int Boolean. Use of floating-point arithmetic for integer primitives.
+    void set_fpmath_mode(fpmath_mode mode, bool apply_to_int = false) {
+        error::wrap_c_api(dnnl_primitive_attr_set_fpmath_mode_v2(get(),
+                                  dnnl::convert_to_c(mode), apply_to_int),
+                "could not set fpmath mode primitive attribute");
+    }
+
+    /// Returns the accumulation mode
+    accumulation_mode get_accumulation_mode() const {
+        dnnl_accumulation_mode_t result;
+        error::wrap_c_api(
+                dnnl_primitive_attr_get_accumulation_mode(get(), &result),
+                "could not get accumulation mode primitive attribute");
+        return accumulation_mode(result);
+    }
+
+    /// Sets accumulation mode.
+    ///
+    /// @param mode Specified accumulation mode.
+    void set_accumulation_mode(accumulation_mode mode) {
+        error::wrap_c_api(dnnl_primitive_attr_set_accumulation_mode(
+                                  get(), dnnl::convert_to_c(mode)),
+                "could not set accumulation mode primitive attribute");
+    }
+
+    /// Returns the deterministic attribute value
+    bool get_deterministic() const {
+        int result;
+        error::wrap_c_api(dnnl_primitive_attr_get_deterministic(get(), &result),
+                "could not get deterministic primitive attribute");
+        // The C API reports the flag as an int; convert it explicitly.
+        return static_cast<bool>(result);
+    }
+
+    /// Sets deterministic attribute value
+    ///
+    /// @param value Specified deterministic mode.
+    void set_deterministic(bool value) {
+        error::wrap_c_api(dnnl_primitive_attr_set_deterministic(
+                                  get(), static_cast<int>(value)),
+                "could not set deterministic primitive attribute");
+    }
+
+    /// Returns the rounding mode attribute value
+    ///
+    /// @param arg Argument for which rounding mode query applies.
+    /// @returns The rounding mode applied to the specified argument.
+    rounding_mode get_rounding_mode(int arg) const {
+        dnnl_rounding_mode_t result;
+        error::wrap_c_api(dnnl_primitive_attr_get_rounding(get(), arg, &result),
+                "could not get rounding mode primitive attribute");
+        return rounding_mode(result);
+    }
+
+    /// Sets the rounding mode attribute value for a given argument
+    ///
+    /// @param arg Argument for which to set rounding mode.
+    /// @param mode Rounding mode to apply.
+    void set_rounding_mode(int arg, rounding_mode mode) {
+        error::wrap_c_api(dnnl_primitive_attr_set_rounding(
+                                  get(), arg, convert_to_c(mode)),
+                "could not set rounding mode primitive attribute");
+    }
+
+    /// Returns the scratchpad mode.
+    scratchpad_mode get_scratchpad_mode() const {
+        dnnl_scratchpad_mode_t result;
+        error::wrap_c_api(
+                dnnl_primitive_attr_get_scratchpad_mode(get(), &result),
+                "could not get scratchpad mode primitive attribute");
+        return scratchpad_mode(result);
+    }
+
+    /// Sets scratchpad mode.
+    ///
+    /// @param mode Specified scratchpad mode.
+    void set_scratchpad_mode(scratchpad_mode mode) {
+        const dnnl_status_t status = dnnl_primitive_attr_set_scratchpad_mode(
+                get(), dnnl::convert_to_c(mode));
+        error::wrap_c_api(
+                status, "could not set scratchpad mode primitive attribute");
+    }
+
+    /// Configures the scaling-factor mask for the memory argument @p arg.
+    /// The scaling factors themselves are supplied at execution time
+    /// as an argument with index #DNNL_ARG_ATTR_SCALES | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_scales_mask
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p scales
+    ///     vector. The set i-th bit indicates that a dedicated scaling factor
+    ///     is used for each index along that dimension. Set the mask to 0 to
+    ///     use a common scaling factor for the whole output tensor.
+    void set_scales_mask(int arg, int mask) {
+        const dnnl_status_t status
+                = dnnl_primitive_attr_set_scales_mask(get(), arg, mask);
+        error::wrap_c_api(status, "could not set scales primitive attribute");
+    }
+
+    /// Sets scaling factors for primitive operations for a given memory
+    /// argument. The scaling factors must be passed at execution time
+    /// as an argument with index #DNNL_ARG_ATTR_SCALES | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_scales
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Scales correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated
+    ///     scale is used for each index along that dimension. Set the
+    ///     mask to 0 to use a common scale for the whole output tensor.
+    /// @param groups Scaling factors correspondence groups that define the
+    ///     correspondence between the tensor dimensions and the scales array.
+    ///     The set i-th dimension indicates a number of groups of scaling
+    ///     factors used for that logical dimension in a memory indicated by @p arg.
+    /// @param data_type Scaling factors data_type.
+    void set_scales(int arg, int mask, const memory::dims &groups,
+            memory::data_type data_type = memory::data_type::f32) {
+        const int group_ndims = (int)groups.size();
+        const dnnl_status_t status = dnnl_primitive_attr_set_scales(get(), arg,
+                mask, group_ndims, groups.data(),
+                memory::convert_to_c(data_type));
+        error::wrap_c_api(status, "could not set scales primitive attribute");
+    }
+
+    /// Configures the zero-point mask for the memory argument @p arg.
+    /// The zero points themselves are supplied at execution time as an
+    /// argument with index #DNNL_ARG_ATTR_ZERO_POINTS | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_zero_points_mask
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Zero point correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p
+    ///     zero_points vector. The set i-th bit indicates that a dedicated
+    ///     zero point is used for each index along that dimension. Set the
+    ///     mask to 0 to use a common zero point for the whole output tensor.
+    void set_zero_points_mask(int arg, int mask) {
+        const dnnl_status_t status
+                = dnnl_primitive_attr_set_zero_points_mask(get(), arg, mask);
+        error::wrap_c_api(
+                status, "could not set zero points primitive attribute");
+    }
+
+    /// Sets zero points for primitive operations for a given memory argument.
+    /// The zero points must be passed at execution time as an argument with
+    /// index #DNNL_ARG_ATTR_ZERO_POINTS | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_zero_points
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Zero point correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p
+    ///     zero_points vector.
The set i-th bit indicates that a dedicated + /// zero point is used for each index along that dimension. Set the + /// mask to 0 to use a common zero point for the whole output tensor. + /// @param groups Zero point factors correspondence groups that define the + /// correspondence between the tensor dimensions and the zero_points array. + /// The set i-th dimension indicates a number of groups of zero point + /// factors used for that logical dimension in a memory indicated by @p arg. + /// @param data_type Zero point factors data_type. + void set_zero_points(int arg, int mask, const memory::dims &groups, + memory::data_type data_type = memory::data_type::s32) { + error::wrap_c_api(dnnl_primitive_attr_set_zero_points(get(), arg, mask, + (int)groups.size(), groups.data(), + memory::convert_to_c(data_type)), + "could not set zero points primitive attribute"); + } + + /// Returns post-ops previously set via set_post_ops(). + /// + /// @returns Post-ops. + const post_ops get_post_ops() const { + const_dnnl_post_ops_t const_c_post_ops; + error::wrap_c_api( + dnnl_primitive_attr_get_post_ops(get(), &const_c_post_ops), + "could not get post-ops primitive attribute"); + dnnl_post_ops_t c_post_ops; + error::wrap_c_api(dnnl_post_ops_clone(&c_post_ops, const_c_post_ops), + "could not clone post-ops primitive attribute"); + return post_ops(c_post_ops); + } + + /// Sets post-ops. + /// + /// @note + /// There is no way to check whether the post-ops would be supported + /// by the target primitive. Any error will be reported + /// by the respective primitive descriptor constructor. + /// + /// @param ops Post-ops object to copy post-ops from. + void set_post_ops(const post_ops ops) { + error::wrap_c_api(dnnl_primitive_attr_set_post_ops(get(), ops.get()), + "could not set post-ops primitive attribute"); + } + + /// Sets quantization scale and shift parameters for RNN data tensors. 
+    ///
+    /// For performance reasons, the low-precision configuration of the RNN
+    /// primitives expect input activations to have the unsigned 8-bit integer
+    /// data type. The scale and shift parameters are used to quantize
+    /// floating-point data to unsigned integer and must be passed to the RNN
+    /// primitive using attributes.
+    ///
+    /// The quantization formula is `scale * data + shift`.
+    ///
+    /// Example usage:
+    /// @code
+    ///     // RNN parameters
+    ///     int l = 2, t = 2, mb = 32, sic = 32, slc = 32, dic = 32, dlc = 32;
+    ///     // Activations quantization parameters
+    ///     float scale = 63.f, shift = 64.f;
+    ///
+    ///     primitive_attr attr;
+    ///
+    ///     // Set scale and shift for int8 quantization of activation
+    ///     attr.set_rnn_data_qparams(scale, shift);
+    ///
+    ///     // Create an RNN primitive descriptor.
+    ///     vanilla_rnn_forward::primitive_desc rnn_d(
+    ///             engine, /* arguments */, attr);
+    /// @endcode
+    ///
+    /// @note
+    ///     Quantization scale and shift are common for src_layer, src_iter,
+    ///     dst_iter, and dst_layer.
+    ///
+    /// @param scale The value to scale the data by.
+    /// @param shift The value to shift the data by.
+    void set_rnn_data_qparams(float scale, float shift) {
+        const dnnl_status_t status
+                = dnnl_primitive_attr_set_rnn_data_qparams(get(), scale, shift);
+        error::wrap_c_api(status,
+                "could not set RNN data quantization parameters primitive "
+                "attribute");
+    }
+
+    /// Returns the quantization scale and shift parameters for RNN data
+    /// tensors.
+    ///
+    /// @note
+    ///     Quantization scale and shift are common for src_layer, src_iter,
+    ///     dst_iter, and dst_layer.
+    ///
+    /// @param scale The value to scale the data by.
+    /// @param shift The value to shift the data by.
+    void get_rnn_data_qparams(float &scale, float &shift) {
+        float c_scale, c_shift;
+        // This is the getter, so a failure is reported as "could not get".
+        error::wrap_c_api(dnnl_primitive_attr_get_rnn_data_qparams(
+                                  get(), &c_scale, &c_shift),
+                "could not get RNN data quantization parameters primitive "
+                "attribute");
+        // The outputs are written only after the query succeeded.
+        scale = c_scale;
+        shift = c_shift;
+    }
+
+    /// Sets quantization scaling factors for RNN weights tensors. The
+    /// low-precision configuration of the RNN primitives expect input weights
+    /// to use the signed 8-bit integer data type. The scaling factors are
+    /// used to quantize floating-point data to signed integer and must be
+    /// passed to RNN primitives using attributes.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @note
+    ///     Quantization scales are common for weights_layer and
+    ///     weights_iteration
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors. The following
+    ///     equality must hold:
+    ///     \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$
+    ///     Violations can only be detected when the attributes are used to
+    ///     create a primitive descriptor.
+    void set_rnn_weights_qparams(int mask, const std::vector<float> &scales) {
+        error::wrap_c_api(dnnl_primitive_attr_set_rnn_weights_qparams(get(),
+                                  (int)scales.size(), mask, scales.data()),
+                "could not set RNN weights quantization parameters primitive "
+                "attribute");
+    }
+
+    /// Returns the quantization scaling factors for RNN weights
+    /// tensors.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @param mask Output. Scaling factors correspondence mask that defines
+    ///     the correspondence between the output tensor dimensions and the
+    ///     @p scales vector. The set i-th bit indicates that a dedicated
+    ///     scaling factor should be used each index along that dimension.
+    ///     A mask of 0 means a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Output. Vector that is resized and filled with the
+    ///     scaling factors stored in the attributes.
+    void get_rnn_weights_qparams(int &mask, std::vector<float> &scales) {
+        dnnl_dim_t count;
+        int c_mask;
+        const float *c_scales;
+        error::wrap_c_api(dnnl_primitive_attr_get_rnn_weights_qparams(
+                                  get(), &count, &c_mask, &c_scales),
+                "could not get primitive RNN weights quantization "
+                "parameters attributes");
+        scales.resize(count);
+
+        mask = c_mask;
+        // Copy the values out of the buffer returned by the C API.
+        for (dnnl_dim_t c = 0; c < count; c++)
+            scales[c] = c_scales[c];
+    }
+
+    /// Sets quantization scaling factors for RNN projection weights tensors.
+    /// The low-precision configuration of the RNN primitives expect input
+    /// weights to use the signed 8-bit integer data type. The scaling factors
+    /// are used to quantize floating-point data to signed integer and must be
+    /// passed to RNN primitives using attributes.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @note
+    ///     Quantization scales are common for weights_layer and
+    ///     weights_iteration
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors. The following
+    ///     equality must hold:
+    ///     \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$
+    ///     Violations can only be detected when the attributes are used to
+    ///     create a primitive descriptor.
+    void set_rnn_weights_projection_qparams(
+            int mask, const std::vector<float> &scales) {
+        error::wrap_c_api(
+                dnnl_primitive_attr_set_rnn_weights_projection_qparams(
+                        get(), (int)scales.size(), mask, scales.data()),
+                "could not set primitive RNN weights projection quantization "
+                "parameters attributes");
+    }
+
+    /// Returns the quantization scaling factors for RNN projection weights
+    /// tensors.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors.
The following + /// equality must hold: + /// \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$ + /// Violations can only be detected when the attributes are used to + /// create a primitive descriptor. + void get_rnn_weights_projection_qparams( + int &mask, std::vector &scales) { + dnnl_dim_t count; + int c_mask; + const float *c_scales; + error::wrap_c_api( + dnnl_primitive_attr_get_rnn_weights_projection_qparams( + get(), &count, &c_mask, &c_scales), + "could not get primitive RNN weights projection quantization " + "parameters attributes"); + scales.resize(count); + + mask = c_mask; + for (dnnl_dim_t c = 0; c < count; c++) + scales[c] = c_scales[c]; + } +}; + +/// @} dnnl_api_attributes + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Base class for all primitive descriptors. +struct primitive_desc_base : public handle { + using handle::handle; + + /// Default constructor. Produces an empty object. + primitive_desc_base() = default; + + /// Returns the engine of the primitive descriptor. + /// @returns The engine of the primitive descriptor. + engine get_engine() const { return query_engine(query::engine); } + + /// Returns implementation name. + /// @returns The implementation name. + const char *impl_info_str() const { + const char *res; + error::wrap_c_api(dnnl_primitive_desc_query( + get(), dnnl_query_impl_info_str, 0, &res), + "could not retrieve implementation info string from a " + "primitive descriptor"); + return res; + } + + /// Returns a memory::dim value (same as int64_t). + /// @param what The value to query. + /// @returns The result of the query. + memory::dim query_s64(query what) const { + memory::dim res; + dnnl_status_t status = dnnl_primitive_desc_query( + get(), dnnl::convert_to_c(what), 0, &res); + return status == dnnl_success ? res : 0; + } + + /// Returns strides. + /// @returns Strides. + /// @returns An empty #dnnl::memory::dims if the primitive does not have + /// a strides parameter. 
+    memory::dims get_strides() const { return query_dims(query::strides); }
+
+    /// Returns dilations.
+    /// @returns Dilations.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a dilations parameter.
+    memory::dims get_dilations() const { return query_dims(query::dilations); }
+
+    /// Returns a left padding.
+    /// @returns A left padding.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a left padding parameter.
+    memory::dims get_padding_l() const { return query_dims(query::padding_l); }
+
+    /// Returns a right padding.
+    /// @returns A right padding.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a right padding parameter.
+    memory::dims get_padding_r() const { return query_dims(query::padding_r); }
+
+    /// Returns an epsilon.
+    /// @returns An epsilon.
+    /// @returns Zero if the primitive does not have an epsilon parameter.
+    float get_epsilon() const { return query_f32(query::epsilon_f32); }
+
+    /// Returns flags.
+    /// @tparam T Flags enumeration type.
+    /// @returns Flags.
+    /// @returns Zero if the primitive does not have a flags parameter.
+    template <typename T = unsigned>
+    T get_flags() const {
+        unsigned res;
+        dnnl_status_t status
+                = dnnl_primitive_desc_query(get(), dnnl_query_flags, 0, &res);
+        // A failed query yields 0 converted to the requested flags type.
+        return static_cast<T>(status == dnnl_success ? res : 0x0U);
+    }
+
+    /// Returns an algorithm kind.
+    /// @returns An algorithm kind.
+    /// @returns #dnnl::algorithm::undef if the primitive does not have an
+    ///     algorithm parameter.
+    dnnl::algorithm get_algorithm() const { return query_alg(query::alg_kind); }
+
+    /// Returns an alpha.
+    /// @returns An alpha.
+    /// @returns Zero if the primitive does not have an alpha parameter.
+    float get_alpha() const { return query_f32(query::alpha_f32); }
+
+    /// Returns a beta.
+    /// @returns A beta.
+    /// @returns Zero if the primitive does not have a beta parameter.
+    float get_beta() const { return query_f32(query::beta_f32); }
+
+    /// Returns an axis.
+    /// @returns An axis.
+    /// @returns A negative number if the primitive does not have an axis
+    ///     parameter.
+    int get_axis() const {
+        int res;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_axis_s32, 0, &res);
+        return status == dnnl_success ? res : -1;
+    }
+
+    /// Returns an LRN local size parameter.
+    /// @returns An LRN local size parameter.
+    /// @returns Zero if the primitive does not have an LRN local size
+    ///     parameter.
+    memory::dim get_local_size() const {
+        return query_s64(query::local_size_s64);
+    }
+
+    /// Returns an LRN K parameter.
+    /// @returns An LRN K parameter.
+    /// @returns Zero if the primitive does not have an LRN K parameter.
+    float get_k() const { return query_f32(query::k_f32); }
+
+    /// Returns a reduction P parameter.
+    /// @returns A reduction P parameter.
+    /// @returns Zero if the primitive does not have a reduction P parameter.
+    float get_p() const { return query_f32(query::p_f32); }
+
+    /// Returns a resampling factors parameters.
+    /// @returns A vector of factors.
+    /// @returns An empty vector if the primitive does not have a resampling
+    ///     factors parameter.
+    std::vector<float> get_factors() const {
+        float *factors;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_factors, 0, &factors);
+        // Stop here when there are no factors: the documented result is an
+        // empty vector, and the memory descriptor query below would
+        // otherwise raise an error if no descriptor is available.
+        if (status != dnnl_success) return std::vector<float> {};
+
+        const dnnl::prop_kind pkind = get_prop_kind();
+        const bool is_backward = pkind != prop_kind::forward_training
+                && pkind != prop_kind::forward_inference;
+        const_dnnl_memory_desc_t md = dnnl_primitive_desc_query_md(get(),
+                is_backward ? dnnl_query_diff_dst_md : dnnl_query_dst_md, 0);
+
+        int ndims;
+        error::wrap_c_api(
+                dnnl_memory_desc_query(md, dnnl_query_ndims_s32, &ndims),
+                "could not query ndims from a memory descriptor");
+
+        // ndims - 2 factors are copied out of the queried array.
+        return std::vector<float>(factors, factors + (ndims - 2));
+    }
+
+    /// Returns an RNN cell kind parameter.
+    /// @returns An RNN cell kind parameter.
+    /// @returns #dnnl::algorithm::undef if the primitive does not have an
+    ///     RNN cell kind parameter.
+    dnnl::algorithm get_cell_kind() const {
+        return query_alg(query::cell_kind);
+    }
+
+    /// Returns an RNN direction parameter.
+    /// @returns An RNN direction parameter.
+    /// @returns #dnnl::rnn_direction::undef if the primitive does not have
+    ///     an RNN direction parameter.
+    dnnl::rnn_direction get_direction() const {
+        dnnl_rnn_direction_t direction;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_direction, 0, &direction);
+        return status == dnnl_success
+                ? static_cast<dnnl::rnn_direction>(direction)
+                : dnnl::rnn_direction::undef;
+    }
+
+    /// Returns an RNN activation kind parameter.
+    /// @returns An RNN activation kind parameter.
+    /// @returns #dnnl::algorithm::undef if the primitive does not have an
+    ///     RNN activation kind parameter.
+    dnnl::algorithm get_activation_kind() const {
+        return query_alg(query::activation_kind);
+    }
+
+    /// Returns a pooling kernel parameter.
+    /// @returns A pooling kernel parameter.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a pooling kernel parameter.
+    memory::dims get_kernel() const { return query_dims(query::kernel); }
+
+    /// Returns a group size parameter.
+    /// @returns A group size parameter.
+    /// @returns Zero if the primitive does not have a group size
+    ///     parameter.
+    memory::dim get_group_size() const {
+        return query_s64(query::group_size_s64);
+    }
+
+    /// Returns a propagation kind.
+    /// @returns A propagation kind.
+    /// @returns #dnnl::prop_kind::undef if the primitive does not have
+    ///     a propagation parameter.
+    dnnl::prop_kind get_prop_kind() const {
+        dnnl_prop_kind_t prop_kind;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_prop_kind, 0, &prop_kind);
+        return status == dnnl_success ? static_cast<dnnl::prop_kind>(prop_kind)
+                                      : dnnl::prop_kind::undef;
+    }
+
+    /// Returns a memory descriptor.
+    ///
+    /// @note
+    ///     There are also convenience methods
+    ///     #dnnl::primitive_desc_base::src_desc(),
+    ///     #dnnl::primitive_desc_base::dst_desc(), and others.
+    ///
+    /// @param what The kind of parameter to query; can be
+    ///     #dnnl::query::src_md, #dnnl::query::dst_md, etc.
+    /// @param idx Index of the parameter. For example, convolution bias can
+    ///     be queried with what = #dnnl::query::weights_md and idx = 1.
+    /// @returns The requested memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     parameter of the specified kind or index.
+    memory::desc query_md(query what, int idx = 0) const {
+        // Only memory-descriptor queries are accepted; anything else is
+        // rejected before the C API is called.
+        std::vector<query> valid_q {query::src_md, query::diff_src_md,
+                query::weights_md, query::diff_weights_md, query::dst_md,
+                query::diff_dst_md, query::workspace_md, query::scratchpad_md,
+                query::exec_arg_md};
+        if (!std::any_of(valid_q.cbegin(), valid_q.cend(),
+                    [=](query q) { return what == q; }))
+            DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                    "memory descriptor query is invalid");
+
+        const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md(
+                get(), dnnl::convert_to_c(what), idx);
+        if (!cdesc) return memory::desc();
+
+        // Clone the queried descriptor so the returned object wraps its
+        // own handle.
+        dnnl_memory_desc_t cloned_md = nullptr;
+        error::wrap_c_api(dnnl_memory_desc_clone(&cloned_md, cdesc),
+                "could not clone a memory descriptor");
+
+        return memory::desc(cloned_md);
+    }
+
+    /// Returns a source memory descriptor.
+    /// @param idx Source index.
+    /// @returns Source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     source parameter with index @p idx.
+    memory::desc src_desc(int idx) const {
+        return query_md(query::src_md, idx);
+    }
+
+    /// Returns a destination memory descriptor.
+    /// @param idx Destination index.
+    /// @returns Destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     destination parameter with index @p idx.
+    memory::desc dst_desc(int idx) const {
+        const query what = query::dst_md;
+        return query_md(what, idx);
+    }
+
+    /// Queries the weights memory descriptor with the given index.
+    /// @param idx Weights index.
+    /// @returns Weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     weights parameter with index @p idx.
+    memory::desc weights_desc(int idx) const {
+        const query what = query::weights_md;
+        return query_md(what, idx);
+    }
+
+    /// Queries the diff source memory descriptor with the given index.
+    /// @param idx Diff source index.
+    /// @returns Diff source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff source parameter with index @p idx.
+    memory::desc diff_src_desc(int idx) const {
+        const query what = query::diff_src_md;
+        return query_md(what, idx);
+    }
+
+    /// Queries the diff destination memory descriptor with the given index.
+    /// @param idx Diff destination index.
+    /// @returns Diff destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff destination parameter with index @p idx.
+    memory::desc diff_dst_desc(int idx) const {
+        const query what = query::diff_dst_md;
+        return query_md(what, idx);
+    }
+
+    /// Queries the diff weights memory descriptor with the given index.
+    /// @param idx Diff weights index.
+    /// @returns Diff weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff weights parameter with index @p idx.
+    memory::desc diff_weights_desc(int idx) const {
+        const query what = query::diff_weights_md;
+        return query_md(what, idx);
+    }
+
+    // Separate versions without the index argument for documentation
+    // purposes.
+
+    /// Returns the source memory descriptor with index 0.
+    /// @returns Source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     source parameter.
+    memory::desc src_desc() const {
+        return src_desc(0);
+    }
+
+    /// Returns a destination memory descriptor.
+    /// @returns Destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     destination parameter.
+    memory::desc dst_desc() const {
+        return dst_desc(0);
+    }
+
+    /// Returns the weights memory descriptor with index 0.
+    /// @returns Weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     weights parameter.
+    memory::desc weights_desc() const {
+        return weights_desc(0);
+    }
+
+    /// Returns the diff source memory descriptor with index 0.
+    /// @returns Diff source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff source parameter.
+    memory::desc diff_src_desc() const {
+        return diff_src_desc(0);
+    }
+
+    /// Returns the diff destination memory descriptor with index 0.
+    /// @returns Diff destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff destination parameter.
+    memory::desc diff_dst_desc() const {
+        return diff_dst_desc(0);
+    }
+
+    /// Returns the diff weights memory descriptor with index 0.
+    /// @returns Diff weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff weights parameter.
+    memory::desc diff_weights_desc() const {
+        return diff_weights_desc(0);
+    }
+
+    /// Returns the workspace memory descriptor.
+    /// @returns Workspace memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not require
+    ///     workspace parameter.
+    memory::desc workspace_desc() const {
+        const query what = query::workspace_md;
+        return query_md(what, 0);
+    }
+
+    /// Returns the scratchpad memory descriptor.
+    /// @returns scratchpad memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not require
+    ///     scratchpad parameter.
+    /// @sa @ref dev_guide_attributes_scratchpad
+    memory::desc scratchpad_desc() const {
+        const query what = query::scratchpad_md;
+        return query_md(what, 0);
+    }
+
+    /// Returns the engine on which the scratchpad memory is located.
+    /// @returns The engine on which the scratchpad memory is located.
+    engine scratchpad_engine() const {
+        dnnl_engine_t c_engine;
+        error::wrap_c_api(dnnl_primitive_desc_query(get(),
+                                  dnnl::convert_to_c(query::scratchpad_engine),
+                                  0, &c_engine),
+                "could not retrieve scratchpad engine from a primitive "
+                "descriptor");
+        return engine(c_engine, true);
+    }
+
+    /// Returns the primitive attributes.
+    /// @returns The primitive attributes.
+    primitive_attr get_primitive_attr() const {
+        const_dnnl_primitive_attr_t const_c_attr;
+        error::wrap_c_api(dnnl_primitive_desc_get_attr(get(), &const_c_attr),
+                "could not get attributes from a primitive descriptor");
+        // The returned object wraps a clone of the queried attributes.
+        dnnl_primitive_attr_t c_attr;
+        error::wrap_c_api(dnnl_primitive_attr_clone(&c_attr, const_c_attr),
+                "could not clone primitive attributes");
+        return primitive_attr(c_attr);
+    }
+
+    /// Returns the kind of the primitive descriptor.
+    /// @returns The kind of the primitive descriptor.
+    dnnl::primitive::kind get_kind() const {
+        dnnl_primitive_kind_t kind;
+        error::wrap_c_api(dnnl_primitive_desc_query(get(),
+                                  dnnl_query_primitive_kind, 0, (void *)&kind),
+                "could not get primitive kind from a primitive descriptor");
+        return static_cast<dnnl::primitive::kind>(kind);
+    }
+
+    /// Returns the cache blob ID of the primitive descriptor.
+    /// @returns The cache blob ID of the primitive descriptor.
+    std::vector<uint8_t> get_cache_blob_id() const {
+        dnnl_dim_t count;
+        const uint8_t *c_id;
+        // The size is queried first, then the pointer to the ID bytes.
+        error::wrap_c_api(
+                dnnl_primitive_desc_query(get(),
+                        dnnl::convert_to_c(query::cache_blob_id_size_s64), 0,
+                        (void *)&count),
+                "could not get size of cache blob ID from a primitive "
+                "descriptor");
+        error::wrap_c_api(dnnl_primitive_desc_query(get(),
+                                  dnnl::convert_to_c(query::cache_blob_id), 0,
+                                  (void **)&c_id),
+                "could not get cache blob ID from a primitive descriptor");
+        std::vector<uint8_t> id(c_id, c_id + count);
+        return id;
+    }
+
+protected:
+    /// Returns a float value.
+    /// @param what The value to query.
+    /// @returns The result of the query.
+    /// @returns Zero if the primitive doesn't support the query.
+ float query_f32(query what) const { + float res; + dnnl_status_t status = dnnl_primitive_desc_query( + get(), dnnl::convert_to_c(what), 0, &res); + return status == dnnl_success ? res : 0.0f; + } + + /// Returns an #dnnl::algorithm value. + /// @param what The value to query. + /// @returns The result of the query. + /// @returns #dnnl::algorithm::undef if the primitive doesn't support + /// the query. + algorithm query_alg(query what) const { + dnnl_alg_kind_t res; + dnnl_status_t status = dnnl_primitive_desc_query( + get(), dnnl::convert_to_c(what), 0, &res); + return status == dnnl_success ? static_cast(res) + : algorithm::undef; + } + + /// Returns a memory::dims value. + /// @param what The value to query. + /// @returns The result of the query. + /// @returns An empty #dnnl::memory::dims if the primitive doesn't support + /// the query. + memory::dims query_dims(query what) const { + const bool is_backward = get_prop_kind() != prop_kind::forward_training + && get_prop_kind() != prop_kind::forward_inference; + const_dnnl_memory_desc_t md = dnnl_primitive_desc_query_md(get(), + is_backward ? dnnl_query_diff_dst_md : dnnl_query_dst_md, 0); + + int nspatial_dims = 0; + if (md) { + int ndims; + error::wrap_c_api( + dnnl_memory_desc_query(md, dnnl_query_ndims_s32, &ndims), + "could not query ndims from a memory descriptor"); + nspatial_dims = ndims - 2; + } + + dnnl_dims_t *c_dims; + dnnl_status_t status = dnnl_primitive_desc_query( + get(), dnnl::convert_to_c(what), 0, &c_dims); + return status == dnnl_success + ? memory::dims(*c_dims, *c_dims + nspatial_dims) + : memory::dims {}; + } + + /// Returns an #dnnl::engine value. + /// @param what The value to query. + /// @returns The result of the query. + /// @returns A weak handle to the engine that the primitive descriptor was + /// created with. 
+ engine query_engine(query what) const { + dnnl_engine_t c_engine; + error::wrap_c_api(dnnl_primitive_desc_query(get(), + dnnl::convert_to_c(what), 0, &c_engine), + "could not get an engine from a primitive_desc"); + return engine(c_engine, true); + } + + /// Resets the value of the handle to a clone of a C API primitive + /// descriptor. + /// @param pd A C API primitive descriptor to clone. + void reset_with_clone(const_dnnl_primitive_desc_t pd) { + dnnl_primitive_desc_t new_pd; + error::wrap_c_api(dnnl_primitive_desc_clone(&new_pd, pd), + "could not clone a primitive descriptor"); + reset(new_pd); + } + + /// Constructs a primitive descriptor base object from a clone of a C API + /// primitive descriptor after verifying that it is what the caller + /// expects. + /// + /// @note + /// The @p prim_kind should map to a primitive that does not have + /// different values of propagation kind (e.g. #dnnl::binary). + /// @note + /// Primitive descriptor base constructed this way does not support + /// next_impl() (will throw). + /// + /// @param pd C API primitive descriptor to clone. + /// @param prim_kind Expected primitive kind. + primitive_desc_base( + dnnl_primitive_desc_t pd, dnnl::primitive::kind prim_kind) + : primitive_desc_base(pd, prim_kind, dnnl::prop_kind::undef) {} + + /// Constructs a primitive descriptor base object from a clone of a C API + /// primitive descriptor after verifying that it is what the caller + /// expects. + /// + /// @note + /// Primitive descriptor base constructed this way does not support + /// next_impl() (will throw). + /// + /// @param pd C API primitive descriptor to clone. + /// @param prim_kind Expected primitive kind. + /// @param aprop_kind Expected propagation kind. 
+ primitive_desc_base(dnnl_primitive_desc_t pd, + dnnl::primitive::kind prim_kind, dnnl::prop_kind aprop_kind) + : primitive_desc_base(pd, prim_kind, aprop_kind, aprop_kind) {} + + /// Constructs a primitive descriptor base object from a clone of a C API + /// primitive descriptor after verifying that it is what the caller + /// expects. + /// + /// @note + /// Primitive descriptor base constructed this way does not support + /// next_impl() (will throw). + /// + /// @param pd C API primitive descriptor to clone. + /// @param prim_kind Expected primitive kind. + /// @param prop_kind1 Expected propagation kind (option 1). + /// @param prop_kind2 Expected propagation kind (option 2). This value is + /// checked if the check with @p prop_kind1 fails. + primitive_desc_base(dnnl_primitive_desc_t pd, + dnnl::primitive::kind prim_kind, dnnl::prop_kind prop_kind1, + dnnl::prop_kind prop_kind2) { + // It is OK to pass an empty primitive descriptor + if (pd == nullptr) return; + + dnnl_status_t rc; + + dnnl_primitive_kind_t c_prim_kind = convert_to_c(prim_kind); + dnnl_prop_kind_t c_prop_kind1 = convert_to_c(prop_kind1); + dnnl_prop_kind_t c_prop_kind2 = convert_to_c(prop_kind2); + + // Check that primitive kind matches + dnnl_primitive_kind_t pd_kind; + rc = dnnl_primitive_desc_query( + pd, dnnl_query_primitive_kind, 0, (void *)&pd_kind); + error::wrap_c_api( + rc, "could not get primitive kind from a primitive descriptor"); + if (pd_kind != c_prim_kind) + DNNL_THROW_ERROR(dnnl_invalid_arguments, + "primitive descriptor operation kind mismatch"); + + // Check that propagation kind matches + dnnl_prop_kind_t pd_prop_kind; + rc = dnnl_primitive_desc_query( + pd, dnnl_query_prop_kind, 0, (void *)&pd_prop_kind); + + // Something went wrong + if (rc != dnnl_success && rc != dnnl_unimplemented) + DNNL_THROW_ERROR(dnnl_invalid_arguments, + "could not get propagation kind from the primitive " + "descriptor"); + + // Everything is fine + if ((rc == dnnl_unimplemented && c_prop_kind1 
== dnnl_prop_kind_undef) + || (rc == dnnl_success + && (pd_prop_kind == c_prop_kind1 + || pd_prop_kind == c_prop_kind2))) { + reset_with_clone(pd); + return; + } + + // We could get the propagation kind but there is a mismatch + DNNL_THROW_ERROR(dnnl_invalid_arguments, + "primitive descriptor propagation kind mismatch"); + } + + /// Returns a constant reference to a static instance of default constructed + /// primitive attributes + static const primitive_attr &default_attr() { + static const primitive_attr attr; + return attr; + } + + const_dnnl_memory_desc_t optional_arg(const memory::desc *md) { + return md ? md->get() : nullptr; + } + + const dnnl_dim_t *optional_arg(const memory::dims *dims) { + return dims ? dims->data() : nullptr; + } + + const float *optional_arg(const std::vector *arg) { + return arg ? arg->data() : nullptr; + } + + using base = primitive_desc_base; +}; + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_reorder Reorder +/// +/// A primitive to copy data between two memory objects. This primitive is +/// typically used to change the way the data is laid out in memory. +/// +/// @sa @ref dev_guide_reorder in developer guide +/// +/// @{ + +/// Reorder primitive. +struct reorder : public primitive { + /// Primitive descriptor for a reorder primitive. + struct primitive_desc : public primitive_desc_base { + using primitive_desc_base::primitive_desc_base; + + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for reorder primitive. + /// + /// @note + /// If @p allow_empty is true, the constructor does not throw if a + /// primitive descriptor cannot be created. + /// + /// @param src_engine Engine on which the source memory object will be + /// located. + /// @param src_md Source memory descriptor. + /// @param dst_engine Engine on which the destination memory object + /// will be located. + /// @param dst_md Destination memory descriptor. 
+ /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is allowed + /// to fail without throwing an exception. In this case an empty + /// object will be produced. This flag is optional and defaults to + /// false. + primitive_desc(const engine &src_engine, const memory::desc &src_md, + const engine &dst_engine, const memory::desc &dst_md, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + dnnl_primitive_desc_t result; + dnnl_status_t status = dnnl_reorder_primitive_desc_create(&result, + src_md.get(), src_engine.get(), dst_md.get(), + dst_engine.get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the reorder primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); + } + + /// Constructs a primitive descriptor for reorder primitive. + /// + /// @param src Source memory object. It is used to obtain the source + /// memory descriptor and engine. + /// @param dst Destination memory object. It is used to obtain the + /// destination memory descriptor and engine. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is allowed + /// to fail without throwing an exception. In this case an empty + /// object will be produced. This flag is optional and defaults to + /// false. 
+ primitive_desc(const memory &src, const memory &dst, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + dnnl_primitive_desc_t result; + auto src_md = src.get_desc(); + auto dst_md = dst.get_desc(); + dnnl_status_t status = dnnl_reorder_primitive_desc_create(&result, + src_md.get(), src.get_engine().get(), dst_md.get(), + dst.get_engine().get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the reorder primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); + } + + /// Constructs a primitive descriptor for reorder primitive from a C + /// API primitive descriptor which must have a matching kind. + /// + /// @param pd C API primitive descriptor for reorder primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : primitive_desc_base(pd, dnnl::primitive::kind::reorder) {} + + /// Returns the engine on which the source memory is allocated. + /// @returns The engine on which the source memory is allocated. + engine get_src_engine() const { + return query_engine(dnnl::query::reorder_src_engine); + } + + /// Returns the engine on which the destination memory is allocated. + /// @returns The engine on which the destination memory is allocated. + engine get_dst_engine() const { + return query_engine(dnnl::query::reorder_dst_engine); + } + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + }; + + /// Default constructor. Produces an empty object. + reorder() = default; + + /// Constructs a reorder primitive. + /// @param pd Primitive descriptor for reorder primitive. 
+ reorder(const primitive_desc &pd) : primitive(pd.get()) {} + + /// Constructs a reorder primitive from a cache blob. + /// @param pd Primitive descriptor for reorder primitive. + /// @param cache_blob Cache blob. + reorder(const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd.get(), cache_blob) {} + + /// Constructs a reorder primitive that would reorder data between memory + /// objects having the same memory descriptors as memory objects @p src and + /// @p dst. + /// + /// @param src Source memory object. + /// @param dst Destination memory object. + /// @param attr Primitive attributes to use (optional). + reorder(const memory &src, const memory &dst, + const primitive_attr &attr = primitive_attr()) + : primitive(primitive_desc(src, dst, attr).get()) {} + + using primitive::execute; + + /// Executes the reorder primitive. + /// + /// @param astream Stream object. The stream must belong to the same engine + /// as the primitive. + /// @param src Source memory object. + /// @param dst Destination memory object. + void execute(const stream &astream, memory &src, memory &dst) const { + primitive::execute(astream, {{DNNL_ARG_FROM, src}, {DNNL_ARG_TO, dst}}); + } +}; + +/// @} dnnl_api_reorder + +/// @addtogroup dnnl_api_concat Concat +/// +/// A primitive to concatenate data by arbitrary dimension. +/// +/// @sa @ref dev_guide_concat in developer guide +/// +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS +inline std::vector convert_to_c( + const std::vector &mds) { + std::vector c_mds; + c_mds.reserve(mds.size()); + for (const auto &md : mds) + c_mds.push_back(md.get()); + return c_mds; +} +/// @endcond + +/// Tensor concatenation (concat) primitive. +struct concat : public primitive { + /// Primitive descriptor for a concat primitive. + struct primitive_desc : public primitive_desc_base { + using primitive_desc_base::primitive_desc_base; + + /// Default constructor. Produces an empty object. 
+ primitive_desc() = default; + + /// Constructs a primitive descriptor for an out-of-place concatenation + /// primitive. + /// + /// @param aengine Engine to perform the operation on. + /// @param dst Destination memory descriptor. + /// @param concat_dimension Source tensors will be concatenated over + /// dimension with this index. Note that order of dimensions does + /// not depend on memory format. + /// @param srcs Vector of source memory descriptors. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, const memory::desc &dst, + int concat_dimension, const std::vector &srcs, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + auto c_srcs = convert_to_c(srcs); + + dnnl_primitive_desc_t result; + dnnl_status_t status = dnnl_concat_primitive_desc_create(&result, + aengine.get(), dst.get(), (int)c_srcs.size(), + concat_dimension, c_srcs.data(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the concat primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); + } + + /// Constructs a primitive descriptor for an out-of-place concatenation + /// primitive. + /// + /// This version derives the destination memory descriptor + /// automatically. + /// + /// @param aengine Engine to perform the operation on. + /// @param concat_dimension Source tensors will be concatenated over + /// dimension with this index. Note that order of dimensions does + /// not depend on memory format. 
+ /// @param srcs Vector of source memory descriptors. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, int concat_dimension, + const std::vector &srcs, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + auto c_api_srcs = convert_to_c(srcs); + + dnnl_primitive_desc_t result; + dnnl_status_t status = dnnl_concat_primitive_desc_create(&result, + aengine.get(), nullptr, (int)c_api_srcs.size(), + concat_dimension, c_api_srcs.data(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the concat primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); + } + + /// Constructs a primitive descriptor for concat primitive from a C + /// API primitive descriptor which must have a matching kind. + /// + /// @param pd C API primitive descriptor for concat primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : primitive_desc_base(pd, dnnl::primitive::kind::concat) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc(int)const + memory::desc src_desc(int idx = 0) const { return base::src_desc(idx); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + }; + + /// Default constructor. Produces an empty object. + concat() = default; + + /// Constructs a concatenation primitive. + /// @param pd Primitive descriptor for concatenation primitive. + concat(const primitive_desc &pd) : primitive(pd.get()) {} + + /// Constructs a concatenation primitive from a cache blob. 
+ /// @param pd Primitive descriptor for concatenation primitive. + /// @param cache_blob Cache blob. + concat(const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd.get(), cache_blob) {} +}; + +/// @} dnnl_api_concat + +/// @addtogroup dnnl_api_sum Sum +/// +/// A primitive to sum multiple tensors. +/// +/// @sa @ref dev_guide_sum in developer guide +/// +/// @{ + +/// Out-of-place summation (sum) primitive. +struct sum : public primitive { + /// Primitive descriptor for a sum primitive. + struct primitive_desc : public primitive_desc_base { + using primitive_desc_base::primitive_desc_base; + + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a sum primitive. + /// + /// @param aengine Engine to perform the operation on. + /// @param dst Destination memory descriptor. + /// @param scales Vector of scales to multiply data in each source + /// memory by. + /// @param srcs Vector of source memory descriptors. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const memory::desc &dst, + const std::vector &scales, + const std::vector &srcs, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + validate_container_size(scales, + "counts of scales and sources are not equal", + (int)srcs.size(), (int)srcs.size()); + + auto c_api_srcs = convert_to_c(srcs); + + dnnl_primitive_desc_t result; + dnnl_status_t status = dnnl_sum_primitive_desc_create(&result, + aengine.get(), dst.get(), (int)c_api_srcs.size(), + scales.data(), c_api_srcs.data(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the sum primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); + } + + /// Constructs a primitive descriptor for a sum primitive. + /// + /// This version derives the destination memory descriptor + /// automatically. + /// + /// @param aengine Engine on which to perform the operation. + /// @param scales Vector of scales by which to multiply data in each + /// source memory object. + /// @param srcs Vector of source memory descriptors. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const std::vector &scales, + const std::vector &srcs, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + validate_container_size(scales, + "counts of scales and sources are not equal", + (int)srcs.size(), (int)srcs.size()); + + auto c_api_srcs = convert_to_c(srcs); + dnnl_primitive_desc_t result; + dnnl_status_t status = dnnl_sum_primitive_desc_create(&result, + aengine.get(), nullptr, (int)c_api_srcs.size(), + scales.data(), c_api_srcs.data(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the sum primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); + } + + /// Constructs a primitive descriptor for sum primitive from a C API + /// primitive descriptor which must have a matching kind. + /// + /// @param pd C API primitive descriptor for sum primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : primitive_desc_base(pd, dnnl::primitive::kind::sum) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc(int)const + memory::desc src_desc(int idx = 0) const { return base::src_desc(idx); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + }; + + /// Default constructor. Produces an empty object. + sum() = default; + + /// Constructs a sum primitive. + /// @param pd Primitive descriptor for sum primitive. + sum(const primitive_desc &pd) : primitive(pd.get()) {} + + /// Constructs a sum primitive from a cache blob. + /// @param pd Primitive descriptor for sum primitive. + /// @param cache_blob Cache blob. 
+ sum(const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd.get(), cache_blob) {} +}; + +/// @} dnnl_api_sum + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// A base class for descriptors of all primitives that support iteration +/// over multiple implementations. +struct primitive_desc : public primitive_desc_base { + using primitive_desc_base::primitive_desc_base; + + primitive_desc() = default; + + /// Changes the primitive descriptor to point to the next available + /// implementation. + /// + /// @returns @c true on success and @c false if the last available + /// implementation has already been reached. In the latter case, the + /// primitive descriptor itself is kept unchanged. + bool next_impl() { + dnnl_status_t status = dnnl_primitive_desc_next_impl(get()); + if (status == dnnl_last_impl_reached) return false; + error::wrap_c_api(status, "last available implementation is reached"); + return true; + } +}; + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_convolution Convolution +/// +/// A primitive to perform 1D, 2D or 3D convolution. Supported variants are +/// forward propagation, backward propagation, and weights gradient with or +/// without bias. +/// +/// @sa @ref dev_guide_convolution in developer guide +/// +/// @{ + +/// Convolution forward propagation primitive. +struct convolution_forward : public primitive { + /// Primitive descriptor for a convolution forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a convolution forward + /// propagation primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. 
+ /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param bias_desc Bias memory descriptor. Passing zero memory + /// descriptor disables the bias term. + /// @param dst_desc Destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &bias_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, &bias_desc, dst_desc, strides, nullptr, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution forward + /// propagation primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. 
+ /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &dst_desc, + const memory::dims &strides, const memory::dims &padding_l, + const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, nullptr, dst_desc, strides, nullptr, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution forward + /// propagation primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param bias_desc Bias memory descriptor. 
Passing zero memory + /// descriptor disables the bias term. + /// @param dst_desc Destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &bias_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, &bias_desc, dst_desc, strides, &dilates, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution forward + /// propagation primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. 
The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &dst_desc, + const memory::dims &strides, const memory::dims &dilates, + const memory::dims &padding_l, const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, nullptr, dst_desc, strides, &dilates, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a convolution forward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::convolution, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// Returns the bias memory descriptor. + /// @returns The bias memory descriptor. + /// @returns A zero memory descriptor of the primitive does not have a + /// bias parameter. 
+ memory::desc bias_desc() const { return base::weights_desc(1); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + + private: + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc *bias_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims *dilates, const memory::dims &padding_l, + const memory::dims &padding_r, const primitive_attr &attr, + bool allow_empty) { + + memory::validate_dims(strides, src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, src_desc.get_ndims() - 2); + + if (dilates) + memory::validate_dims(*dilates, src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_convolution_forward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + convert_to_c(aalgorithm), src_desc.get(), + weights_desc.get(), optional_arg(bias_desc), + dst_desc.get(), &strides[0], optional_arg(dilates), + &padding_l[0], &padding_r[0], attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the 
convolution forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + convolution_forward() = default; + + /// Constructs a convolution forward propagation primitive. + /// @param pd Primitive descriptor for a convolution forward propagation + /// primitive. + convolution_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a convolution forward propagation primitive from a cache + /// blob. + /// @param pd Primitive descriptor for a convolution forward propagation + /// primitive. + /// @param cache_blob Cache blob. + convolution_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Convolution backward propagation primitive. +struct convolution_backward_data : public primitive { + /// Primitive descriptor for a convolution backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a convolution backward + /// propagation primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. 
+ /// @param diff_src_desc Diff source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a convolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc, + diff_dst_desc, strides, nullptr, padding_l, padding_r, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution backward + /// propagation primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. 
The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param diff_src_desc Diff source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a convolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc, + diff_dst_desc, strides, &dilates, padding_l, padding_r, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a convolution backward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::convolution, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc 
dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + + private: + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims *dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + memory::validate_dims(strides, diff_src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, diff_src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, diff_src_desc.get_ndims() - 2); + + if (dilates) + memory::validate_dims(*dilates, diff_src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_convolution_backward_data_primitive_desc_create(&pd, + aengine.get(), convert_to_c(aalgorithm), + diff_src_desc.get(), weights_desc.get(), + diff_dst_desc.get(), &strides[0], + optional_arg(dilates), &padding_l[0], &padding_r[0], + hint_fwd_pd.get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the convolution backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + convolution_backward_data() = default; + + /// Constructs a convolution backward propagation primitive. + /// @param pd Primitive descriptor for a convolution backward propagation + /// primitive. + convolution_backward_data(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a convolution backward propagation primitive from a cache + /// blob. 
+ /// @param pd Primitive descriptor for a convolution backward propagation + /// primitive. + /// @param cache_blob Cache blob. + convolution_backward_data( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Convolution weights gradient primitive. +struct convolution_backward_weights : public primitive { + /// Primitive descriptor for a convolution weights gradient primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a convolution weights gradient + /// primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_bias_desc Diff bias memory descriptor. Passing zero + /// memory descriptor disables the bias term. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. 
+ /// @param hint_fwd_pd Primitive descriptor for a convolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + &diff_bias_desc, diff_dst_desc, strides, nullptr, padding_l, + padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution weights gradient + /// primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. 
+ /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a convolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + nullptr, diff_dst_desc, strides, nullptr, padding_l, + padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution weights + /// gradient primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. 
The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_bias_desc Diff bias memory descriptor. Passing zero + /// memory descriptor disables the bias term. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a convolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + &diff_bias_desc, diff_dst_desc, strides, &dilates, + padding_l, padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution weights + /// gradient primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Convolution algorithm. Possible values are + /// #dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd, and + /// #dnnl::algorithm::convolution_auto. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. 
+ /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a convolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + nullptr, diff_dst_desc, strides, &dilates, padding_l, + padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a convolution weights gradient + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a convolution weights + /// gradient primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::convolution, + dnnl::prop_kind::backward_weights) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const + memory::desc diff_weights_desc() const { + return base::diff_weights_desc(0); + } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// Returns the diff bias memory descriptor. + /// @returns The diff bias memory descriptor. + /// @returns A zero memory descriptor of the primitive does not have a + /// diff bias parameter. + memory::desc diff_bias_desc() const { + return base::diff_weights_desc(1); + } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + + private: + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc *diff_bias_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims *dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const 
convolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + memory::validate_dims(strides, src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, src_desc.get_ndims() - 2); + + if (dilates) + memory::validate_dims(*dilates, src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_convolution_backward_weights_primitive_desc_create( + &pd, aengine.get(), convert_to_c(aalgorithm), + src_desc.get(), diff_weights_desc.get(), + optional_arg(diff_bias_desc), diff_dst_desc.get(), + &strides[0], optional_arg(dilates), &padding_l[0], + &padding_r[0], hint_fwd_pd.get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the convolution weights update primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + convolution_backward_weights() = default; + + /// Constructs a convolution weights gradient primitive. + /// @param pd Primitive descriptor for a convolution weights gradient + /// primitive. + convolution_backward_weights(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a convolution weights gradient primitive from a cache blob. + /// @param pd Primitive descriptor for a convolution weights gradient + /// primitive. + /// @param cache_blob Cache blob. + convolution_backward_weights( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_convolution +// +/// @addtogroup dnnl_api_deconvolution Deconvolution +/// +/// A primitive to perform 1D, 2D or 3D deconvolution. Supported variants are +/// forward propagation, backward propagation, and weights gradient with or +/// without bias. 
+/// +/// @{ + +/// Deconvolution forward propagation primitive. +struct deconvolution_forward : public primitive { + /// Primitive descriptor for a deconvolution forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a deconvolution forward + /// propagation primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Deconvolution algorithm: + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param bias_desc Bias memory descriptor. Passing zero memory + /// descriptor disables the bias term. + /// @param dst_desc Destination memory descriptor. + /// @param strides Vector of strides for spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. 
+ /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &bias_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, &bias_desc, dst_desc, strides, nullptr, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution forward + /// propagation primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Deconvolution algorithm: + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param strides Vector of strides for spatial dimension. 
+ /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &dst_desc, + const memory::dims &strides, const memory::dims &padding_l, + const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, nullptr, dst_desc, strides, nullptr, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution forward + /// propagation primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. 
+ /// @param aalgorithm Deconvolution algorithm: + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param bias_desc Bias memory descriptor. Passing zero memory + /// descriptor disables the bias term. + /// @param dst_desc Destination memory descriptor. + /// @param strides Vector of strides for spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &bias_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, &bias_desc, dst_desc, strides, &dilates, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution forward + /// propagation primitive without bias. 
+ /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Deconvolution algorithm: + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param strides Vector of strides for spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &dst_desc, + const memory::dims &strides, const memory::dims &dilates, + const memory::dims &padding_l, const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + weights_desc, nullptr, dst_desc, strides, &dilates, + padding_l, padding_r, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a deconvolution forward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::deconvolution, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::convolution_forward::primitive_desc::bias_desc()const + memory::desc bias_desc() const { return base::weights_desc(1); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + 
memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + + private: + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc *bias_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims *dilates, const memory::dims &padding_l, + const memory::dims &padding_r, const primitive_attr &attr, + bool allow_empty) { + + memory::validate_dims(strides, src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, src_desc.get_ndims() - 2); + + if (dilates) + memory::validate_dims(*dilates, src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_deconvolution_forward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + convert_to_c(aalgorithm), src_desc.get(), + weights_desc.get(), optional_arg(bias_desc), + dst_desc.get(), &strides[0], optional_arg(dilates), + &padding_l[0], &padding_r[0], attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the deconvolution forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + deconvolution_forward() = default; + + /// Constructs a deconvolution forward propagation primitive. + /// @param pd Primitive descriptor for a deconvolution forward propagation + /// primitive. 
+ deconvolution_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a deconvolution forward propagation primitive from a cache + /// blob. + /// @param pd Primitive descriptor for a deconvolution forward propagation + /// primitive. + /// @param cache_blob Cache blob. + deconvolution_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Deconvolution backward propagation primitive. +struct deconvolution_backward_data : public primitive { + /// Primitive descriptor for a deconvolution backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a deconvolution backward + /// propagation primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Deconvolution algorithm + /// (#dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd). + /// @param diff_src_desc Diff source memory descriptor. + /// @param weights_desc Weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. 
+ /// @param hint_fwd_pd Primitive descriptor for a deconvolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc, + diff_dst_desc, strides, nullptr, padding_l, padding_r, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution backward + /// propagation primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Deconvolution algorithm + /// (#dnnl::algorithm::convolution_direct, + /// #dnnl::algorithm::convolution_winograd). + /// @param diff_src_desc Diff source memory descriptor. + /// @param weights_desc Weights memory descriptor. 
+ /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a deconvolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc, + diff_dst_desc, strides, &dilates, padding_l, padding_r, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a deconvolution backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::deconvolution, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + + private: + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims *dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + memory::validate_dims(strides, diff_src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, diff_src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, diff_src_desc.get_ndims() - 2); + + if (dilates) + 
memory::validate_dims(*dilates, diff_src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_deconvolution_backward_data_primitive_desc_create( + &pd, aengine.get(), convert_to_c(aalgorithm), + diff_src_desc.get(), weights_desc.get(), + diff_dst_desc.get(), &strides[0], + optional_arg(dilates), &padding_l[0], &padding_r[0], + hint_fwd_pd.get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the deconvolution backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + deconvolution_backward_data() = default; + + /// Constructs a deconvolution backward propagation primitive. + /// @param pd Primitive descriptor for a deconvolution backward propagation + /// primitive. + deconvolution_backward_data(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a deconvolution backward propagation primitive from a cache + /// blob. + /// @param pd Primitive descriptor for a deconvolution backward propagation + /// primitive. + /// @param cache_blob Cache blob. + deconvolution_backward_data( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Deconvolution weights gradient primitive. +struct deconvolution_backward_weights : public primitive { + /// Primitive descriptor for a deconvolution weights gradient primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a deconvolution weights + /// gradient primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. 
+ /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Deconvolution algorithm. Possible values are + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_bias_desc Diff bias memory descriptor. Passing zero + /// memory descriptor disables the bias term. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a deconvolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + &diff_bias_desc, diff_dst_desc, strides, nullptr, padding_l, + padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution weights + /// gradient primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p padding_l, and @p padding_r contain values + /// for spatial dimensions only and hence must have the same number of + /// elements as there are spatial dimensions. The order of values is + /// the same as in the tensor: depth (for 3D tensors), height (for 3D + /// and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Deconvolution algorithm. Possible values are + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a deconvolution + /// forward propagation primitive. 
It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &padding_l, const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + nullptr, diff_dst_desc, strides, nullptr, padding_l, + padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution weights + /// gradient primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Deconvolution algorithm. Possible values are + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_bias_desc Diff bias memory descriptor. Passing zero + /// memory descriptor disables the bias term. 
+ /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a deconvolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + &diff_bias_desc, diff_dst_desc, strides, &dilates, + padding_l, padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution weights + /// gradient primitive without bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. 
+ /// + /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r + /// contain values for spatial dimensions only and hence must have the + /// same number of elements as there are spatial dimensions. The order + /// of values is the same as in the tensor: depth (for 3D tensors), + /// height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Deconvolution algorithm. Possible values are + /// #dnnl::algorithm::deconvolution_direct, and + /// #dnnl::algorithm::deconvolution_winograd. + /// @param src_desc Source memory descriptor. + /// @param diff_weights_desc Diff weights memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Strides for each spatial dimension. + /// @param dilates Dilations for each spatial dimension. A zero value + /// means no dilation in the corresponding dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a deconvolution + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc, + nullptr, diff_dst_desc, strides, &dilates, padding_l, + padding_r, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a deconvolution weights + /// gradient primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a deconvolution weights + /// gradient primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::deconvolution, + dnnl::prop_kind::backward_weights) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const + memory::desc diff_weights_desc() const { + return base::diff_weights_desc(0); + } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::convolution_backward_weights::primitive_desc::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return base::diff_weights_desc(1); + } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return 
base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + + private: + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc *diff_bias_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims *dilates, const memory::dims &padding_l, + const memory::dims &padding_r, + const deconvolution_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + memory::validate_dims(strides, src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, src_desc.get_ndims() - 2); + + if (dilates) + memory::validate_dims(*dilates, src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_deconvolution_backward_weights_primitive_desc_create( + &pd, aengine.get(), convert_to_c(aalgorithm), + src_desc.get(), diff_weights_desc.get(), + optional_arg(diff_bias_desc), diff_dst_desc.get(), + &strides[0], optional_arg(dilates), &padding_l[0], + &padding_r[0], hint_fwd_pd.get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the deconvolution weights update primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + deconvolution_backward_weights() = default; + + /// Constructs a deconvolution weights gradient primitive. 
+ /// @param pd Primitive descriptor for a deconvolution weights gradient + /// primitive. + deconvolution_backward_weights(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a deconvolution weights gradient primitive from a cache + /// blob. + /// @param pd Primitive descriptor for a deconvolution weights gradient + /// primitive. + /// @param cache_blob Cache blob. + deconvolution_backward_weights( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_deconvolution + +/// @addtogroup dnnl_api_lrn LRN +/// +/// A primitive to perform local response normalization (LRN) across or within +/// channels. +/// +/// @sa @ref dev_guide_lrn in developer guide +/// +/// @{ + +/// Local response normalization (LRN) forward propagation primitive. +struct lrn_forward : public primitive { + /// Primitive descriptor for an LRN forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an LRN forward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm LRN algorithm kind: either + /// #dnnl::algorithm::lrn_across_channels, or + /// #dnnl::algorithm::lrn_within_channel. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param local_size Regularization local size. + /// @param alpha The alpha regularization parameter. + /// @param beta The beta regularization parameter. + /// @param k The k regularization parameter. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. 
+ /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, memory::dim local_size, + float alpha, float beta, float k, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_lrn_forward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + convert_to_c(aalgorithm), src_desc.get(), dst_desc.get(), + local_size, alpha, beta, k, attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the lrn forward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for an LRN forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for an LRN forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::lrn, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_alpha()const + float get_alpha() const { return base::get_alpha(); } + + /// @copydoc dnnl::primitive_desc_base::get_beta()const + float get_beta() const { return base::get_beta(); } + + /// @copydoc dnnl::primitive_desc_base::get_local_size()const + memory::dim get_local_size() const { return base::get_local_size(); } + + /// @copydoc dnnl::primitive_desc_base::get_k()const + float get_k() const { return base::get_k(); } + }; + + /// Default constructor. Produces an empty object. + lrn_forward() = default; + + /// Constructs an LRN forward propagation primitive. + /// @param pd Primitive descriptor for an LRN forward propagation + /// primitive. + lrn_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LRN forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LRN forward propagation + /// primitive. + /// @param cache_blob Cache blob. + lrn_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Local response normalization (LRN) backward propagation primitive. 
+struct lrn_backward : public primitive { + /// Primitive descriptor for an LRN backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an LRN backward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param aalgorithm LRN algorithm kind: either + /// #dnnl::algorithm::lrn_across_channels, or + /// #dnnl::algorithm::lrn_within_channel. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param src_desc Source memory descriptor. + /// @param local_size Regularization local size. + /// @param alpha The alpha regularization parameter. + /// @param beta The beta regularization parameter. + /// @param k The k regularization parameter. + /// @param hint_fwd_pd Primitive descriptor for an LRN forward + /// propagation primitive. It is used as a hint for deciding which + /// memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + memory::dim local_size, float alpha, float beta, float k, + const lrn_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_lrn_backward_primitive_desc_create(&pd, + aengine.get(), convert_to_c(aalgorithm), + diff_src_desc.get(), diff_dst_desc.get(), src_desc.get(), + local_size, alpha, beta, k, hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the lrn backward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for an LRN backward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for an LRN backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::lrn, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_alpha()const + float get_alpha() const { return base::get_alpha(); } + + /// @copydoc dnnl::primitive_desc_base::get_beta()const + float get_beta() const { return base::get_beta(); } + + /// @copydoc dnnl::primitive_desc_base::get_local_size()const + memory::dim get_local_size() const { return base::get_local_size(); } + + /// @copydoc dnnl::primitive_desc_base::get_k()const + float get_k() const { return base::get_k(); } + }; + + /// Default constructor. Produces an empty object. + lrn_backward() = default; + + /// Constructs an LRN backward propagation primitive. + /// @param pd Primitive descriptor for an LRN backward propagation + /// primitive. + lrn_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LRN backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LRN backward propagation + /// primitive. + /// @param cache_blob Cache blob. 
+ lrn_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_lrn + +/// @addtogroup dnnl_api_eltwise Eltwise +/// +/// A primitive to perform elementwise operations such as the +/// rectifier linear unit (ReLU). +/// +/// Both forward and backward propagation primitives support in-place +/// operation; that is, src and dst can refer to the same memory for forward +/// propagation, and diff_dst and diff_src can refer to the same memory for +/// backward propagation. +/// +/// @warning +/// Because the original source data is required for backward propagation, +/// in-place forward propagation is not generally supported in the +/// training mode. However, for algorithms supporting destination as input +/// memory, dst can be used for the backward propagation, which makes it +/// possible to get performance benefit even in the training mode. +/// +/// @sa @ref dev_guide_eltwise in developer guide +/// +/// @{ + +/// Elementwise unary operation forward propagation primitive. +struct eltwise_forward : public primitive { + /// Primitive descriptor for an elementwise forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an elementwise forward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Elementwise algorithm kind. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. 
In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + dst_desc, nullptr, nullptr, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an elementwise forward + /// propagation primitive with an alpha parameter. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Elementwise algorithm kind. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param alpha The alpha parameter for the elementwise operation. + /// Specific meaning depends on the algorithm. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, float alpha, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + dst_desc, &alpha, nullptr, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an elementwise forward + /// propagation primitive with an alpha and beta parameters. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. 
Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Elementwise algorithm kind. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param alpha The alpha parameter for the elementwise operation. + /// Specific meaning depends on the algorithm. + /// @param beta The beta parameter for the elementwise operation. + /// Specific meaning depends on the algorithm. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, float alpha, float beta, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc, + dst_desc, &alpha, &beta, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an eltwise forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for an eltwise forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::eltwise, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + dnnl::algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_alpha()const + float get_alpha() const { return base::get_alpha(); } + + /// @copydoc dnnl::primitive_desc_base::get_beta()const + float get_beta() const { return base::get_beta(); } + + private: + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, const float *alpha, + const float *beta, const primitive_attr &attr, + bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_eltwise_forward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(aalgorithm), src_desc.get(), + dst_desc.get(), alpha ? *alpha : 0.0f, beta ? *beta : 0.0f, + attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the eltwise forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + eltwise_forward() = default; + + /// Constructs an eltwise forward propagation primitive. + /// @param pd Primitive descriptor for an eltwise forward propagation + /// primitive. 
+ eltwise_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an eltwise forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an eltwise forward propagation + /// primitive. + /// @param cache_blob Cache blob. + eltwise_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Elementwise unary operation backward propagation primitive. +struct eltwise_backward : public primitive { + /// Primitive descriptor for eltwise backward propagation. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an elementwise backward + /// propagation primitive with an alpha parameter. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Elementwise algorithm kind. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param data_desc Destination memory descriptor if one of the + /// "use_dst_for_bwd" algorithms are used (such as + /// #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor + /// otherwise. + /// @param hint_fwd_pd Primitive descriptor for an elementwise + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const memory::desc &data_desc, + const eltwise_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, diff_dst_desc, + data_desc, nullptr, nullptr, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for an elementwise backward + /// propagation primitive with an alpha parameter. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Elementwise algorithm kind. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param data_desc Destination memory descriptor if one of the + /// "use_dst_for_bwd" algorithms are used (such as + /// #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor + /// otherwise. + /// @param alpha The alpha parameter for the elementwise operation. + /// Specific meaning depends on the algorithm. + /// @param hint_fwd_pd Primitive descriptor for an elementwise + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const memory::desc &data_desc, float alpha, + const eltwise_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, diff_dst_desc, + data_desc, &alpha, nullptr, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for an elementwise backward + /// propagation primitive with an alpha and beta parameters. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Elementwise algorithm kind. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param data_desc Destination memory descriptor if one of the + /// "use_dst_for_bwd" algorithms are used (such as + /// #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor + /// otherwise. + /// @param alpha The alpha parameter for the elementwise operation. + /// Specific meaning depends on the algorithm. + /// @param beta The beta parameter for the elementwise operation. + /// Specific meaning depends on the algorithm. + /// @param hint_fwd_pd Primitive descriptor for an elementwise + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const memory::desc &data_desc, float alpha, float beta, + const eltwise_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, diff_src_desc, diff_dst_desc, + data_desc, &alpha, &beta, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an eltwise backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for an eltwise backward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::eltwise, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + dnnl::algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_alpha()const + float get_alpha() const { return base::get_alpha(); } + + /// @copydoc dnnl::primitive_desc_base::get_beta()const + float get_beta() const { return base::get_beta(); } + + private: + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const memory::desc &data_desc, const float *alpha, + const float *beta, + 
const eltwise_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_eltwise_backward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aalgorithm), + diff_src_desc.get(), diff_dst_desc.get(), data_desc.get(), + alpha ? *alpha : 0.0f, beta ? *beta : 0.0f, + hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the eltwise backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + eltwise_backward() = default; + + /// Constructs an eltwise backward propagation primitive. + /// @param pd Primitive descriptor for an eltwise backward propagation + /// primitive. + eltwise_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an eltwise backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an eltwise backward propagation + /// primitive. + /// @param cache_blob Cache blob. + eltwise_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_eltwise + +/// @addtogroup dnnl_api_softmax Softmax +/// +/// A primitive to perform softmax. +/// +/// @sa @ref dev_guide_softmax in developer guide +/// +/// @{ + +/// Softmax forward propagation primitive. +struct softmax_forward : public primitive { + /// Primitive descriptor for a softmax forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a softmax forward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. 
Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Softmax algorithm kind: either + /// #dnnl::algorithm::softmax_accurate, + /// or #dnnl::algorithm::softmax_log. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param axis Axis over which softmax is computed. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, int axis, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_softmax_forward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(aalgorithm), src_desc.get(), + dst_desc.get(), axis, attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the softmax forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a softmax forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a softmax forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::softmax, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + dnnl::algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_axis()const + int get_axis() const { return base::get_axis(); } + }; + + /// Default constructor. Produces an empty object. + softmax_forward() = default; + + /// Constructs a softmax forward propagation primitive. + /// @param pd Primitive descriptor for a softmax forward propagation + /// primitive. + softmax_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a softmax forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a softmax forward propagation + /// primitive. + /// @param cache_blob Cache blob. + softmax_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Softmax backward propagation primitive. +struct softmax_backward : public primitive { + /// Primitive descriptor for a softmax backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a softmax backward propagation + /// primitive. + /// + /// @param aengine Engine to use. 
+ /// @param aalgorithm Softmax algorithm kind: either + /// #dnnl::algorithm::softmax_accurate, + /// or #dnnl::algorithm::softmax_log. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param axis Axis over which softmax is computed. + /// @param hint_fwd_pd Primitive descriptor for a softmax + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &dst_desc, + int axis, const softmax_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_softmax_backward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aalgorithm), + diff_src_desc.get(), diff_dst_desc.get(), dst_desc.get(), + axis, hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the softmax backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a softmax backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a softmax backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::softmax, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + dnnl::algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_axis()const + int get_axis() const { return base::get_axis(); } + }; + + /// Default constructor. Produces an empty object. + softmax_backward() = default; + + /// Constructs a softmax backward propagation primitive. + /// @param pd Primitive descriptor for a softmax backward propagation + /// primitive. + softmax_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a softmax backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a softmax backward propagation + /// primitive. + /// @param cache_blob Cache blob. + softmax_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_softmax + +/// @addtogroup dnnl_api_batch_normalization Batch Normalization +/// +/// A primitive to perform batch normalization. +/// +/// Both forward and backward propagation primitives support in-place +/// operation; that is, src and dst can refer to the same memory for forward +/// propagation, and diff_dst and diff_src can refer to the same memory for +/// backward propagation. 
+///
+/// The batch normalization primitives computations can be controlled by
+/// specifying different @ref dnnl::normalization_flags values. For example,
+/// batch normalization forward propagation can be configured to either
+/// compute the mean and variance or take them as arguments. It can either
+/// perform scaling and shifting using gamma and beta parameters or not.
+/// Optionally, it can also perform a fused ReLU, which in case of training
+/// would also require a workspace.
+///
+/// @sa @ref dev_guide_batch_normalization in developer guide
+///
+/// @{
+
+/// Batch normalization forward propagation primitive.
+struct batch_normalization_forward : public primitive {
+    /// Primitive descriptor for a batch normalization forward propagation
+    /// primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a batch normalization forward
+        /// propagation primitive.
+        ///
+        /// @note
+        ///     In-place operation is supported: the dst can refer to the same
+        ///     memory as the src.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param epsilon Batch normalization epsilon parameter.
+        /// @param flags Batch normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                float epsilon, normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_batch_normalization_forward_primitive_desc_create(
+                            &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            src_desc.get(), dst_desc.get(), epsilon,
+                            convert_to_c(flags), attr.get());
+
+            // The status is only checked when the caller did not opt into
+            // an empty result; otherwise pd is handed to reset() as-is.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the batch normalization forward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a batch normalization
+        /// forward propagation primitive from a C API primitive descriptor
+        /// that must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a batch normalization
+        ///     forward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::batch_normalization,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// Returns memory descriptor for mean.
+        /// @returns Memory descriptor for mean.
+        memory::desc mean_desc() const { return stat_desc(mean); }
+
+        /// Returns memory descriptor for variance.
+        /// @returns Memory descriptor for variance.
+        memory::desc variance_desc() const { return stat_desc(var); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const { return base::get_epsilon(); }
+
+        /// Returns normalization flags.
+        /// @return Normalization flags.
+        normalization_flags get_flags() const {
+            // The explicit template argument selects the enum return type.
+            return base::get_flags<normalization_flags>();
+        }
+
+    private:
+        // Indices passed to query_md() to select a statistics descriptor.
+        enum {
+            mean = 1,
+            var = 2,
+        };
+
+        // Returns the memory descriptor of the statistic selected by kind.
+        // When the use_global_stats flag is set the descriptor is queried
+        // as a source memory descriptor, otherwise as a destination one.
+        memory::desc stat_desc(int kind) const {
+            const bool use_global_stats
+                    = (get_flags() & normalization_flags::use_global_stats)
+                    != normalization_flags::none;
+            return query_md(
+                    use_global_stats ? query::src_md : query::dst_md, kind);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    batch_normalization_forward() = default;
+
+    /// Constructs a batch normalization forward propagation primitive.
+    /// @param pd Primitive descriptor for a batch normalization forward
+    ///     propagation primitive.
+    batch_normalization_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a batch normalization forward propagation primitive from
+    /// a cache blob.
+    /// @param pd Primitive descriptor for a batch normalization forward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    batch_normalization_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Batch normalization backward propagation primitive.
+struct batch_normalization_backward : public primitive {
+    /// Primitive descriptor for a batch normalization backward propagation
+    /// primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a batch normalization backward
+        /// propagation primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param epsilon Batch normalization epsilon parameter.
+        /// @param flags Batch normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a batch normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                float epsilon, normalization_flags flags,
+                const batch_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_batch_normalization_backward_primitive_desc_create(
+                            &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            diff_src_desc.get(), diff_dst_desc.get(),
+                            src_desc.get(), epsilon, convert_to_c(flags),
+                            hint_fwd_pd.get(), attr.get());
+
+            // The status is only checked when the caller did not opt into
+            // an empty result; otherwise pd is handed to reset() as-is.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the batch normalization backward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a batch normalization
+        /// backward propagation primitive from a C API primitive descriptor
+        /// that must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a batch normalization
+        ///     backward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::batch_normalization,
+                    dnnl::prop_kind::backward, dnnl::prop_kind::backward_data) {
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            return base::diff_weights_desc(0);
+        }
+
+        // Unlike the forward descriptor, mean and variance are queried here
+        // as source memory descriptors (indices 1 and 2) regardless of flags.
+
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::mean_desc()const
+        memory::desc mean_desc() const { return query_md(query::src_md, 1); }
+
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::variance_desc()const
+        memory::desc variance_desc() const {
+            return query_md(query::src_md, 2);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const { return base::get_epsilon(); }
+
+        /// Returns normalization flags.
+        /// @return Normalization flags.
+        normalization_flags get_flags() const {
+            // The explicit template argument selects the enum return type.
+            return base::get_flags<normalization_flags>();
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    batch_normalization_backward() = default;
+
+    /// Constructs a batch normalization backward propagation primitive.
+    /// @param pd Primitive descriptor for a batch normalization backward
+    ///     propagation primitive.
+    batch_normalization_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a batch normalization backward propagation primitive from
+    /// a cache blob.
+    /// @param pd Primitive descriptor for a batch normalization backward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    batch_normalization_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_batch_normalization
+
+/// @addtogroup dnnl_api_group_normalization Group Normalization
+///
+/// A primitive to perform group normalization.
+///
+/// Both forward and backward propagation primitives support in-place
+/// operation; that is, src and dst can refer to the same memory for forward
+/// propagation, and diff_dst and diff_src can refer to the same memory for
+/// backward propagation.
+///
+/// The group normalization primitives computations can be controlled by
+/// specifying different @ref dnnl::normalization_flags values. For example,
+/// group normalization forward propagation can be configured to either
+/// compute the mean and variance or take them as arguments. It can either
+/// perform scaling and shifting using gamma and beta parameters or not.
+///
+/// @sa @ref dev_guide_group_normalization in developer guide
+///
+/// @{
+
+/// Group normalization forward propagation primitive.
+struct group_normalization_forward : public primitive { + /// Primitive descriptor for a group normalization forward propagation + /// primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a group normalization forward + /// propagation primitive. + /// + /// @note + /// In-place operation is supported: the dst can refer to the same + /// memory as the src. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param groups Group normalization groups parameter. + /// @param epsilon Group normalization epsilon parameter. + /// @param flags Group normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + memory::dim groups, float epsilon, normalization_flags flags, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_group_normalization_forward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + src_desc.get(), dst_desc.get(), groups, epsilon, + convert_to_c(flags), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the group normalization forward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a group normalization + /// forward propagation primitive from a C API primitive descriptor + /// that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a group normalization + /// forward propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, + dnnl::primitive::kind::group_normalization, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// Returns memory descriptor for mean. + /// @returns Memory descriptor for mean. 
+ memory::desc mean_desc() const { return stat_desc(mean); } + + /// Returns memory descriptor for variance. + /// @returns Memory descriptor for variance. + memory::desc variance_desc() const { return stat_desc(var); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_group_size()const + memory::dim get_group_size() const { return base::get_group_size(); } + + /// @copydoc dnnl::primitive_desc_base::get_epsilon()const + float get_epsilon() const { return base::get_epsilon(); } + + /// Returns normalization flags. + /// @return Normalization flags. + normalization_flags get_flags() const { + return base::get_flags(); + } + + private: + enum { + mean = 1, + var = 2, + }; + memory::desc stat_desc(int kind) const { + const bool use_global_stats + = (get_flags() & normalization_flags::use_global_stats) + != normalization_flags::none; + return query_md( + use_global_stats ? query::src_md : query::dst_md, kind); + } + }; + + /// Default constructor. Produces an empty object. + group_normalization_forward() = default; + + /// Constructs a group normalization forward propagation primitive. + /// @param pd Primitive descriptor for a group normalization forward + /// propagation primitive. + group_normalization_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a group normalization forward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for a group normalization forward + /// propagation primitive. + /// @param cache_blob Cache blob. + group_normalization_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Group normalization backward propagation primitive. +struct group_normalization_backward : public primitive { + /// Primitive descriptor for a group normalization backward propagation + /// primitive. 
+ struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a group normalization backward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward + /// (diffs for all parameters are computed in this case). + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param src_desc Source memory descriptor. + /// @param groups Group normalization groups parameter. + /// @param epsilon Group normalization epsilon parameter. + /// @param flags Group normalization flags (@ref + /// dnnl::normalization_flags). + /// @param hint_fwd_pd Primitive descriptor for a group normalization + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + memory::dim groups, float epsilon, normalization_flags flags, + const group_normalization_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_group_normalization_backward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + diff_src_desc.get(), diff_dst_desc.get(), + src_desc.get(), groups, epsilon, + convert_to_c(flags), hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the group normalization backward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a group normalization + /// backward propagation primitive from a C API primitive descriptor + /// that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a group normalization + /// backward propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, + dnnl::primitive::kind::group_normalization, + dnnl::prop_kind::backward, dnnl::prop_kind::backward_data) { + } + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const + memory::desc diff_weights_desc() const { + return base::diff_weights_desc(0); + } + + /// @copydoc dnnl::group_normalization_forward::primitive_desc::mean_desc()const + memory::desc mean_desc() const { return query_md(query::src_md, 1); } + + /// @copydoc dnnl::group_normalization_forward::primitive_desc::variance_desc()const + memory::desc variance_desc() const { + return query_md(query::src_md, 2); + } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_group_size()const + memory::dim get_group_size() const { return base::get_group_size(); } + + /// @copydoc dnnl::primitive_desc_base::get_epsilon()const + float get_epsilon() const { return base::get_epsilon(); } + + /// Returns normalization flags. + /// @return Normalization flags. 
+ normalization_flags get_flags() const { + return base::get_flags(); + } + }; + + /// Default constructor. Produces an empty object. + group_normalization_backward() = default; + + /// Constructs a group normalization backward propagation primitive. + /// @param pd Primitive descriptor for a group normalization backward + /// propagation primitive. + group_normalization_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a group normalization backward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for a group normalization backward + /// propagation primitive. + /// @param cache_blob Cache blob. + group_normalization_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_group_normalization + +/// @addtogroup dnnl_api_layer_normalization Layer Normalization +/// +/// A primitive to perform layer normalization. Normalization is performed +/// within the last logical dimension of data tensor. +/// +/// Both forward and backward propagation primitives support in-place +/// operation; that is, src and dst can refer to the same memory for forward +/// propagation, and diff_dst and diff_src can refer to the same memory for +/// backward propagation. +/// +/// The layer normalization primitives computations can be controlled by +/// specifying different @ref dnnl::normalization_flags values. For example, +/// layer normalization forward propagation can be configured to either +/// compute the mean and variance or take them as arguments. It can either +/// perform scaling and shifting using gamma and beta parameters or not. +/// +/// @sa @ref dev_guide_layer_normalization in developer guide +/// +/// @{ + +/// Layer normalization forward propagation primitive. +struct layer_normalization_forward : public primitive { + /// Primitive descriptor for a layer normalization forward propagation + /// primitive. 
+ struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a layer normalization forward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param stat_desc Statistics memory descriptors. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + const memory::desc &stat_desc, float epsilon, + normalization_flags flags, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, src_desc, dst_desc, + &stat_desc, memory::data_type::f32, epsilon, flags, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization forward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param epsilon Layer normalization epsilon parameter. 
+ /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + float epsilon, normalization_flags flags, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, src_desc, dst_desc, nullptr, + memory::data_type::f32, epsilon, flags, attr, allow_empty) { + } + + /// Constructs a primitive descriptor for a layer normalization forward + /// propagation primitive with a user-provided data type for the scale + /// and shift memory objects. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param stat_desc Statistics memory descriptors. + /// @param scale_shift_data_type Data type of scale and shift memory. + /// If neither scale nor shift flag are specified the parameter + /// is ignored. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + const memory::desc &stat_desc, + memory::data_type scale_shift_data_type, float epsilon, + normalization_flags flags, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, src_desc, dst_desc, + &stat_desc, scale_shift_data_type, epsilon, flags, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization forward + /// propagation primitive with a user-provided data type for the scale + /// and shift memory objects. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param scale_shift_data_type Data type of scale and shift memory. + /// If neither scale nor shift flag are specified the parameter + /// is ignored. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + memory::data_type scale_shift_data_type, float epsilon, + normalization_flags flags, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, src_desc, dst_desc, nullptr, + scale_shift_data_type, epsilon, flags, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization + /// forward propagation primitive from a C API primitive descriptor + /// that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a layer normalization + /// forward propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, + dnnl::primitive::kind::layer_normalization, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::batch_normalization_forward::primitive_desc::mean_desc()const + memory::desc mean_desc() const { return stat_desc(mean); } + + /// @copydoc dnnl::batch_normalization_forward::primitive_desc::variance_desc()const + memory::desc variance_desc() const { return stat_desc(var); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_epsilon()const + float get_epsilon() const { return base::get_epsilon(); } + + /// Returns 
normalization flags. + /// @return Normalization flags. + normalization_flags get_flags() const { + return base::get_flags(); + } + + private: + enum { + mean = 1, + var = 2, + }; + memory::desc stat_desc(int kind) const { + const bool use_global_stats + = (get_flags() & normalization_flags::use_global_stats) + != normalization_flags::none; + return query_md( + use_global_stats ? query::src_md : query::dst_md, kind); + } + + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + const memory::desc *stat_desc, + memory::data_type scale_shift_data_type, float epsilon, + normalization_flags flags, const primitive_attr &attr, + bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_layer_normalization_forward_primitive_desc_create_v2( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + src_desc.get(), dst_desc.get(), + optional_arg(stat_desc), + memory::convert_to_c(scale_shift_data_type), + epsilon, convert_to_c(flags), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the layer normalization forward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + layer_normalization_forward() = default; + + /// Constructs a layer normalization forward propagation primitive. + /// @param pd Primitive descriptor for a layer normalization forward + /// propagation primitive. + layer_normalization_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a layer normalization forward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for a layer normalization forward + /// propagation primitive. + /// @param cache_blob Cache blob. 
+ layer_normalization_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Layer normalization backward propagation primitive. +struct layer_normalization_backward : public primitive { + /// Primitive descriptor for a layer normalization backward propagation + /// primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a layer normalization backward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward + /// (diffs for all parameters are computed in this case). + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param src_desc Source memory descriptor. + /// @param stat_desc Statistics memory descriptors. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param hint_fwd_pd Primitive descriptor for a layer normalization + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + const memory::desc &stat_desc, float epsilon, + normalization_flags flags, + const layer_normalization_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc, + src_desc, &stat_desc, memory::data_type::f32, + memory::data_type::f32, epsilon, flags, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization backward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward + /// (diffs for all parameters are computed in this case). + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param src_desc Source memory descriptor. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param hint_fwd_pd Primitive descriptor for a layer normalization + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + float epsilon, normalization_flags flags, + const layer_normalization_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc, + src_desc, nullptr, memory::data_type::f32, + memory::data_type::f32, epsilon, flags, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization backward + /// propagation primitive with a user-provided data type for the scale + /// and shift memory objects. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward + /// (diffs for all parameters are computed in this case). + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param src_desc Source memory descriptor. + /// @param stat_desc Statistics memory descriptors. + /// @param diff_scale_shift_data_type Data type of diff scale and shift + /// memory. If neither scale nor shift flag are specified the + /// parameter is ignored. + /// @param scale_shift_data_type Data type of scale and shift memory. + /// If neither scale nor shift flag are specified the parameter + /// is ignored. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param hint_fwd_pd Primitive descriptor for a layer normalization + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. 
+ /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + const memory::desc &stat_desc, + memory::data_type diff_scale_shift_data_type, + memory::data_type scale_shift_data_type, float epsilon, + normalization_flags flags, + const layer_normalization_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc, + src_desc, &stat_desc, diff_scale_shift_data_type, + scale_shift_data_type, epsilon, flags, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization backward + /// propagation primitive with a user-provided data type for the scale + /// and shift memory objects. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward + /// (diffs for all parameters are computed in this case). + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param src_desc Source memory descriptor. + /// @param diff_scale_shift_data_type Data type of diff scale and shift + /// memory. If neither scale nor shift flag are specified the + /// parameter is ignored. + /// @param scale_shift_data_type Data type of scale and shift memory. + /// If neither scale nor shift flag are specified the parameter + /// is ignored. + /// @param epsilon Layer normalization epsilon parameter. + /// @param flags Layer normalization flags (@ref + /// dnnl::normalization_flags). + /// @param attr Primitive attributes to use. 
Attributes are optional + /// and default to empty attributes. + /// @param hint_fwd_pd Primitive descriptor for a layer normalization + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + memory::data_type diff_scale_shift_data_type, + memory::data_type scale_shift_data_type, float epsilon, + normalization_flags flags, + const layer_normalization_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc, + src_desc, nullptr, diff_scale_shift_data_type, + scale_shift_data_type, epsilon, flags, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for a layer normalization + /// backward propagation primitive from a C API primitive descriptor + /// that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a layer normalization + /// backward propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, + dnnl::primitive::kind::layer_normalization, + dnnl::prop_kind::backward, dnnl::prop_kind::backward_data) { + } + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const + memory::desc diff_weights_desc() const { + return base::diff_weights_desc(0); + } + + /// @copydoc dnnl::batch_normalization_forward::primitive_desc::mean_desc()const + memory::desc mean_desc() const { return query_md(query::src_md, 1); } + + /// @copydoc dnnl::batch_normalization_forward::primitive_desc::variance_desc()const + memory::desc variance_desc() const { + return query_md(query::src_md, 2); + } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_epsilon()const + float get_epsilon() const { return base::get_epsilon(); } + + /// Returns normalization flags. + /// @return Normalization flags. 
+ normalization_flags get_flags() const { + return base::get_flags(); + } + + private: + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::desc &src_desc, + const memory::desc *stat_desc, + memory::data_type diff_scale_shift_data_type, + memory::data_type scale_shift_data_type, float epsilon, + normalization_flags flags, + const layer_normalization_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_layer_normalization_backward_primitive_desc_create_v2( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + diff_src_desc.get(), diff_dst_desc.get(), + src_desc.get(), optional_arg(stat_desc), + memory::convert_to_c(diff_scale_shift_data_type), + memory::convert_to_c(scale_shift_data_type), + epsilon, convert_to_c(flags), hint_fwd_pd.get(), + attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the layer normalization backward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + layer_normalization_backward() = default; + + /// Constructs a layer normalization backward propagation primitive. + /// @param pd Primitive descriptor for a layer normalization backward + /// propagation primitive. + layer_normalization_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a layer normalization backward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for a layer normalization backward + /// propagation primitive. + /// @param cache_blob Cache blob. 
+ layer_normalization_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_layer_normalization + +/// @addtogroup dnnl_api_inner_product Inner Product +/// +/// A primitive to compute an inner product. +/// +/// @sa @ref dev_guide_inner_product in developer guide +/// +/// @{ + +/// Inner product forward propagation primitive. +struct inner_product_forward : public primitive { + /// Primitive descriptor for an inner product forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an inner product forward + /// propagation primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Memory descriptor for src. + /// @param weights_desc Memory descriptor for weights. + /// @param bias_desc Memory descriptor for bias. + /// @param dst_desc Memory descriptor for dst. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &weights_desc, + const memory::desc &bias_desc, const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, src_desc, weights_desc, + &bias_desc, dst_desc, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an inner product forward + /// propagation primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Memory descriptor for src. + /// @param weights_desc Memory descriptor for weights. + /// @param dst_desc Memory descriptor for dst. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &weights_desc, + const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, src_desc, weights_desc, + nullptr, dst_desc, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an inner product forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for an inner product forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::inner_product, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::convolution_forward::primitive_desc::bias_desc()const + memory::desc bias_desc() const { return base::weights_desc(1); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + private: + primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &weights_desc, + const memory::desc *bias_desc, const memory::desc &dst_desc, + const primitive_attr &attr, bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_inner_product_forward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + src_desc.get(), weights_desc.get(), + optional_arg(bias_desc), dst_desc.get(), + attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the inner product forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + inner_product_forward() = default; + + /// Constructs an inner product forward propagation primitive. + /// @param pd Primitive descriptor for an inner product forward + /// propagation primitive. 
+ inner_product_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an inner product forward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for an inner product forward + /// propagation primitive. + /// @param cache_blob Cache blob. + inner_product_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Inner product backward propagation primitive. +struct inner_product_backward_data : public primitive { + /// Primitive descriptor for an inner product backward propagation + /// primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an inner product backward + /// propagation primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param diff_src_desc Memory descriptor for diff src. + /// @param weights_desc Memory descriptor for weights. + /// @param diff_dst_desc Memory descriptor for diff dst. + /// @param hint_fwd_pd Primitive descriptor for an inner product + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const memory::desc &diff_src_desc, + const memory::desc &weights_desc, + const memory::desc &diff_dst_desc, + const inner_product_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_inner_product_backward_data_primitive_desc_create( + &pd, aengine.get(), diff_src_desc.get(), + weights_desc.get(), diff_dst_desc.get(), + hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the inner product backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for an inner product backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for an inner product backward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::inner_product, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { return base::weights_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + }; + + /// Default constructor. Produces an empty object. + inner_product_backward_data() = default; + + /// Constructs an inner product backward propagation primitive. 
+ /// @param pd Primitive descriptor for an inner product backward + /// propagation primitive. + inner_product_backward_data(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an inner product backward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for an inner product backward + /// propagation primitive. + /// @param cache_blob Cache blob. + inner_product_backward_data( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Inner product weights gradient primitive. +struct inner_product_backward_weights : public primitive { + /// Primitive descriptor for an inner product weights gradient primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an inner product weights + /// update primitive with bias. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param src_desc Memory descriptor for src. + /// @param diff_weights_desc Memory descriptor for diff weights. + /// @param diff_bias_desc Memory descriptor for diff bias. + /// @param diff_dst_desc Memory descriptor for diff dst. + /// @param hint_fwd_pd Primitive descriptor for an inner product + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_desc, + const inner_product_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, src_desc, diff_weights_desc, + &diff_bias_desc, diff_dst_desc, hint_fwd_pd, attr, + allow_empty) {} + + /// Constructs a primitive descriptor for an inner product weights + /// update primitive. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param src_desc Memory descriptor for src. + /// @param diff_weights_desc Memory descriptor for diff weights. + /// @param diff_dst_desc Memory descriptor for diff dst. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param hint_fwd_pd Primitive descriptor for an inner product + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_dst_desc, + const inner_product_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, src_desc, diff_weights_desc, nullptr, + diff_dst_desc, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an inner product weights + /// update primitive from a C API primitive descriptor that must + /// have a matching kind. 
+ /// + /// @param pd C API primitive descriptor for an inner product weights + /// gradient primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::inner_product, + dnnl::prop_kind::backward_weights) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const + memory::desc diff_weights_desc() const { + return base::diff_weights_desc(0); + } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::convolution_backward_weights::primitive_desc::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return base::diff_weights_desc(1); + } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + private: + primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &diff_weights_desc, + const memory::desc *diff_bias_desc, + const memory::desc &diff_dst_desc, + const inner_product_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_inner_product_backward_weights_primitive_desc_create( + &pd, aengine.get(), src_desc.get(), + diff_weights_desc.get(), + optional_arg(diff_bias_desc), diff_dst_desc.get(), + hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the inner product weights gradient primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. 
+ inner_product_backward_weights() = default; + + /// Constructs an inner product weights gradient primitive. + /// @param pd Primitive descriptor for an inner product weights gradient + /// primitive. + inner_product_backward_weights(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an inner product weights gradient primitive from a cache + /// blob. + /// @param pd Primitive descriptor for an inner product weights gradient + /// primitive. + /// @param cache_blob Cache blob. + inner_product_backward_weights( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_inner_product + +/// @addtogroup dnnl_api_rnn RNN +/// +/// A primitive to compute recurrent neural network layers. +/// +/// @sa @ref dev_guide_rnn in developer guide +/// +/// @{ + +/// Base class for primitive descriptors for RNN primitives. +struct rnn_primitive_desc_base : public primitive_desc { + using primitive_desc::primitive_desc; + + /// Default constructor. Produces an empty object. + rnn_primitive_desc_base() = default; + + /// Constructs an RNN primitive descriptor base from a C API primitive + /// descriptor while checking that it actually describes the expected + /// primitive by comparing propagation and primitive kinds. + /// + /// @param pd C API primitive descriptor. + /// @param aprop_kind Expected propagation kind. + /// @param cell_kind Expected cell kind. + rnn_primitive_desc_base(dnnl_primitive_desc_t pd, + dnnl::prop_kind aprop_kind, dnnl::algorithm cell_kind) + : rnn_primitive_desc_base(pd, aprop_kind, aprop_kind, cell_kind) {} + + /// Returns source layer memory descriptor. + /// @returns Source layer memory descriptor. + memory::desc src_layer_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_SRC_LAYER); + } + + /// Returns AUGRU attention memory descriptor. + /// @returns AUGRU attention memory descriptor. 
+ memory::desc augru_attention_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_AUGRU_ATTENTION); + } + + /// Returns source iteration memory descriptor. + /// @returns Source iteration memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// source iteration parameter. + memory::desc src_iter_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_SRC_ITER); + } + + /// Returns source recurrent cell state memory descriptor. + /// @returns Source recurrent cell state memory descriptor. + memory::desc src_iter_c_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_SRC_ITER_C); + } + + /// Returns weights layer memory descriptor. + /// @returns Weights layer memory descriptor. + memory::desc weights_layer_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_LAYER); + } + + /// Returns weights iteration memory descriptor. + /// @returns Weights iteration memory descriptor. + memory::desc weights_iter_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_ITER); + } + + /// Returns weights peephole memory descriptor. + /// @returns Weights peephole memory descriptor. + memory::desc weights_peephole_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_PEEPHOLE); + } + + /// Returns weights projection memory descriptor. + /// @returns Weights projection memory descriptor. + memory::desc weights_projection_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_PROJECTION); + } + + /// Returns bias memory descriptor. + /// @returns Bias memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// bias parameter. + memory::desc bias_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_BIAS); + } + + /// Returns destination layer memory descriptor. + /// @returns Destination layer memory descriptor. 
+ memory::desc dst_layer_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DST_LAYER); + } + + /// Returns destination iteration memory descriptor. + /// @returns Destination iteration memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// destination iteration parameter. + memory::desc dst_iter_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DST_ITER); + } + + /// Returns destination recurrent cell state memory descriptor. + /// @returns Destination recurrent cell state memory descriptor. + memory::desc dst_iter_c_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DST_ITER_C); + } + + /// Returns diff source layer memory descriptor. + /// @returns Diff source layer memory descriptor. + memory::desc diff_src_layer_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_SRC_LAYER); + } + + /// Returns diff AUGRU attention memory descriptor. + /// @returns Diff AUGRU attention memory descriptor. + memory::desc diff_augru_attention_desc() const { + return base::query_md( + query::exec_arg_md, DNNL_ARG_DIFF_AUGRU_ATTENTION); + } + + /// Returns diff source iteration memory descriptor. + /// @returns Diff source iteration memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff source iteration parameter. + memory::desc diff_src_iter_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_SRC_ITER); + } + + /// Returns diff source recurrent cell state memory descriptor. + /// @returns Diff source recurrent cell state memory descriptor. + memory::desc diff_src_iter_c_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_SRC_ITER_C); + } + + /// Returns diff weights layer memory descriptor. + /// @returns Diff weights layer memory descriptor. 
+ memory::desc diff_weights_layer_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_WEIGHTS_LAYER); + } + + /// Returns diff weights iteration memory descriptor. + /// @returns Diff weights iteration memory descriptor. + memory::desc diff_weights_iter_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_WEIGHTS_ITER); + } + + /// Returns diff weights peephole memory descriptor. + /// @returns Diff weights peephole memory descriptor. + memory::desc diff_weights_peephole_desc() const { + return base::query_md( + query::exec_arg_md, DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE); + } + + /// Returns diff weights projection memory descriptor. + /// @returns Diff weights projection memory descriptor. + memory::desc diff_weights_projection_desc() const { + return base::query_md( + query::exec_arg_md, DNNL_ARG_DIFF_WEIGHTS_PROJECTION); + } + + /// Returns diff bias memory descriptor. + /// @returns Diff bias memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff bias parameter. + memory::desc diff_bias_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_BIAS); + } + + /// Returns diff destination layer memory descriptor. + /// @returns Diff destination layer memory descriptor. + memory::desc diff_dst_layer_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_DST_LAYER); + } + + /// Returns diff destination iteration memory descriptor. + /// @returns Diff destination iteration memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff destination iteration parameter. + memory::desc diff_dst_iter_desc() const { + return base::query_md(query::exec_arg_md, DNNL_ARG_DIFF_DST_ITER); + } + + /// Returns diff destination recurrent cell state memory descriptor. + /// @returns Diff destination recurrent cell state memory descriptor. 
+    memory::desc diff_dst_iter_c_desc() const {
+        const int arg_id = DNNL_ARG_DIFF_DST_ITER_C;
+        return base::query_md(query::exec_arg_md, arg_id);
+    }
+
+protected:
+    using rnn_base = rnn_primitive_desc_base;
+
+    // (Deliberately not using doxygen comments)
+    //
+    // Wraps a C API primitive descriptor after verifying that it describes
+    // the expected RNN primitive: the primitive kind must be RNN, the
+    // propagation kind must equal one of the two accepted kinds, and the
+    // cell kind must equal the expected one. Two propagation kinds are
+    // accepted so that callers can allow both inference and training
+    // forward propagation.
+    //
+    // @param pd C API primitive descriptor.
+    // @param prop_kind1 First accepted propagation kind.
+    // @param prop_kind2 Second accepted propagation kind.
+    // @param cell_kind Expected cell kind.
+    rnn_primitive_desc_base(dnnl_primitive_desc_t pd,
+            dnnl::prop_kind prop_kind1, dnnl::prop_kind prop_kind2,
+            dnnl::algorithm cell_kind) {
+
+        // Query the three kinds in turn; every failed query is reported
+        // through wrap_c_api before the next one is attempted.
+        dnnl_primitive_kind_t queried_primitive_kind;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                pd, dnnl_query_primitive_kind, 0, &queried_primitive_kind);
+        error::wrap_c_api(status,
+                "could not retrieve a primitive kind from a primitive "
+                "descriptor for an RNN primitive");
+
+        dnnl_prop_kind_t queried_prop_kind;
+        status = dnnl_primitive_desc_query(
+                pd, dnnl_query_prop_kind, 0, &queried_prop_kind);
+        error::wrap_c_api(status,
+                "could not retrieve a propagation kind from a primitive "
+                "descriptor for an RNN primitive");
+
+        dnnl_alg_kind_t queried_cell_kind;
+        status = dnnl_primitive_desc_query(
+                pd, dnnl_query_cell_kind, 0, &queried_cell_kind);
+        error::wrap_c_api(status,
+                "could not retrieve a cell kind from a primitive descriptor "
+                "for an RNN primitive");
+
+        const bool kind_matches = queried_primitive_kind == dnnl_rnn;
+        const bool prop_matches
+                = queried_prop_kind == convert_to_c(prop_kind1)
+                || queried_prop_kind == convert_to_c(prop_kind2);
+        const bool cell_matches
+                = queried_cell_kind == convert_to_c(cell_kind);
+
+        if (!(kind_matches && prop_matches && cell_matches)) {
+            DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                    "mismatch between expected and provided descriptors for an "
+                    "RNN primitive");
+        }
+
+        reset_with_clone(pd);
+    }
+
+    // Builds a forward propagation RNN primitive descriptor base, picking
+    // the C API creation routine that corresponds to cell_kind.
+    rnn_primitive_desc_base(const engine &aengine, algorithm cell_kind,
+            prop_kind aprop_kind, algorithm activation, rnn_direction direction,
+            const memory::desc &src_layer_desc,
+            const memory::desc &src_iter_desc,
+            const memory::desc *src_iter_c_desc,
+            const memory::desc *attention_desc,
+            const memory::desc &weights_layer_desc,
+            const memory::desc &weights_iter_desc,
+            const memory::desc *weights_peephole_desc,
+            const memory::desc *weights_projection_desc,
+            const memory::desc &bias_desc, const memory::desc &dst_layer_desc,
+            const memory::desc &dst_iter_desc,
+            const memory::desc *dst_iter_c_desc, rnn_flags flags, float alpha,
+            float beta, const primitive_attr &attr, bool allow_empty) {
+
+        dnnl_primitive_desc_t c_pd = nullptr;
+        dnnl_status_t rc = dnnl_success;
+        // Fallback text used when cell_kind matches none of the cases below.
+        const char *err_msg
+                = "could not create a primitive descriptor for a requested "
+                  "cell kind";
+
+        switch (cell_kind) {
+            case algorithm::vanilla_rnn:
+                rc = dnnl_vanilla_rnn_forward_primitive_desc_create(&c_pd,
+                        aengine.get(),
+                        dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(activation),
+                        dnnl::convert_to_c(direction),
+                        src_layer_desc.get(), src_iter_desc.get(),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), alpha, beta, attr.get());
+                err_msg = "could not create a primitive descriptor for "
+                          "the vanilla RNN forward propagation primitive. Run "
+                          "workload with environment variable ONEDNN_VERBOSE=all "
+                          "to get additional diagnostic information.";
+                break;
+            case algorithm::vanilla_lstm:
+                rc = dnnl_lstm_forward_primitive_desc_create(&c_pd,
+                        aengine.get(),
+                        dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction),
+                        src_layer_desc.get(), src_iter_desc.get(),
+                        optional_arg(src_iter_c_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        optional_arg(weights_peephole_desc),
+                        optional_arg(weights_projection_desc),
+                        bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        optional_arg(dst_iter_c_desc),
+                        convert_to_c(flags), attr.get());
+                err_msg = "could not create a primitive descriptor for "
+                          "the LSTM forward propagation primitive. Run workload "
+                          "with environment variable ONEDNN_VERBOSE=all to get "
+                          "additional diagnostic information.";
+                break;
+            case algorithm::vanilla_gru:
+                rc = dnnl_gru_forward_primitive_desc_create(&c_pd,
+                        aengine.get(),
+                        dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction),
+                        src_layer_desc.get(), src_iter_desc.get(),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), attr.get());
+                err_msg = "could not create a primitive descriptor for "
+                          "the GRU forward propagation primitive. Run workload "
+                          "with environment variable ONEDNN_VERBOSE=all to get "
+                          "additional diagnostic information.";
+                break;
+            case algorithm::lbr_gru:
+                rc = dnnl_lbr_gru_forward_primitive_desc_create(&c_pd,
+                        aengine.get(),
+                        dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction),
+                        src_layer_desc.get(), src_iter_desc.get(),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), attr.get());
+                err_msg = "could not create a primitive descriptor for "
+                          "the LBR GRU forward propagation primitive. Run workload "
+                          "with environment variable ONEDNN_VERBOSE=all to get "
+                          "additional diagnostic information.";
+                break;
+            case algorithm::vanilla_augru:
+                rc = dnnl_augru_forward_primitive_desc_create(&c_pd,
+                        aengine.get(),
+                        dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction),
+                        src_layer_desc.get(), src_iter_desc.get(),
+                        optional_arg(attention_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), attr.get());
+                err_msg = "could not create a primitive descriptor for "
+                          "the AUGRU forward propagation primitive. Run workload "
+                          "with environment variable ONEDNN_VERBOSE=all to get "
+                          "additional diagnostic information.";
+                break;
+            case algorithm::lbr_augru:
+                rc = dnnl_lbr_augru_forward_primitive_desc_create(&c_pd,
+                        aengine.get(),
+                        dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction),
+                        src_layer_desc.get(), src_iter_desc.get(),
+                        optional_arg(attention_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), attr.get());
+                err_msg = "could not create a primitive descriptor for "
+                          "the LBR AUGRU forward propagation primitive. Run "
+                          "workload with environment variable ONEDNN_VERBOSE=all "
+                          "to get additional diagnostic information.";
+                break;
+            default: rc = dnnl_unimplemented; break;
+        }
+
+        // With allow_empty set the status is not checked and the handle is
+        // stored as produced by the C API.
+        if (!allow_empty) error::wrap_c_api(rc, err_msg);
+        reset(c_pd);
+    }
+
+    // Constructs an RNN backward propagation primitive descriptor base for
+    // any cell kind.
+ rnn_primitive_desc_base(const engine &aengine, algorithm cell_kind, + prop_kind aprop_kind, algorithm activation, rnn_direction direction, + const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc *src_iter_c_desc, + const memory::desc *attention_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc *weights_peephole_desc, + const memory::desc *weights_projection_desc, + const memory::desc &bias_desc, const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc *dst_iter_c_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc *diff_src_iter_c_desc, + const memory::desc *diff_attention_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc *diff_weights_peephole_desc, + const memory::desc *diff_weights_projection_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const memory::desc *diff_dst_iter_c_desc, rnn_flags flags, + float alpha, float beta, const rnn_primitive_desc_base &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + dnnl_status_t status = dnnl_success; + const char *msg = ""; + + dnnl_primitive_desc_t pd = nullptr; + switch (cell_kind) { + case algorithm::vanilla_rnn: + status = dnnl_vanilla_rnn_backward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(activation), + dnnl::convert_to_c(direction), src_layer_desc.get(), + src_iter_desc.get(), weights_layer_desc.get(), + weights_iter_desc.get(), bias_desc.get(), + dst_layer_desc.get(), dst_iter_desc.get(), + diff_src_layer_desc.get(), diff_src_iter_desc.get(), + diff_weights_layer_desc.get(), + diff_weights_iter_desc.get(), diff_bias_desc.get(), + diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), + 
convert_to_c(flags), alpha, beta, hint_fwd_pd.get(), + attr.get()); + msg = "could not create a primitive descriptor for " + "the vanilla RNN backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; + break; + case algorithm::vanilla_lstm: + status = dnnl_lstm_backward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(direction), src_layer_desc.get(), + src_iter_desc.get(), optional_arg(src_iter_c_desc), + weights_layer_desc.get(), weights_iter_desc.get(), + optional_arg(weights_peephole_desc), + optional_arg(weights_projection_desc), bias_desc.get(), + dst_layer_desc.get(), dst_iter_desc.get(), + optional_arg(dst_iter_c_desc), + diff_src_layer_desc.get(), diff_src_iter_desc.get(), + optional_arg(diff_src_iter_c_desc), + diff_weights_layer_desc.get(), + diff_weights_iter_desc.get(), + optional_arg(diff_weights_peephole_desc), + optional_arg(diff_weights_projection_desc), + diff_bias_desc.get(), diff_dst_layer_desc.get(), + diff_dst_iter_desc.get(), + optional_arg(diff_dst_iter_c_desc), convert_to_c(flags), + hint_fwd_pd.get(), attr.get()); + msg = "could not create a primitive descriptor for " + "the LSTM backward propagation primitive. 
Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; + break; + case algorithm::vanilla_gru: + status = dnnl_gru_backward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(direction), src_layer_desc.get(), + src_iter_desc.get(), weights_layer_desc.get(), + weights_iter_desc.get(), bias_desc.get(), + dst_layer_desc.get(), dst_iter_desc.get(), + diff_src_layer_desc.get(), diff_src_iter_desc.get(), + diff_weights_layer_desc.get(), + diff_weights_iter_desc.get(), diff_bias_desc.get(), + diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), + convert_to_c(flags), hint_fwd_pd.get(), attr.get()); + msg = "could not create a primitive descriptor for " + "the GRU backward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; + break; + case algorithm::lbr_gru: + status = dnnl_lbr_gru_backward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(direction), src_layer_desc.get(), + src_iter_desc.get(), weights_layer_desc.get(), + weights_iter_desc.get(), bias_desc.get(), + dst_layer_desc.get(), dst_iter_desc.get(), + diff_src_layer_desc.get(), diff_src_iter_desc.get(), + diff_weights_layer_desc.get(), + diff_weights_iter_desc.get(), diff_bias_desc.get(), + diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), + convert_to_c(flags), hint_fwd_pd.get(), attr.get()); + msg = "could not create a primitive descriptor for " + "the LBR GRU backward propagation primitive. 
Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; + break; + case algorithm::vanilla_augru: + status = dnnl_augru_backward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(direction), src_layer_desc.get(), + src_iter_desc.get(), optional_arg(attention_desc), + weights_layer_desc.get(), weights_iter_desc.get(), + bias_desc.get(), dst_layer_desc.get(), + dst_iter_desc.get(), diff_src_layer_desc.get(), + diff_src_iter_desc.get(), + optional_arg(diff_attention_desc), + diff_weights_layer_desc.get(), + diff_weights_iter_desc.get(), diff_bias_desc.get(), + diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), + convert_to_c(flags), hint_fwd_pd.get(), attr.get()); + msg = "could not create a primitive descriptor for " + "the AUGRU backward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; + break; + case algorithm::lbr_augru: + status = dnnl_lbr_augru_backward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + dnnl::convert_to_c(direction), src_layer_desc.get(), + src_iter_desc.get(), optional_arg(attention_desc), + weights_layer_desc.get(), weights_iter_desc.get(), + bias_desc.get(), dst_layer_desc.get(), + dst_iter_desc.get(), diff_src_layer_desc.get(), + diff_src_iter_desc.get(), + optional_arg(diff_attention_desc), + diff_weights_layer_desc.get(), + diff_weights_iter_desc.get(), diff_bias_desc.get(), + diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), + convert_to_c(flags), hint_fwd_pd.get(), attr.get()); + msg = "could not create a primitive descriptor for " + "the LBR AUGRU backward propagation primitive. 
Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; + break; + default: status = dnnl_unimplemented; + } + if (!allow_empty) error::wrap_c_api(status, msg); + reset(pd); + } +}; + +/// Vanilla RNN forward propagation primitive. +struct vanilla_rnn_forward : public primitive { + /// Primitive descriptor for a vanilla RNN forward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a vanilla RNN forward + /// propagation primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc. + /// + /// This would then indicate that the RNN forward propagation primitive + /// should not use them and should default to zero values instead. + /// + /// @note + /// All memory descriptors except @p src_iter_desc can be + /// initialized with an #dnnl::memory::format_tag::any value of @p + /// format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param activation Activation kind. Possible values are + /// #dnnl::algorithm::eltwise_relu, + /// #dnnl::algorithm::eltwise_tanh, or + /// #dnnl::algorithm::eltwise_logistic. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. 
+ /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm activation, rnn_direction direction, + const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn, + aprop_kind, activation, direction, src_layer_desc, + src_iter_desc, nullptr, nullptr, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, + 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a vanilla RNN forward + /// propagation primitive with alpha parameter. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc. + /// + /// This would then indicate that the RNN forward propagation primitive + /// should not use them and should default to zero values instead. + /// + /// @note + /// All memory descriptors except @p src_iter_desc can be + /// initialized with an #dnnl::memory::format_tag::any value of @p + /// format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. 
Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param activation Activation kind. Possible values are + /// #dnnl::algorithm::eltwise_relu, + /// #dnnl::algorithm::eltwise_tanh, or + /// #dnnl::algorithm::eltwise_logistic. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param alpha Negative slope if activation is + /// #dnnl::algorithm::eltwise_relu. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm activation, rnn_direction direction, + const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, float alpha, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn, + aprop_kind, activation, direction, src_layer_desc, + src_iter_desc, nullptr, nullptr, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, + alpha, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a vanilla RNN forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a vanilla RNN forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference, + dnnl::algorithm::vanilla_rnn) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_activation_kind()const + algorithm get_activation_kind() const { + return base::get_activation_kind(); + } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + + /// @copydoc dnnl::primitive_desc_base::get_alpha()const + 
float get_alpha() const { return base::get_alpha(); } + + /// @copydoc dnnl::primitive_desc_base::get_beta()const + float get_beta() const { return base::get_beta(); } + }; + + /// Default constructor. Produces an empty object. + vanilla_rnn_forward() = default; + + /// Constructs a vanilla RNN forward propagation primitive. + /// @param pd Primitive descriptor for a vanilla RNN forward + /// propagation primitive. + vanilla_rnn_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a vanilla RNN forward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for a vanilla RNN forward + /// propagation primitive. + /// @param cache_blob Cache blob. + vanilla_rnn_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Vanilla RNN backward propagation primitive. +struct vanilla_rnn_backward : public primitive { + /// Primitive descriptor for an RNN backward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a vanilla RNN backward + /// propagation primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p diff_src_iter_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p diff_dst_iter_desc. + /// + /// This would then indicate that the RNN backward propagation + /// primitive should not use the respective data and should use zero + /// values instead. + /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param activation Activation kind. 
Possible values are + /// #dnnl::algorithm::eltwise_relu, + /// #dnnl::algorithm::eltwise_tanh, or + /// #dnnl::algorithm::eltwise_logistic. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param hint_fwd_pd Primitive descriptor for a vanilla RNN + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm activation, rnn_direction direction, + const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const vanilla_rnn_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn, + aprop_kind, activation, direction, src_layer_desc, + src_iter_desc, nullptr, nullptr, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc, + diff_src_iter_desc, nullptr, nullptr, + diff_weights_layer_desc, diff_weights_iter_desc, nullptr, + nullptr, diff_bias_desc, diff_dst_layer_desc, + diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a vanilla RNN backward + /// propagation primitive with an alpha parameter. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p diff_src_iter_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p diff_dst_iter_desc. + /// + /// This would then indicate that the RNN backward propagation + /// primitive should not use the respective data and should use zero + /// values instead. 
+ /// + /// @note + /// All the memory descriptors may be initialized with the + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param activation Activation kind. Possible values are + /// #dnnl::algorithm::eltwise_relu, + /// #dnnl::algorithm::eltwise_tanh, or + /// #dnnl::algorithm::eltwise_logistic. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param alpha Negative slope if activation is + /// #dnnl::algorithm::eltwise_relu. + /// @param hint_fwd_pd Primitive descriptor for a vanilla RNN + /// forward propagation primitive. 
It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm activation, rnn_direction direction, + const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, float alpha, + const vanilla_rnn_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn, + aprop_kind, activation, direction, src_layer_desc, + src_iter_desc, nullptr, nullptr, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc, + diff_src_iter_desc, nullptr, nullptr, + diff_weights_layer_desc, diff_weights_iter_desc, nullptr, + nullptr, diff_bias_desc, diff_dst_layer_desc, + diff_dst_iter_desc, nullptr, rnn_flags::undef, alpha, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a vanilla RNN backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. 
+ /// + /// @param pd C API primitive descriptor for a vanilla RNN backward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward, + dnnl::algorithm::vanilla_rnn) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const + memory::desc diff_src_layer_desc() const { + return rnn_base::diff_src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const + memory::desc diff_src_iter_desc() const { + return rnn_base::diff_src_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const + memory::desc diff_weights_layer_desc() const { + return rnn_base::diff_weights_layer_desc(); + } + + /// @copydoc 
dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const + memory::desc diff_weights_iter_desc() const { + return rnn_base::diff_weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return rnn_base::diff_bias_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const + memory::desc diff_dst_layer_desc() const { + return rnn_base::diff_dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const + memory::desc diff_dst_iter_desc() const { + return rnn_base::diff_dst_iter_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_activation_kind()const + algorithm get_activation_kind() const { + return base::get_activation_kind(); + } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + + /// @copydoc dnnl::primitive_desc_base::get_alpha()const + float get_alpha() const { return base::get_alpha(); } + + /// @copydoc dnnl::primitive_desc_base::get_beta()const + float get_beta() const { return base::get_beta(); } + }; + + /// Default constructor. Produces an empty object. + vanilla_rnn_backward() = default; + + /// Constructs a vanilla RNN backward propagation primitive. + /// @param pd Primitive descriptor for a vanilla RNN backward + /// propagation primitive. + vanilla_rnn_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a vanilla RNN backward propagation primitive from + /// a cache blob. + /// @param pd Primitive descriptor for a vanilla RNN backward + /// propagation primitive. + /// @param cache_blob Cache blob. 
+ vanilla_rnn_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// LSTM forward propagation primitive. +struct lstm_forward : public primitive { + /// Primitive descriptor for an LSTM forward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an LSTM (with or without + /// peephole and with or without projection) forward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p src_iter_c_desc, + /// - @p weights_peephole_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc together with @p dst_iter_c_desc. + /// + /// This would then indicate that the LSTM forward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// The @p weights_projection_desc may point to a zero memory + /// descriptor. This would then indicate that the LSTM doesn't have + /// recurrent projection layer. + /// + /// @note + /// All memory descriptors can be initialized with an + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param src_iter_c_desc Memory descriptor for the input recurrent + /// cell state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. 
+ /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param weights_peephole_desc Memory descriptor for the weights + /// applied to the cell states (according to the Peephole LSTM + /// formula). + /// @param weights_projection_desc Memory descriptor for the weights + /// applied to the hidden states to get the recurrent projection + /// (according to the Projection LSTM formula). + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param dst_iter_c_desc Memory descriptor for the output recurrent + /// cell state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &src_iter_c_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &weights_peephole_desc, + const memory::desc &weights_projection_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &dst_iter_c_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, &src_iter_c_desc, nullptr, + weights_layer_desc, weights_iter_desc, + &weights_peephole_desc, &weights_projection_desc, bias_desc, + dst_layer_desc, dst_iter_desc, &dst_iter_c_desc, + rnn_flags::undef, 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an LSTM (with or without + /// peephole) forward propagation primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p src_iter_c_desc, + /// - @p weights_peephole_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc together with @p dst_iter_c_desc. + /// + /// This would then indicate that the LSTM forward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors can be initialized with an + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. 
+ /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param src_iter_c_desc Memory descriptor for the input recurrent + /// cell state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param weights_peephole_desc Memory descriptor for the weights + /// applied to the cell states (according to the Peephole LSTM + /// formula). + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param dst_iter_c_desc Memory descriptor for the output recurrent + /// cell state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &src_iter_c_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &weights_peephole_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &dst_iter_c_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, &src_iter_c_desc, nullptr, + weights_layer_desc, weights_iter_desc, + &weights_peephole_desc, nullptr, bias_desc, dst_layer_desc, + dst_iter_desc, &dst_iter_c_desc, rnn_flags::undef, 0.0f, + 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an LSTM forward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p src_iter_c_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc together with @p dst_iter_c_desc. + /// + /// This would then indicate that the LSTM forward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors can be initialized with an + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. 
+ /// @param src_iter_c_desc Memory descriptor for the input recurrent + /// cell state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param dst_iter_c_desc Memory descriptor for the output recurrent + /// cell state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &src_iter_c_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &dst_iter_c_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, &src_iter_c_desc, nullptr, + weights_layer_desc, weights_iter_desc, nullptr, nullptr, + bias_desc, dst_layer_desc, dst_iter_desc, &dst_iter_c_desc, + rnn_flags::undef, 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an LSTM forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. 
+ /// + /// @param pd C API primitive descriptor for an LSTM forward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference, + dnnl::algorithm::vanilla_lstm) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_c_desc()const + memory::desc src_iter_c_desc() const { + return rnn_base::src_iter_c_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_peephole_desc()const + memory::desc weights_peephole_desc() const { + return rnn_base::weights_peephole_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_projection_desc()const + memory::desc weights_projection_desc() const { + return rnn_base::weights_projection_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_c_desc()const + memory::desc dst_iter_c_desc() const { + return rnn_base::dst_iter_c_desc(); + } + + /// @copydoc 
dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + lstm_forward() = default; + + /// Constructs an LSTM forward propagation primitive. + /// @param pd Primitive descriptor for an LSTM forward propagation + /// primitive. + lstm_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LSTM forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LSTM forward propagation + /// primitive. + /// @param cache_blob Cache blob. + lstm_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// LSTM backward propagation primitive. +struct lstm_backward : public primitive { + /// Primitive descriptor for an LSTM backward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs an LSTM (with or without peephole and with or without + /// projection) primitive descriptor for backward propagation + /// using @p prop_kind, @p direction, and memory descriptors. 
+ /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p src_iter_c_desc, + /// @p diff_src_iter_desc, and @p diff_src_iter_c_desc, + /// - @p weights_peephole_desc together with + /// @p diff_weights_peephole_desc + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p dst_iter_c_desc, + /// @p diff_dst_iter_desc, and @p diff_dst_iter_c_desc. + /// + /// This would then indicate that the LSTM backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// The @p weights_projection_desc together with @p + /// diff_weights_projection_desc may point to a zero memory descriptor. + /// This would then indicate that the LSTM doesn't have recurrent + /// projection layer. + /// + /// @note + /// All memory descriptors can be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param src_iter_c_desc Memory descriptor for the input recurrent + /// cell state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param weights_peephole_desc Memory descriptor for the weights + /// applied to the cell states (according to the Peephole LSTM + /// formula). + /// @param weights_projection_desc Memory descriptor for the weights + /// applied to the hidden states to get the recurrent projection + /// (according to the Projection LSTM formula). 
+ /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param dst_iter_c_desc Memory descriptor for the output recurrent + /// cell state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_src_iter_c_desc Memory descriptor for the diff of + /// input recurrent cell state vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_weights_peephole_desc Memory descriptor for the diff of + /// weights applied to the cell states (according to the Peephole + /// LSTM formula). + /// @param diff_weights_projection_desc Memory descriptor for the diff + /// of weights applied to the hidden states to get the recurrent + /// projection (according to the Projection LSTM formula). + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param diff_dst_iter_c_desc Memory descriptor for the diff of + /// output recurrent cell state vector. + /// @param hint_fwd_pd Primitive descriptor for an LSTM + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. 
In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &src_iter_c_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &weights_peephole_desc, + const memory::desc &weights_projection_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &dst_iter_c_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_src_iter_c_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_weights_peephole_desc, + const memory::desc &diff_weights_projection_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const memory::desc &diff_dst_iter_c_desc, + const lstm_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, &src_iter_c_desc, nullptr, + weights_layer_desc, weights_iter_desc, + &weights_peephole_desc, &weights_projection_desc, bias_desc, + dst_layer_desc, dst_iter_desc, &dst_iter_c_desc, + diff_src_layer_desc, diff_src_iter_desc, + &diff_src_iter_c_desc, nullptr, diff_weights_layer_desc, + diff_weights_iter_desc, &diff_weights_peephole_desc, + &diff_weights_projection_desc, diff_bias_desc, + diff_dst_layer_desc, diff_dst_iter_desc, + &diff_dst_iter_c_desc, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs an LSTM (with or without peephole) primitive descriptor + /// for backward 
propagation using @p prop_kind, @p direction, + /// and memory descriptors. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p src_iter_c_desc, + /// @p diff_src_iter_desc, and @p diff_src_iter_c_desc, + /// - @p weights_peephole_desc together with + /// @p diff_weights_peephole_desc + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p dst_iter_c_desc, + /// @p diff_dst_iter_desc, and @p diff_dst_iter_c_desc. + /// + /// This would then indicate that the LSTM backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param src_iter_c_desc Memory descriptor for the input recurrent + /// cell state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param weights_peephole_desc Memory descriptor for the weights + /// applied to the cell states (according to the Peephole LSTM + /// formula). + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param dst_iter_c_desc Memory descriptor for the output recurrent + /// cell state vector. 
+ /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_src_iter_c_desc Memory descriptor for the diff of + /// input recurrent cell state vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_weights_peephole_desc Memory descriptor for the diff of + /// weights applied to the cell states (according to the Peephole + /// LSTM formula). + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param diff_dst_iter_c_desc Memory descriptor for the diff of + /// output recurrent cell state vector. + /// @param hint_fwd_pd Primitive descriptor for an LSTM + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &src_iter_c_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &weights_peephole_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &dst_iter_c_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_src_iter_c_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_weights_peephole_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const memory::desc &diff_dst_iter_c_desc, + const lstm_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, &src_iter_c_desc, nullptr, + weights_layer_desc, weights_iter_desc, + &weights_peephole_desc, nullptr, bias_desc, dst_layer_desc, + dst_iter_desc, &dst_iter_c_desc, diff_src_layer_desc, + diff_src_iter_desc, &diff_src_iter_c_desc, nullptr, + diff_weights_layer_desc, diff_weights_iter_desc, + &diff_weights_peephole_desc, nullptr, diff_bias_desc, + diff_dst_layer_desc, diff_dst_iter_desc, + &diff_dst_iter_c_desc, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs an LSTM primitive descriptor for backward propagation + /// using @p prop_kind, @p direction, and memory descriptors. 
+ /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p src_iter_c_desc, + /// @p diff_src_iter_desc, and @p diff_src_iter_c_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p dst_iter_c_desc, + /// @p diff_dst_iter_desc, and @p diff_dst_iter_c_desc. + /// + /// This would then indicate that the LSTM backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param src_iter_c_desc Memory descriptor for the input recurrent + /// cell state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param dst_iter_c_desc Memory descriptor for the output recurrent + /// cell state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_src_iter_c_desc Memory descriptor for the diff of + /// input recurrent cell state vector. 
+ /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param diff_dst_iter_c_desc Memory descriptor for the diff of + /// output recurrent cell state vector. + /// @param hint_fwd_pd Primitive descriptor for an LSTM + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &src_iter_c_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &dst_iter_c_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_src_iter_c_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const memory::desc &diff_dst_iter_c_desc, + const lstm_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, &src_iter_c_desc, nullptr, + weights_layer_desc, weights_iter_desc, nullptr, nullptr, + bias_desc, dst_layer_desc, dst_iter_desc, &dst_iter_c_desc, + diff_src_layer_desc, diff_src_iter_desc, + &diff_src_iter_c_desc, nullptr, diff_weights_layer_desc, + diff_weights_iter_desc, nullptr, nullptr, diff_bias_desc, + diff_dst_layer_desc, diff_dst_iter_desc, + &diff_dst_iter_c_desc, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an LSTM backward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for an LSTM backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward, + dnnl::algorithm::vanilla_lstm) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_c_desc()const + memory::desc src_iter_c_desc() const { + return rnn_base::src_iter_c_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_peephole_desc()const + memory::desc weights_peephole_desc() const { + return rnn_base::weights_peephole_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_projection_desc()const + memory::desc weights_projection_desc() const { + return rnn_base::weights_projection_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_c_desc()const + memory::desc dst_iter_c_desc() const { + return rnn_base::dst_iter_c_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// 
@copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const + memory::desc diff_src_layer_desc() const { + return rnn_base::diff_src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const + memory::desc diff_src_iter_desc() const { + return rnn_base::diff_src_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_c_desc()const + memory::desc diff_src_iter_c_desc() const { + return rnn_base::diff_src_iter_c_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const + memory::desc diff_weights_layer_desc() const { + return rnn_base::diff_weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const + memory::desc diff_weights_iter_desc() const { + return rnn_base::diff_weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_peephole_desc()const + memory::desc diff_weights_peephole_desc() const { + return rnn_base::diff_weights_peephole_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_projection_desc()const + memory::desc diff_weights_projection_desc() const { + return rnn_base::diff_weights_projection_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return rnn_base::diff_bias_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const + memory::desc diff_dst_layer_desc() const { + return rnn_base::diff_dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const + memory::desc diff_dst_iter_desc() const { + return rnn_base::diff_dst_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_c_desc()const + memory::desc diff_dst_iter_c_desc() const { + return rnn_base::diff_dst_iter_c_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return 
base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + lstm_backward() = default; + + /// Constructs an LSTM backward propagation primitive. + /// @param pd Primitive descriptor for an LSTM backward propagation + /// primitive. + lstm_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LSTM backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LSTM backward propagation + /// primitive. + /// @param cache_blob Cache blob. + lstm_backward( + const primitive_desc &pd, const std::vector<uint8_t> &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// GRU forward propagation primitive. +struct gru_forward : public primitive { + /// Primitive descriptor for a GRU forward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a GRU forward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc. + /// + /// This would then indicate that the GRU forward propagation primitive + /// should not use them and should default to zero values instead. + /// + /// @note + /// All memory descriptors except @p src_iter_desc may be + /// initialized with an #dnnl::memory::format_tag::any value of @p + /// format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction.
See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_gru, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, nullptr, nullptr, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, + 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a GRU forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a GRU forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference, + dnnl::algorithm::vanilla_gru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + gru_forward() = default; + + /// Constructs a GRU forward propagation primitive. 
+ /// @param pd Primitive descriptor for a GRU forward propagation + /// primitive. + gru_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a GRU forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a GRU forward propagation + /// primitive. + /// @param cache_blob Cache blob. + gru_forward( + const primitive_desc &pd, const std::vector<uint8_t> &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// GRU backward propagation primitive. +struct gru_backward : public primitive { + /// Primitive descriptor for a GRU backward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a GRU backward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p diff_src_iter_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p diff_dst_iter_desc. + /// + /// This would then indicate that the GRU backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input.
+ /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param hint_fwd_pd Primitive descriptor for a GRU + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const gru_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_gru, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, nullptr, nullptr, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc, + diff_src_iter_desc, nullptr, nullptr, + diff_weights_layer_desc, diff_weights_iter_desc, nullptr, + nullptr, diff_bias_desc, diff_dst_layer_desc, + diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a GRU backward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a GRU backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward, + dnnl::algorithm::vanilla_gru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const + memory::desc diff_src_layer_desc() const { + return rnn_base::diff_src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const + memory::desc diff_src_iter_desc() const { + return rnn_base::diff_src_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const + memory::desc diff_weights_layer_desc() const { + return rnn_base::diff_weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const + memory::desc diff_weights_iter_desc() const { + return 
rnn_base::diff_weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return rnn_base::diff_bias_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const + memory::desc diff_dst_layer_desc() const { + return rnn_base::diff_dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const + memory::desc diff_dst_iter_desc() const { + return rnn_base::diff_dst_iter_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + gru_backward() = default; + + /// Constructs a GRU backward propagation primitive. + /// @param pd Primitive descriptor for a GRU backward propagation + /// primitive. + gru_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a GRU backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a GRU backward propagation + /// primitive. + /// @param cache_blob Cache blob. + gru_backward( + const primitive_desc &pd, const std::vector<uint8_t> &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// LBR GRU forward propagation primitive. +struct lbr_gru_forward : public primitive { + /// Primitive descriptor for an LBR GRU forward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an LBR GRU forward propagation + /// primitive.
+ /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc. + /// + /// This would then indicate that the LBR GRU forward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors except @p src_iter_desc may be + /// initialized with an #dnnl::memory::format_tag::any value of @p + /// format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::lbr_gru, aprop_kind, + algorithm::undef, direction, src_layer_desc, src_iter_desc, + nullptr, nullptr, weights_layer_desc, weights_iter_desc, + nullptr, nullptr, bias_desc, dst_layer_desc, dst_iter_desc, + nullptr, rnn_flags::undef, 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a LBR GRU forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a LBR GRU forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference, + dnnl::algorithm::lbr_gru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + lbr_gru_forward() = default; + + /// Constructs an LBR GRU forward propagation primitive. 
+ /// @param pd Primitive descriptor for an LBR GRU forward propagation + /// primitive. + lbr_gru_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LBR GRU forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LBR GRU forward propagation + /// primitive. + /// @param cache_blob Cache blob. + lbr_gru_forward( + const primitive_desc &pd, const std::vector<uint8_t> &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// LBR GRU backward propagation primitive. +struct lbr_gru_backward : public primitive { + /// Primitive descriptor for an LBR GRU backward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an LBR GRU backward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p diff_src_iter_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p diff_dst_iter_desc. + /// + /// This would then indicate that the LBR GRU backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input.
+ /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param hint_fwd_pd Primitive descriptor for an LBR GRU + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const lbr_gru_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::lbr_gru, aprop_kind, + algorithm::undef, direction, src_layer_desc, src_iter_desc, + nullptr, nullptr, weights_layer_desc, weights_iter_desc, + nullptr, nullptr, bias_desc, dst_layer_desc, dst_iter_desc, + nullptr, diff_src_layer_desc, diff_src_iter_desc, nullptr, + nullptr, diff_weights_layer_desc, diff_weights_iter_desc, + nullptr, nullptr, diff_bias_desc, diff_dst_layer_desc, + diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a LBR GRU backward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a LBR GRU backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base( + pd, dnnl::prop_kind::backward, dnnl::algorithm::lbr_gru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const + memory::desc diff_src_layer_desc() const { + return rnn_base::diff_src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const + memory::desc diff_src_iter_desc() const { + return rnn_base::diff_src_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const + memory::desc diff_weights_layer_desc() const { + return rnn_base::diff_weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const + memory::desc diff_weights_iter_desc() const { + return 
rnn_base::diff_weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return rnn_base::diff_bias_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const + memory::desc diff_dst_layer_desc() const { + return rnn_base::diff_dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const + memory::desc diff_dst_iter_desc() const { + return rnn_base::diff_dst_iter_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + lbr_gru_backward() = default; + + /// Constructs an LBR GRU backward propagation primitive. + /// @param pd Primitive descriptor for an LBR GRU backward propagation + /// primitive. + lbr_gru_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LBR GRU backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LBR GRU backward propagation + /// primitive. + /// @param cache_blob Cache blob. + lbr_gru_backward( + const primitive_desc &pd, const std::vector<uint8_t> &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// AUGRU forward propagation primitive. +struct augru_forward : public primitive { + /// Primitive descriptor for an AUGRU forward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an AUGRU forward propagation + /// primitive.
+ /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc. + /// + /// This would then indicate that the AUGRU forward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors except @p src_iter_desc may be + /// initialized with an #dnnl::memory::format_tag::any value of @p + /// format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param attention_desc Memory descriptor for the attention vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &attention_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_augru, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, nullptr, &attention_desc, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, + 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an AUGRU forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for an AUGRU forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference, + dnnl::algorithm::vanilla_augru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const + memory::desc attention_desc() const { + return rnn_base::augru_attention_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. 
Produces an empty object. + augru_forward() = default; + + /// Constructs an AUGRU forward propagation primitive. + /// @param pd Primitive descriptor for an AUGRU forward propagation + /// primitive. + augru_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an AUGRU forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an AUGRU forward propagation + /// primitive. + /// @param cache_blob Cache blob. + augru_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// AUGRU backward propagation primitive. +struct augru_backward : public primitive { + /// Descriptor for an AUGRU backward propagation primitive. + /// Primitive descriptor for an AUGRU backward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an AUGRU backward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p diff_src_iter_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p diff_dst_iter_desc. + /// + /// This would then indicate that the AUGRU backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. 
+ /// @param attention_desc Memory descriptor for the attention vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_attention_desc Memory descriptor for the diff of + /// attention vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param hint_fwd_pd Primitive descriptor for an AUGRU + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &attention_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_attention_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const augru_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::vanilla_augru, + aprop_kind, algorithm::undef, direction, src_layer_desc, + src_iter_desc, nullptr, &attention_desc, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc, + diff_src_iter_desc, nullptr, &diff_attention_desc, + diff_weights_layer_desc, diff_weights_iter_desc, nullptr, + nullptr, diff_bias_desc, diff_dst_layer_desc, + diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an AUGRU backward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for an AUGRU backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward, + dnnl::algorithm::vanilla_augru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const + memory::desc attention_desc() const { + return rnn_base::augru_attention_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const + memory::desc diff_src_layer_desc() const { + return rnn_base::diff_src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const + memory::desc diff_src_iter_desc() const { + return rnn_base::diff_src_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_augru_attention_desc()const + memory::desc diff_attention_desc() const { + return 
rnn_base::diff_augru_attention_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const + memory::desc diff_weights_layer_desc() const { + return rnn_base::diff_weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const + memory::desc diff_weights_iter_desc() const { + return rnn_base::diff_weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return rnn_base::diff_bias_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const + memory::desc diff_dst_layer_desc() const { + return rnn_base::diff_dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const + memory::desc diff_dst_iter_desc() const { + return rnn_base::diff_dst_iter_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + augru_backward() = default; + + /// Constructs an AUGRU backward propagation primitive. + /// @param pd Primitive descriptor for an AUGRU backward propagation + /// primitive. + augru_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an AUGRU backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an AUGRU backward propagation + /// primitive. + /// @param cache_blob Cache blob. + augru_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// LBR AUGRU forward propagation primitive. 
+struct lbr_augru_forward : public primitive { + /// Descriptor for an LBR AUGRU forward propagation primitive. + + /// Primitive descriptor for an LBR AUGRU forward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for LBR AUGRU forward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc, + /// - @p bias_desc, + /// - @p dst_iter_desc. + /// + /// This would then indicate that the LBR AUGRU forward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors except @p src_iter_desc may be + /// initialized with an #dnnl::memory::format_tag::any value of @p + /// format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. + /// @param attention_desc Memory descriptor for the attention vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. 
+ /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &attention_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::lbr_augru, aprop_kind, + algorithm::undef, direction, src_layer_desc, src_iter_desc, + nullptr, &attention_desc, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, + 0.0f, 0.0f, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an LBR AUGRU forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for an LBR AUGRU forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference, + dnnl::algorithm::lbr_augru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const + memory::desc attention_desc() const { + return rnn_base::augru_attention_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. 
Produces an empty object. + lbr_augru_forward() = default; + + /// Constructs an LBR AUGRU forward propagation primitive. + /// @param pd Primitive descriptor for an LBR AUGRU forward propagation + /// primitive. + lbr_augru_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LBR AUGRU forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LBR AUGRU forward propagation + /// primitive. + /// @param cache_blob Cache blob. + lbr_augru_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// LBR AUGRU backward propagation primitive. +struct lbr_augru_backward : public primitive { + /// Primitive descriptor for an LBR AUGRU backward propagation primitive. + struct primitive_desc : public rnn_primitive_desc_base { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for LBR AUGRU backward propagation + /// primitive. + /// + /// The following arguments may point to a zero memory descriptor: + /// - @p src_iter_desc together with @p diff_src_iter_desc, + /// - @p bias_desc together with @p diff_bias_desc, + /// - @p dst_iter_desc together with @p diff_dst_iter_desc. + /// + /// This would then indicate that the LBR AUGRU backward propagation + /// primitive should not use them and should default to zero values + /// instead. + /// + /// @note + /// All memory descriptors may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Must be + /// #dnnl::prop_kind::backward. + /// @param direction RNN direction. See @ref dnnl::rnn_direction for + /// more info. + /// @param src_layer_desc Memory descriptor for the input vector. + /// @param src_iter_desc Memory descriptor for the input recurrent + /// hidden state vector. 
+ /// @param attention_desc Memory descriptor for the attention vector. + /// @param weights_layer_desc Memory descriptor for the weights + /// applied to the layer input. + /// @param weights_iter_desc Memory descriptor for the weights applied + /// to the recurrent input. + /// @param bias_desc Bias memory descriptor. + /// @param dst_layer_desc Memory descriptor for the output vector. + /// @param dst_iter_desc Memory descriptor for the output recurrent + /// hidden state vector. + /// @param diff_src_layer_desc Memory descriptor for the diff of input + /// vector. + /// @param diff_src_iter_desc Memory descriptor for the diff of input + /// recurrent hidden state vector. + /// @param diff_attention_desc Memory descriptor for the diff of + /// attention vector. + /// @param diff_weights_layer_desc Memory descriptor for the diff of + /// weights applied to the layer input. + /// @param diff_weights_iter_desc Memory descriptor for the diff of + /// weights applied to the recurrent input. + /// @param diff_bias_desc Diff bias memory descriptor. + /// @param diff_dst_layer_desc Memory descriptor for the diff of + /// output vector. + /// @param diff_dst_iter_desc Memory descriptor for the diff of output + /// recurrent hidden state vector. + /// @param hint_fwd_pd Primitive descriptor for an LBR AUGRU + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + rnn_direction direction, const memory::desc &src_layer_desc, + const memory::desc &src_iter_desc, + const memory::desc &attention_desc, + const memory::desc &weights_layer_desc, + const memory::desc &weights_iter_desc, + const memory::desc &bias_desc, + const memory::desc &dst_layer_desc, + const memory::desc &dst_iter_desc, + const memory::desc &diff_src_layer_desc, + const memory::desc &diff_src_iter_desc, + const memory::desc &diff_attention_desc, + const memory::desc &diff_weights_layer_desc, + const memory::desc &diff_weights_iter_desc, + const memory::desc &diff_bias_desc, + const memory::desc &diff_dst_layer_desc, + const memory::desc &diff_dst_iter_desc, + const lbr_augru_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : rnn_primitive_desc_base(aengine, algorithm::lbr_augru, aprop_kind, + algorithm::undef, direction, src_layer_desc, src_iter_desc, + nullptr, &attention_desc, weights_layer_desc, + weights_iter_desc, nullptr, nullptr, bias_desc, + dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc, + diff_src_iter_desc, nullptr, &diff_attention_desc, + diff_weights_layer_desc, diff_weights_iter_desc, nullptr, + nullptr, diff_bias_desc, diff_dst_layer_desc, + diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f, + hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for an LBR AUGRU backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for an LBR AUGRU backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward, + dnnl::algorithm::lbr_augru) {} + + /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const + memory::desc src_layer_desc() const { + return rnn_base::src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const + memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const + memory::desc attention_desc() const { + return rnn_base::augru_attention_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const + memory::desc weights_layer_desc() const { + return rnn_base::weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const + memory::desc weights_iter_desc() const { + return rnn_base::weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const + memory::desc bias_desc() const { return rnn_base::bias_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const + memory::desc dst_layer_desc() const { + return rnn_base::dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const + memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); } + + /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { + return rnn_base::workspace_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const + memory::desc diff_src_layer_desc() const { + return rnn_base::diff_src_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const + memory::desc diff_src_iter_desc() const { + return rnn_base::diff_src_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_augru_attention_desc()const + memory::desc diff_attention_desc() const { + return rnn_base::diff_augru_attention_desc(); + 
} + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const + memory::desc diff_weights_layer_desc() const { + return rnn_base::diff_weights_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const + memory::desc diff_weights_iter_desc() const { + return rnn_base::diff_weights_iter_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const + memory::desc diff_bias_desc() const { + return rnn_base::diff_bias_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const + memory::desc diff_dst_layer_desc() const { + return rnn_base::diff_dst_layer_desc(); + } + + /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const + memory::desc diff_dst_iter_desc() const { + return rnn_base::diff_dst_iter_desc(); + } + + /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const + algorithm get_cell_kind() const { return base::get_cell_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_direction()const + rnn_direction get_direction() const { return base::get_direction(); } + }; + + /// Default constructor. Produces an empty object. + lbr_augru_backward() = default; + + /// Constructs an LBR AUGRU backward propagation primitive. + /// @param pd Primitive descriptor for an LBR AUGRU backward propagation + /// primitive. + lbr_augru_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an LBR AUGRU backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for an LBR AUGRU backward propagation + /// primitive. + /// @param cache_blob Cache blob. 
+ lbr_augru_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_rnn + +/// @addtogroup dnnl_api_shuffle Shuffle +/// +/// A primitive to shuffle tensor data along an axis. +/// +/// @sa @ref dev_guide_shuffle in developer guide +/// +/// @{ + +/// Shuffle forward propagation primitive. +struct shuffle_forward : public primitive { + /// Primitive descriptor for a shuffle forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a shuffle forward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param axis The axis along which the data is shuffled. + /// @param group_size Shuffle group size. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &dst_desc, + int axis, int group_size, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_shuffle_forward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + src_desc.get(), dst_desc.get(), axis, group_size, + attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the shuffle forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a shuffle forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a shuffle forward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::shuffle, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_axis()const + int get_axis() const { return base::get_axis(); } + + /// @copydoc dnnl::primitive_desc_base::get_group_size()const + memory::dim get_group_size() const { return base::get_group_size(); } + }; + + /// Default constructor. Produces an empty object. + shuffle_forward() = default; + + /// Constructs a shuffle forward propagation primitive. 
+ /// @param pd Primitive descriptor for a shuffle forward propagation + /// primitive. + shuffle_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a shuffle forward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a shuffle forward propagation + /// primitive. + /// @param cache_blob Cache blob. + shuffle_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Shuffle backward propagation primitive. +struct shuffle_backward : public primitive { + /// Primitive descriptor for a shuffle backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a shuffle backward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param axis The axis along which the data is shuffled. + /// @param group_size Shuffle group size. + /// @param hint_fwd_pd Primitive descriptor for a shuffle forward + /// propagation primitive. It is used as a hint for deciding which + /// memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, int axis, int group_size, + const shuffle_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_shuffle_backward_primitive_desc_create( + &pd, aengine.get(), diff_src_desc.get(), + diff_dst_desc.get(), axis, group_size, hint_fwd_pd.get(), + attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the shuffle backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a shuffle backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a shuffle backward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::shuffle, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_axis()const + int get_axis() const { return base::get_axis(); } + + /// @copydoc dnnl::primitive_desc_base::get_group_size()const + memory::dim get_group_size() const { return base::get_group_size(); } + }; + + /// Default constructor. Produces an empty object. + shuffle_backward() = default; + + /// Constructs a shuffle backward propagation primitive. 
+ /// @param pd Primitive descriptor for a shuffle backward propagation + /// primitive. + shuffle_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a shuffle backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a shuffle backward propagation + /// primitive. + /// @param cache_blob Cache blob. + shuffle_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_shuffle + +/// @addtogroup dnnl_api_binary Binary +/// +/// A primitive to perform tensor operations over two tensors. +/// +/// @sa @ref dev_guide_binary in developer guide +/// +/// @{ + +/// Elementwise binary operator primitive. +struct binary : public primitive { + /// Primitive descriptor for an elementwise binary operator primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for an elementwise binary operator + /// primitive. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Elementwise binary algorithm. + /// @param src0 Memory descriptor for source tensor #0. + /// @param src1 Memory descriptor for source tensor #1. + /// @param dst Memory descriptor for destination tensor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src0, const memory::desc &src1, + const memory::desc &dst, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_binary_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aalgorithm), src0.get(), + src1.get(), dst.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the binary operation primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for an elementwise binary operator + /// primitive with support of ternary operators. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Elementwise binary algorithm. + /// @param src0 Memory descriptor for source tensor #0. + /// @param src1 Memory descriptor for source tensor #1. + /// @param src2 Memory descriptor for source tensor #2 for ternary + /// operations. Might be empty. + /// @param dst Memory descriptor for destination tensor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src0, const memory::desc &src1, + const memory::desc &src2, const memory::desc &dst, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_binary_primitive_desc_create_v2(&pd, + aengine.get(), dnnl::convert_to_c(aalgorithm), src0.get(), + src1.get(), src2.get(), dst.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the binary v2 operation primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a binary primitive from a C + /// API primitive descriptor that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a binary primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::binary) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc(int)const + memory::desc src_desc(int idx = 0) const { return base::src_desc(idx); } + + /// Returns the memory descriptor for source #0. + memory::desc src0_desc() const { return base::src_desc(0); } + + /// Returns the memory descriptor for source #1. + memory::desc src1_desc() const { return base::src_desc(1); } + + /// Returns the memory descriptor for source #2. + memory::desc src2_desc() const { return base::src_desc(2); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + }; + + /// Default constructor. Produces an empty object. + binary() = default; + + /// Constructs an elementwise binary operation primitive. 
+ /// @param pd Primitive descriptor for an elementwise binary operation + /// primitive. + binary(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs an elementwise binary operation primitive from a cache blob. + /// @param pd Primitive descriptor for an elementwise binary operation + /// primitive. + /// @param cache_blob Cache blob. + binary(const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_binary + +/// @addtogroup dnnl_api_matmul Matrix Multiplication +/// +/// A primitive to perform matrix-matrix multiplication. The batched mode +/// is supported with 3D tensors. +/// +/// @sa @ref dev_guide_matmul in developer guide +/// +/// +/// @{ + +/// Matrix multiplication (matmul) primitive. +struct matmul : public primitive { + /// Primitive descriptor for a matmul primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a matmul primitive + /// without bias. + /// + /// @param aengine Engine to use. + /// @param src_desc Memory descriptor for source (matrix A). + /// @param weights_desc Memory descriptor for weights (matrix B). + /// @param dst_desc Memory descriptor for destination (matrix C). + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, src_desc, weights_desc, nullptr, dst_desc, + attr, allow_empty) {} + + /// Constructs a primitive descriptor for a matmul primitive with bias. + /// + /// @param aengine Engine to use. + /// @param src_desc Memory descriptor for source (matrix A). + /// @param weights_desc Memory descriptor for weights (matrix B). + /// @param dst_desc Memory descriptor for destination (matrix C). + /// @param bias_desc Memory descriptor for bias. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc &bias_desc, + const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, src_desc, weights_desc, &bias_desc, + dst_desc, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a matmul primitive from a C + /// API primitive descriptor that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a matmul primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::matmul) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return query_md(query::src_md, 0); } + + /// @copydoc dnnl::primitive_desc_base::weights_desc()const + memory::desc weights_desc() const { + return query_md(query::weights_md, 0); + } + + /// @copydoc dnnl::convolution_forward::primitive_desc::bias_desc()const + memory::desc bias_desc() const { + return query_md(query::weights_md, 1); + } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return query_md(query::dst_md, 0); } + + private: + primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &weights_desc, const memory::desc *bias_desc, + const memory::desc &dst_desc, const primitive_attr &attr, + bool allow_empty) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_matmul_primitive_desc_create(&pd, + aengine.get(), src_desc.get(), weights_desc.get(), + optional_arg(bias_desc), dst_desc.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the matmul primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + matmul() = default; + + /// Constructs a matmul primitive. + /// @param pd Primitive descriptor for a matmul primitive. + matmul(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a matmul primitive from a cache blob. + /// @param pd Primitive descriptor for a matmul primitive. + /// @param cache_blob Cache blob. 
+ matmul(const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_matmul + +/// @addtogroup dnnl_api_resampling Resampling +/// +/// A primitive to compute resampling operation on 1D, 2D or 3D data tensor +/// using Nearest Neighbor, or Linear (Bilinear, Trilinear) interpolation +/// method. +/// +/// @sa @ref dev_guide_resampling in developer guide +/// +/// @{ + +/// Resampling forward propagation. +struct resampling_forward : public primitive { + /// Primitive descriptor for a resampling forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a resampling forward + /// propagation primitive using source and destination memory + /// descriptors. + /// + /// @note + /// Destination memory descriptor may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm resampling algorithm kind: either + /// #dnnl::algorithm::resampling_nearest, or + /// #dnnl::algorithm::resampling_linear + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, nullptr, src_desc, + &dst_desc, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a resampling forward + /// propagation primitive using source memory descriptor and + /// factors. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm resampling algorithm kind: either + /// #dnnl::algorithm::resampling_nearest, or + /// #dnnl::algorithm::resampling_linear + /// @param factors Vector of scaling factors for spatial dimension. + /// @param src_desc Source memory descriptor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const std::vector &factors, + const memory::desc &src_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, &factors, + src_desc, nullptr, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a resampling forward + /// propagation primitive. + /// + /// @note + /// The destination memory descriptor may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. 
Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm resampling algorithm kind: either + /// #dnnl::algorithm::resampling_nearest, or + /// #dnnl::algorithm::resampling_linear + /// @param factors Vector of scaling factors for spatial dimension. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const std::vector &factors, + const memory::desc &src_desc, const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aprop_kind, aalgorithm, &factors, + src_desc, &dst_desc, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a resampling forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a resampling forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::resampling, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + private: + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const std::vector *factors, + const memory::desc &src_desc, const memory::desc *dst_desc, + const primitive_attr &attr, bool allow_empty) { + + if (factors) + memory::validate_dims(*factors, src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_resampling_forward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + convert_to_c(aalgorithm), optional_arg(factors), + src_desc.get(), optional_arg(dst_desc), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the resampling forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + resampling_forward() = default; + + /// Constructs a resampling forward propagation primitive. + /// @param pd Primitive descriptor for a resampling forward propagation + /// primitive. + resampling_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a resampling forward propagation primitive from a cache + /// blob. + /// @param pd Primitive descriptor for a resampling forward propagation + /// primitive. + /// @param cache_blob Cache blob. + resampling_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Resampling backward propagation primitive. 
+struct resampling_backward : public primitive { + /// Primitive descriptor for resampling backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a resampling backward + /// propagation primitive using source and destination memory + /// descriptors. + /// + /// @param aengine Engine to use. + /// @param aalgorithm resampling algorithm kind: either + /// #dnnl::algorithm::resampling_nearest, or + /// #dnnl::algorithm::resampling_linear + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param hint_fwd_pd Primitive descriptor for a resampling + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const resampling_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, nullptr, diff_src_desc, + diff_dst_desc, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for resampling backward + /// propagation primitive. + /// + /// @param aengine Engine to use. + /// @param aalgorithm resampling algorithm kind: either + /// #dnnl::algorithm::resampling_nearest, or + /// #dnnl::algorithm::resampling_linear + /// @param factors Vector of scaling factors for spatial dimension. 
+ /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param hint_fwd_pd Primitive descriptor for a resampling + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const std::vector &factors, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const resampling_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) + : primitive_desc(aengine, aalgorithm, &factors, diff_src_desc, + diff_dst_desc, hint_fwd_pd, attr, allow_empty) {} + + /// Constructs a primitive descriptor for a resampling backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a resampling backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::resampling, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + private: + primitive_desc(const engine &aengine, algorithm aalgorithm, + const std::vector *factors, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, + const resampling_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr, bool allow_empty) { + + if (factors) + memory::validate_dims(*factors, diff_src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status + = dnnl_resampling_backward_primitive_desc_create(&pd, + aengine.get(), convert_to_c(aalgorithm), + optional_arg(factors), diff_src_desc.get(), + diff_dst_desc.get(), hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the resampling backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + }; + + /// Default constructor. Produces an empty object. + resampling_backward() = default; + + /// Constructs a resampling backward propagation primitive. + /// @param pd Primitive descriptor for a resampling backward propagation + /// primitive. + resampling_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a resampling backward propagation primitive from a cache + /// blob. + /// @param pd Primitive descriptor for a resampling backward propagation + /// primitive. + /// @param cache_blob Cache blob. 
+ resampling_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_resampling + +/// @addtogroup dnnl_api_pooling Pooling +/// +/// A primitive to perform max or average pooling with dilation. +/// +/// @sa @ref dev_guide_pooling in developer guide +/// +/// @{ + +/// Pooling forward propagation primitive. +struct pooling_forward : public primitive { + /// Primitive descriptor for a pooling forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for pooling forward propagation + /// primitive. + /// + /// Arrays @p strides, @p kernel, @p dilation, @p padding_l + /// and @p padding_r contain values for spatial dimensions only and + /// hence must have the same number of elements as there are spatial + /// dimensions. The order of values is the same as in the tensor: + /// depth (for 3D tensors), height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param aalgorithm Pooling algorithm kind: either + /// #dnnl::algorithm::pooling_max, + /// #dnnl::algorithm::pooling_avg_include_padding, + /// or #dnnl::algorithm::pooling_avg_exclude_padding. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param strides Vector of strides for spatial dimension. + /// @param kernel Vector of kernel spatial dimensions. + /// @param dilation Array of dilations for spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. 
+ /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, prop_kind aprop_kind, + algorithm aalgorithm, const memory::desc &src_desc, + const memory::desc &dst_desc, const memory::dims &strides, + const memory::dims &kernel, const memory::dims &dilation, + const memory::dims &padding_l, const memory::dims &padding_r, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + memory::validate_dims(strides, src_desc.get_ndims() - 2); + memory::validate_dims(kernel, src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, src_desc.get_ndims() - 2); + memory::validate_dims(dilation, src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_pooling_forward_primitive_desc_create( + &pd, aengine.get(), dnnl::convert_to_c(aprop_kind), + convert_to_c(aalgorithm), src_desc.get(), dst_desc.get(), + &strides[0], &kernel[0], &dilation[0], &padding_l[0], + &padding_r[0], attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a descriptor for a pooling forward " + "propagation primitive"); + reset(pd); + } + + /// Constructs a primitive descriptor for a pooling forward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a pooling forward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::pooling, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_kernel()const + memory::dims get_kernel() const { return base::get_kernel(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + }; + + /// Default constructor. Produces an empty object. + pooling_forward() = default; + + /// Constructs a pooling forward propagation primitive. + /// + /// @param pd Primitive descriptor for a pooling forward propagation + /// primitive. + pooling_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a pooling forward propagation primitive from a cache blob. + /// + /// @param pd Primitive descriptor for a pooling forward propagation + /// primitive. 
+ /// @param cache_blob Cache blob. + pooling_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// Pooling backward propagation primitive. +struct pooling_backward : public primitive { + /// Primitive descriptor for a pooling backward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a pooling backward propagation + /// primitive. + /// + /// Arrays @p strides, @p kernel, @p dilation, @p padding_l + /// and @p padding_r contain values for spatial dimensions only and + /// hence must have the same number of elements as there are spatial + /// dimensions. The order of values is the same as in the tensor: + /// depth (for 3D tensors), height (for 3D and 2D tensors), and width. + /// + /// @param aengine Engine to use. + /// @param aalgorithm Pooling algorithm kind: either + /// #dnnl::algorithm::pooling_max, + /// #dnnl::algorithm::pooling_avg_include_padding, + /// or #dnnl::algorithm::pooling_avg_exclude_padding. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param strides Vector of strides for spatial dimension. + /// @param kernel Vector of kernel spatial dimensions. + /// @param dilation Array of dilations for spatial dimension. + /// @param padding_l Vector of padding values for low indices for each + /// spatial dimension `([[front,] top,] left)`. + /// @param padding_r Vector of padding values for high indices for + /// each spatial dimension `([[back,] bottom,] right)`. + /// @param hint_fwd_pd Primitive descriptor for a pooling + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. 
+ /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &diff_src_desc, + const memory::desc &diff_dst_desc, const memory::dims &strides, + const memory::dims &kernel, const memory::dims &dilation, + const memory::dims &padding_l, const memory::dims &padding_r, + const pooling_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + memory::validate_dims(strides, diff_src_desc.get_ndims() - 2); + memory::validate_dims(kernel, diff_src_desc.get_ndims() - 2); + memory::validate_dims(padding_l, diff_src_desc.get_ndims() - 2); + memory::validate_dims(padding_r, diff_src_desc.get_ndims() - 2); + memory::validate_dims(dilation, diff_src_desc.get_ndims() - 2); + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_pooling_backward_primitive_desc_create( + &pd, aengine.get(), convert_to_c(aalgorithm), + diff_src_desc.get(), diff_dst_desc.get(), &strides[0], + &kernel[0], &dilation[0], &padding_l[0], &padding_r[0], + hint_fwd_pd.get(), attr.get()); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a descriptor for a pooling backward " + "propagation primitive"); + reset(pd); + } + + /// Constructs a primitive descriptor for a pooling backward propagation + /// primitive from a C API primitive descriptor that must have a + /// matching kind. + /// + /// @param pd C API primitive descriptor for a pooling backward + /// propagation primitive. 
+ primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::pooling, + dnnl::prop_kind::backward_data) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::workspace_desc()const + memory::desc workspace_desc() const { return base::workspace_desc(); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + + /// @copydoc dnnl::primitive_desc_base::get_strides()const + memory::dims get_strides() const { return base::get_strides(); } + + /// @copydoc dnnl::primitive_desc_base::get_kernel()const + memory::dims get_kernel() const { return base::get_kernel(); } + + /// @copydoc dnnl::primitive_desc_base::get_dilations()const + memory::dims get_dilations() const { return base::get_dilations(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_l()const + memory::dims get_padding_l() const { return base::get_padding_l(); } + + /// @copydoc dnnl::primitive_desc_base::get_padding_r()const + memory::dims get_padding_r() const { return base::get_padding_r(); } + }; + + /// Default constructor. Produces an empty object. + pooling_backward() = default; + + /// Constructs a pooling backward propagation primitive. + /// + /// @param pd Primitive descriptor for a pooling backward propagation + /// primitive. + pooling_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a pooling backward propagation primitive from a cache blob. + /// + /// @param pd Primitive descriptor for a pooling backward propagation + /// primitive. 
+ /// @param cache_blob Cache blob. + pooling_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_pooling + +/// @addtogroup dnnl_api_prelu PReLU +/// +/// PReLU primitive +/// A primitive to perform PReLU (leaky ReLU with trainable alpha parameter) +/// +/// @sa @ref dev_guide_prelu in developer guide +/// +/// @{ + +/// PReLU forward propagation primitive. +struct prelu_forward : public primitive { + /// Primitive descriptor for a PReLU forward propagation primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a PReLU forward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param aprop_kind Propagation kind. Possible values are + /// #dnnl::prop_kind::forward_training, and + /// #dnnl::prop_kind::forward_inference. + /// @param src_desc Source memory descriptor. + /// @param weight_desc Alpha parameters memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, prop_kind aprop_kind, + const memory::desc &src_desc, const memory::desc &weight_desc, + const memory::desc &dst_desc, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_prelu_forward_primitive_desc_create(&pd, + aengine.get(), dnnl::convert_to_c(aprop_kind), + src_desc.get(), weight_desc.get(), dst_desc.get(), + attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the prelu forward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a prelu forward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a prelu forward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::prelu, + dnnl::prop_kind::forward_training, + dnnl::prop_kind::forward_inference) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + }; + + /// Default constructor. Produces an empty object. + prelu_forward() = default; + + /// Constructs a prelu forward propagation primitive. + /// @param pd Primitive descriptor for a prelu forward propagation + /// primitive. + prelu_forward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a prelu forward propagation primitive from a cache blob. 
+ /// @param pd Primitive descriptor for a prelu forward propagation + /// primitive. + /// @param cache_blob Cache blob. + prelu_forward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// PReLU backward propagation primitive. +struct prelu_backward : public primitive { + /// Primitive descriptor for prelu backward propagation. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a descriptor for a PReLU backward propagation + /// primitive. + /// + /// @param aengine Engine to use. + /// @param src_desc Source memory descriptor. + /// @param weight_desc Alpha parameters memory descriptor. + /// @param diff_src_desc Diff source memory descriptor. + /// @param diff_weights_desc Diff alpha parameters memory descriptor. + /// @param diff_dst_desc Diff destination memory descriptor. + /// @param hint_fwd_pd Primitive descriptor for a PReLU + /// forward propagation primitive. It is used as a hint for + /// deciding which memory format to use. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. 
+ primitive_desc(const engine &aengine, const memory::desc &src_desc, + const memory::desc &weight_desc, + const memory::desc &diff_src_desc, + const memory::desc &diff_weights_desc, + const memory::desc &diff_dst_desc, + const prelu_forward::primitive_desc &hint_fwd_pd, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_prelu_backward_primitive_desc_create( + &pd, aengine.get(), src_desc.get(), weight_desc.get(), + diff_src_desc.get(), diff_weights_desc.get(), + diff_dst_desc.get(), hint_fwd_pd.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the prelu backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a prelu backward + /// propagation primitive from a C API primitive descriptor that must + /// have a matching kind. + /// + /// @param pd C API primitive descriptor for a prelu backward + /// propagation primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::prelu, + dnnl::prop_kind::backward) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const + memory::desc diff_src_desc() const { return base::diff_src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const + memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const + prop_kind get_prop_kind() const { return base::get_prop_kind(); } + }; + + /// Default constructor. Produces an empty object. + prelu_backward() = default; + + /// Constructs a prelu backward propagation primitive. 
+ /// @param pd Primitive descriptor for a prelu backward propagation + /// primitive. + prelu_backward(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a prelu backward propagation primitive from a cache blob. + /// @param pd Primitive descriptor for a prelu backward propagation + /// primitive. + /// @param cache_blob Cache blob. + prelu_backward( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_prelu + +/// @addtogroup dnnl_api_reduction Reduction +/// +/// A primitive to compute reduction operation on data tensor +/// using min, max, mul, sum, mean and norm_lp operations. +/// +/// @sa @ref dev_guide_reduction in developer guide +/// +/// @{ + +/// Reduction. +struct reduction : public primitive { + /// Primitive descriptor for a reduction primitive. + struct primitive_desc : public dnnl::primitive_desc { + /// Default constructor. Produces an empty object. + primitive_desc() = default; + + /// Constructs a primitive descriptor for a reduction primitive using + /// algorithm specific parameters, source and destination memory + /// descriptors. + /// + /// @note + /// Destination memory descriptor may be initialized with + /// #dnnl::memory::format_tag::any value of @p format_tag. + /// + /// @param aengine Engine to use. + /// @param aalgorithm reduction algorithm kind. Possible values: + /// #dnnl_reduction_max, #dnnl_reduction_min, #dnnl_reduction_sum, + /// #dnnl_reduction_mul, #dnnl_reduction_mean, + /// #dnnl_reduction_norm_lp_max, #dnnl_reduction_norm_lp_sum, + /// #dnnl_reduction_norm_lp_power_p_max, + /// #dnnl_reduction_norm_lp_power_p_sum. + /// @param p algorithm specific parameter. + /// @param eps algorithm specific parameter. + /// @param src_desc Source memory descriptor. + /// @param dst_desc Destination memory descriptor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. 
+ /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src_desc, const memory::desc &dst_desc, + float p, float eps, const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_reduction_primitive_desc_create(&pd, + aengine.get(), convert_to_c(aalgorithm), src_desc.get(), + dst_desc.get(), p, eps, attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the reduction primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for a reduction primitive from a C + /// API primitive descriptor that must have a matching kind. + /// + /// @param pd C API primitive descriptor for a reduction primitive. + primitive_desc(dnnl_primitive_desc_t pd) + : dnnl::primitive_desc(pd, dnnl::primitive::kind::reduction) {} + + /// @copydoc dnnl::primitive_desc_base::src_desc()const + memory::desc src_desc() const { return base::src_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::dst_desc()const + memory::desc dst_desc() const { return base::dst_desc(0); } + + /// @copydoc dnnl::primitive_desc_base::get_p()const + float get_p() const { return base::get_p(); } + + /// @copydoc dnnl::primitive_desc_base::get_epsilon()const + float get_epsilon() const { return base::get_epsilon(); } + + /// @copydoc dnnl::primitive_desc_base::get_algorithm()const + algorithm get_algorithm() const { return base::get_algorithm(); } + }; + + /// Default constructor. Produces an empty object. + reduction() = default; + + /// Constructs a reduction primitive. 
+ /// @param pd Primitive descriptor for a reduction primitive. + reduction(const primitive_desc &pd) : primitive(pd) {} + + /// Constructs a reduction primitive from a cache blob. + /// @param pd Primitive descriptor for a reduction primitive. + /// @param cache_blob Cache blob. + reduction(const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd, cache_blob) {} +}; + +/// @} dnnl_api_reduction + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_service Service +/// +/// A set of functions that aid in oneDNN debugging and profiling. +/// +/// @{ + +/// @copydoc dnnl_version_t +using version_t = dnnl_version_t; + +/// Status values returned by the library functions. +enum class status { + /// @copydoc dnnl_success + success = dnnl_success, + /// @copydoc dnnl_out_of_memory + out_of_memory = dnnl_out_of_memory, + /// @copydoc dnnl_invalid_arguments + invalid_arguments = dnnl_invalid_arguments, + /// @copydoc dnnl_unimplemented + unimplemented = dnnl_unimplemented, + /// @copydoc dnnl_last_impl_reached + last_impl_reached = dnnl_last_impl_reached, + /// @copydoc dnnl_runtime_error + runtime_error = dnnl_runtime_error, + /// @copydoc dnnl_not_required + not_required = dnnl_not_required, +}; + +/// @copydoc dnnl_set_verbose() +inline status set_verbose(int level) { + return static_cast(dnnl_set_verbose(level)); +} + +/// @copydoc dnnl_version() +inline const version_t *version() { + return dnnl_version(); +} + +/// Returns the floating-point math mode that will be used by default +/// for all subsequently created primitives. +/// +/// @returns Output FP math mode. 
+inline fpmath_mode get_default_fpmath_mode() { + dnnl_fpmath_mode_t mode; + error::wrap_c_api(dnnl_get_default_fpmath_mode(&mode), + "could not get a default fpmath mode"); + return static_cast(mode); +} + +/// @copydoc dnnl_set_default_fpmath_mode() +inline status set_default_fpmath_mode(fpmath_mode mode) { + return static_cast( + dnnl_set_default_fpmath_mode(convert_to_c(mode))); +} + +/// @copydoc dnnl_set_jit_dump() +inline status set_jit_dump(int enable) { + return static_cast(dnnl_set_jit_dump(enable)); +} + +/// @copydoc dnnl_set_jit_profiling_flags() +inline status set_jit_profiling_flags(unsigned flags) { + return static_cast(dnnl_set_jit_profiling_flags(flags)); +} + +/// @copydoc dnnl_set_jit_profiling_jitdumpdir() +inline status set_jit_profiling_jitdumpdir(const std::string &dir) { + return static_cast(dnnl_set_jit_profiling_jitdumpdir(dir.c_str())); +} + +/// @copydoc dnnl_cpu_isa_t +enum class cpu_isa { + /// @copydoc dnnl_cpu_isa_default + isa_default = dnnl_cpu_isa_default, + /// @copydoc dnnl_cpu_isa_sse41 + sse41 = dnnl_cpu_isa_sse41, + /// @copydoc dnnl_cpu_isa_avx + avx = dnnl_cpu_isa_avx, + /// @copydoc dnnl_cpu_isa_avx2 + avx2 = dnnl_cpu_isa_avx2, + /// @copydoc dnnl_cpu_isa_avx2_vnni + avx2_vnni = dnnl_cpu_isa_avx2_vnni, + /// @copydoc dnnl_cpu_isa_avx2_vnni_2 + avx2_vnni_2 = dnnl_cpu_isa_avx2_vnni_2, + /// @copydoc dnnl_cpu_isa_avx512_core + avx512_core = dnnl_cpu_isa_avx512_core, + /// @copydoc dnnl_cpu_isa_avx512_core_vnni + avx512_core_vnni = dnnl_cpu_isa_avx512_core_vnni, + /// @copydoc dnnl_cpu_isa_avx512_core_bf16 + avx512_core_bf16 = dnnl_cpu_isa_avx512_core_bf16, + /// @copydoc dnnl_cpu_isa_avx10_1_512 + avx10_1_512 = dnnl_cpu_isa_avx10_1_512, + /// @copydoc dnnl_cpu_isa_avx512_core_fp16 + avx512_core_fp16 = dnnl_cpu_isa_avx512_core_fp16, + /// @copydoc dnnl_cpu_isa_avx10_1_512_amx + avx10_1_512_amx = dnnl_cpu_isa_avx10_1_512_amx, + /// @copydoc dnnl_cpu_isa_avx512_core_amx + avx512_core_amx = dnnl_cpu_isa_avx512_core_amx, + /// 
@copydoc dnnl_cpu_isa_avx10_1_512_amx_fp16 + avx10_1_512_amx_fp16 = dnnl_cpu_isa_avx10_1_512_amx_fp16, + /// @copydoc dnnl_cpu_isa_avx512_core_amx_fp16 + avx512_core_amx_fp16 = dnnl_cpu_isa_avx512_core_amx_fp16, +}; + +/// @copydoc dnnl_set_max_cpu_isa() +inline status set_max_cpu_isa(cpu_isa isa) { + return static_cast( + dnnl_set_max_cpu_isa(static_cast(isa))); +} + +/// @copydoc dnnl_get_effective_cpu_isa() +inline cpu_isa get_effective_cpu_isa() { + return static_cast(dnnl_get_effective_cpu_isa()); +} + +/// @copydoc dnnl_cpu_isa_hints_t +enum class cpu_isa_hints { + /// @copydoc dnnl_cpu_isa_no_hints + no_hints = dnnl_cpu_isa_no_hints, + /// @copydoc dnnl_cpu_isa_prefer_ymm + prefer_ymm = dnnl_cpu_isa_prefer_ymm, +}; + +/// @copydoc dnnl_set_cpu_isa_hints() +inline status set_cpu_isa_hints(cpu_isa_hints isa_hints) { + return static_cast(dnnl_set_cpu_isa_hints( + static_cast(isa_hints))); +} + +/// @copydoc dnnl_get_cpu_isa_hints() +inline cpu_isa_hints get_cpu_isa_hints() { + return static_cast(dnnl_get_cpu_isa_hints()); +} + +/// @} dnnl_api_service + +#ifdef DNNL_EXPERIMENTAL_PROFILING +/// @addtogroup dnnl_api_profiling Profiling +/// @{ + +/// Profiling data kind. +enum class profiling_data_kind { + /// Undefined profiling data kind. + undef = dnnl_profiling_data_kind_undef, + /// Data kind to query an execution time in nanoseconds. + time = dnnl_profiling_data_kind_time, +}; + +/// Resets a profiler's state. +/// +/// @param stream Stream associated with the profiler. +inline void reset_profiling(stream &stream) { + error::wrap_c_api( + dnnl_reset_profiling(stream.get()), "could not reset profiling"); +} + +/// Returns requested profiling data. The profiling data accumulates for each +/// primitive execution. The size of the vector will be equal to the number +/// of executions since the last `dnnl::reset_profiling` call. +/// +/// The profiling data can be reset by calling #dnnl::reset_profiling. 
+/// +/// @note +/// It is required to wait for all submitted primitives to complete +/// using #dnnl::stream::wait prior to querying profiling data. +/// +/// @param stream Stream that was used for executing a primitive that +/// is being profiled. +/// @param data_kind Profiling data kind to query. +/// +/// @returns A vector with the requested profiling data. +inline std::vector get_profiling_data( + stream &stream, profiling_data_kind data_kind) { + int num_entries = 0; + error::wrap_c_api( + dnnl_query_profiling_data(stream.get(), + static_cast(data_kind), + &num_entries, nullptr), + "could not get number of entries for profiling data"); + + if (num_entries == 0) return {}; + + std::vector data(num_entries); + error::wrap_c_api( + dnnl_query_profiling_data(stream.get(), + static_cast(data_kind), + &num_entries, data.data()), + "could not get profiling data"); + return data; +} + +/// @} dnnl_api_profiling +#endif + +/// @addtogroup dnnl_api_primitive_cache Primitive Cache +/// +/// A set of functions that provide primitive cache control. +/// +/// @{ + +/// Returns the number of primitives that can be held in the primitive cache +/// at the same time. +inline int get_primitive_cache_capacity() { + int result = 0; + error::wrap_c_api(dnnl_get_primitive_cache_capacity(&result), + "could not get primitive cache capacity"); + return result; +} + +/// @copydoc dnnl_set_primitive_cache_capacity(int capacity) +inline void set_primitive_cache_capacity(int capacity) { + error::wrap_c_api(dnnl_set_primitive_cache_capacity(capacity), + "could not set primitive cache capacity"); +} + +/// @} dnnl_api_primitive_cache + +/// @addtogroup dnnl_api_blas BLAS functions +/// +/// A subset of Basic Linear Algebra (BLAS) functions that perform +/// matrix-matrix multiplication. 
+/// +/// @{ + +/// @copydoc dnnl_sgemm() +inline status sgemm(char transa, char transb, dnnl_dim_t M, dnnl_dim_t N, + dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda, + const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc) { + return static_cast(dnnl_sgemm( + transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)); +} + +/// @copydoc dnnl_gemm_u8s8s32() +inline status gemm_u8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A, + dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo, + float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co) { + return static_cast(dnnl_gemm_u8s8s32(transa, transb, offsetc, M, N, + K, alpha, A, lda, ao, B, ldb, bo, beta, C, ldc, co)); +} + +/// @copydoc dnnl_gemm_s8s8s32() +inline status gemm_s8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A, + dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo, + float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co) { + return static_cast(dnnl_gemm_s8s8s32(transa, transb, offsetc, M, N, + K, alpha, A, lda, ao, B, ldb, bo, beta, C, ldc, co)); +} + +/// @} dnnl_api_blas + +// implementation section + +/// @cond DO_NOT_DOCUMENT_THIS +inline primitive::primitive(const_dnnl_primitive_desc_t c_pd) { + dnnl_primitive_t result; + error::wrap_c_api(dnnl_primitive_create(&result, c_pd), + "could not create a primitive"); + reset(result); +} + +inline primitive::primitive(const_dnnl_primitive_desc_t c_pd, + const std::vector &cache_blob) { + dnnl_primitive_t result; + size_t size = cache_blob.size(); + const uint8_t *cache_blob_data = cache_blob.data(); + error::wrap_c_api(dnnl_primitive_create_from_cache_blob( + &result, c_pd, size, cache_blob_data), + "could not create a primitive from a cache blob"); + reset(result); +} + +inline primitive::primitive(const primitive_desc &pd) : 
primitive(pd.get()) {} +inline primitive::primitive( + const primitive_desc &pd, const std::vector &cache_blob) + : primitive(pd.get(), cache_blob) {} + +inline void primitive::execute(const stream &astream, + const std::unordered_map &args) const { + std::vector c_args; + c_args.reserve(args.size()); + for (const auto &a : args) + c_args.push_back({a.first, a.second.get(true)}); + + error::wrap_c_api(dnnl_primitive_execute(get(), astream.get(), + (int)c_args.size(), c_args.data()), + "could not execute a primitive"); +} + +/// @endcond + +#undef DNNL_DEFINE_BITMASK_OPS + +} // namespace dnnl + +/// oneAPI namespace + +/// The oneAPI namespace. +/// Contains the oneapi::dnnl namespace as an alias to the ::dnnl namespace. +namespace oneapi { +// Note: without this guard, doxygen warns of potentially recursive namespace +#ifndef DOXYGEN_SHOULD_SKIP_THIS +/// oneDNN alias namespace +namespace dnnl = ::dnnl; +#endif +} // namespace oneapi + +/// @} dnnl_api + +#endif /* ONEAPI_DNNL_DNNL_HPP */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.h new file mode 100644 index 0000000000000000000000000000000000000000..611e025c4bbe2a194fb799dfe7d1a45583bfb182 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.h @@ -0,0 +1,180 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2022-2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// C common API + +#ifndef ONEAPI_DNNL_DNNL_COMMON_H +#define ONEAPI_DNNL_DNNL_COMMON_H + +#include "oneapi/dnnl/dnnl_common_types.h" +#include "oneapi/dnnl/dnnl_config.h" +#include "oneapi/dnnl/dnnl_version.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api oneDNN API +/// @{ + +/// @addtogroup dnnl_api_common Common API +/// @{ + +/// @addtogroup dnnl_api_engine Engine +/// @{ + +/// Returns the number of engines of a particular kind. +/// +/// @param kind Kind of engines to count. +/// @returns Count of the engines. +size_t DNNL_API dnnl_engine_get_count(dnnl_engine_kind_t kind); + +/// Creates an engine. +/// +/// @param engine Output engine. +/// @param kind Engine kind. +/// @param index Engine index that should be between 0 and the count of +/// engines of the requested kind. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_engine_create( + dnnl_engine_t *engine, dnnl_engine_kind_t kind, size_t index); + +/// Returns the kind of an engine. +/// +/// @param engine Engine to query. +/// @param kind Output engine kind. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_engine_get_kind( + dnnl_engine_t engine, dnnl_engine_kind_t *kind); + +/// Destroys an engine. +/// +/// @param engine Engine to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_engine_destroy(dnnl_engine_t engine); + +/// @} dnnl_api_engine + +/// @addtogroup dnnl_api_stream Stream +/// @{ + +/// Creates an execution stream. +/// +/// @param stream Output execution stream. +/// @param engine Engine to create the execution stream on. +/// @param flags Stream behavior flags (@sa dnnl_stream_flags_t). +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_stream_create( + dnnl_stream_t *stream, dnnl_engine_t engine, unsigned flags); + +/// Returns the engine of a stream object. +/// +/// @param stream Stream object. +/// @param engine Output engine on which the stream is created. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_stream_get_engine( + const_dnnl_stream_t stream, dnnl_engine_t *engine); + +/// Waits for all primitives in the execution stream to finish computations. +/// +/// @param stream Execution stream. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_stream_wait(dnnl_stream_t stream); + +/// Destroys an execution stream. +/// +/// @param stream Execution stream to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_stream_destroy(dnnl_stream_t stream); + +/// @} dnnl_api_stream + +/// @addtogroup dnnl_api_fpmath_mode Floating-point Math Mode +/// @{ + +/// Returns the floating-point math mode that will be used by default +/// for all subsequently created primitives. +/// +/// @param mode Output FP math mode. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_get_default_fpmath_mode(dnnl_fpmath_mode_t *mode); + +/// Sets the floating-point math mode that will be used by default +/// for all subsequently created primitives. +/// +/// @param mode FP math mode. 
The possible values are: +/// #dnnl_fpmath_mode_strict, +/// #dnnl_fpmath_mode_bf16, +/// #dnnl_fpmath_mode_f16, +/// #dnnl_fpmath_mode_tf32, +/// #dnnl_fpmath_mode_any. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_set_default_fpmath_mode(dnnl_fpmath_mode_t mode); + +/// @} dnnl_api_fpmath_mode + +/// @addtogroup dnnl_api_service +/// @{ + +/// Configures verbose output to stdout. +/// +/// @note +/// Enabling verbose output affects performance. +/// This setting overrides the ONEDNN_VERBOSE environment variable. +/// +/// @param level Verbosity level: +/// - 0: no verbose output (default), +/// - 1: primitive and graph information at execution, +/// - 2: primitive and graph information at creation/compilation and execution. +/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the +/// @p level value is invalid, and #dnnl_success/#dnnl::status::success on +/// success. +dnnl_status_t DNNL_API dnnl_set_verbose(int level); + +/// Returns library version information. +/// @returns Pointer to a constant structure containing +/// - major: major version number, +/// - minor: minor version number, +/// - patch: patch release number, +/// - hash: git commit hash. +const dnnl_version_t DNNL_API *dnnl_version(void); + +/// @} dnnl_api_service + +/// @} dnnl_api_common + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_COMMON_H */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..694fe419c786be7a8449c3a7083dfd865627871c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.hpp @@ -0,0 +1,484 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2022-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/// @file +/// C++ common API + +#ifndef ONEAPI_DNNL_DNNL_COMMON_HPP +#define ONEAPI_DNNL_DNNL_COMMON_HPP + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include +#include +#include +#include +#include +#include + +#include "oneapi/dnnl/dnnl_common.h" + +/// @endcond + +// __cpp_exceptions is referred from +// https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_exceptions.html +// gcc < 5 does not define __cpp_exceptions but __EXCEPTIONS, +// Microsoft C++ Compiler does not provide an option to disable exceptions +#ifndef DNNL_ENABLE_EXCEPTIONS +#if __cpp_exceptions || __EXCEPTIONS \ + || (defined(_MSC_VER) && !defined(__clang__)) +#define DNNL_ENABLE_EXCEPTIONS 1 +#else +#define DNNL_ENABLE_EXCEPTIONS 0 +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define DNNL_TRAP() __builtin_trap() +#elif defined(__INTEL_COMPILER) || defined(_MSC_VER) +#define DNNL_TRAP() __debugbreak() +#else +#error "unknown compiler" +#endif + +#if DNNL_ENABLE_EXCEPTIONS +#define DNNL_THROW_ERROR(status, msg) throw error(status, msg) +#else +#include +#define DNNL_THROW_ERROR(status, msg) \ + do { \ + fputs(msg, stderr); \ + DNNL_TRAP(); \ + } while (0) +#endif + +/// @addtogroup dnnl_api oneDNN API +/// @{ + +/// oneDNN namespace +namespace dnnl { + +/// @addtogroup dnnl_api_common Common API +/// @{ + +/// @addtogroup dnnl_api_utils Utilities +/// Utility types and definitions. +/// @{ + +/// oneDNN exception class. +/// +/// This class captures the status returned by a failed C API function and +/// the error message from the call site. +struct error : public std::exception { + dnnl_status_t status; + const char *message; + + /// Constructs an instance of an exception class. + /// + /// @param status The error status returned by a C API function. + /// @param message The error message. 
+ error(dnnl_status_t status, const char *message) + : status(status), message(message) {} + + /// Returns the explanatory string. + const char *what() const noexcept override { return message; } + + /// A convenience function for wrapping calls to C API functions. Checks + /// the return status and throws an dnnl::error in case of failure. + /// + /// @param status The error status returned by a C API function. + /// @param message The error message. + static void wrap_c_api(dnnl_status_t status, const char *message) { + if (status != dnnl_success) DNNL_THROW_ERROR(status, message); + } +}; + +/// A class that provides the destructor for a oneDNN C API handle. +template +struct handle_traits {}; + +/// oneDNN C API handle wrapper class. +/// +/// This class is used as the base class for primitive (dnnl::primitive), +/// engine (dnnl::engine), and stream (dnnl::stream) classes, as well as +/// others. An object of the dnnl::handle class can be passed by value. +/// +/// A handle can be weak, in which case it follows std::weak_ptr semantics. +/// Otherwise, it follows `std::shared_ptr` semantics. +/// +/// @note +/// The implementation stores oneDNN C API handles in a `std::shared_ptr` +/// with deleter set to a dummy function in the weak mode. +/// +template > +struct handle { +private: + static dnnl_status_t dummy_destructor(T) { return dnnl_success; } + std::shared_ptr::type> data_ {0}; + +protected: + bool operator==(const T other) const { return other == data_.get(); } + bool operator!=(const T other) const { return !(*this == other); } + +public: + /// Constructs an empty handle object. + /// + /// @warning + /// Uninitialized object cannot be used in most library calls and is + /// equivalent to a null pointer. Any attempt to use its methods, or + /// passing it to the other library function, will cause an exception + /// to be thrown. + handle() = default; + + /// Copy constructor. + handle(const handle &) = default; + /// Assignment operator. 
+ handle &operator=(const handle &) = default; + /// Move constructor. + handle(handle &&) = default; + /// Move assignment operator. + handle &operator=(handle &&) = default; + + /// Constructs a handle wrapper object from a C API handle. + /// + /// @param t The C API handle to wrap. + /// @param weak A flag specifying whether to construct a weak wrapper; + /// defaults to @c false. + explicit handle(T t, bool weak = false) { reset(t, weak); } + + /// Resets the handle wrapper objects to wrap a new C API handle. + /// + /// @param t The new value of the C API handle. + /// @param weak A flag specifying whether the wrapper should be weak; + /// defaults to @c false. + void reset(T t, bool weak = false) { + data_.reset(t, weak ? &dummy_destructor : traits::destructor); + } + + /// Returns the underlying C API handle. + /// + /// @param allow_empty A flag signifying whether the method is allowed to + /// return an empty (null) object without throwing an exception. + /// @returns The underlying C API handle. + T get(bool allow_empty = false) const { + T result = data_.get(); + if (allow_empty == false && result == nullptr) + DNNL_THROW_ERROR( + dnnl_invalid_arguments, "object is not initialized"); + return result; + } + + /// Converts a handle to the underlying C API handle type. Does not throw + /// and returns `nullptr` if the object is empty. + /// + /// @returns The underlying C API handle. + explicit operator T() const { return get(true); } + + /// Checks whether the object is not empty. + /// + /// @returns Whether the object is not empty. + explicit operator bool() const { return get(true) != nullptr; } + + /// Equality operator. + /// + /// @param other Another handle wrapper. + /// @returns @c true if this and the other handle wrapper manage the same + /// underlying C API handle, and @c false otherwise. Empty handle + /// objects are considered to be equal. 
+ bool operator==(const handle &other) const { + return other.data_.get() == data_.get(); + } + + /// Inequality operator. + /// + /// @param other Another handle wrapper. + /// @returns @c true if this and the other handle wrapper manage different + /// underlying C API handles, and @c false otherwise. Empty handle + /// objects are considered to be equal. + bool operator!=(const handle &other) const { return !(*this == other); } +}; + +/// @} dnnl_api_utils + +/// @addtogroup dnnl_api_engine Engine +/// +/// An abstraction of a computational device: a CPU, a specific GPU +/// card in the system, etc. Most primitives are created to execute +/// computations on one specific engine. The only exceptions are reorder +/// primitives that transfer data between two different engines. +/// +/// @sa @ref dev_guide_basic_concepts +/// +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_engine_t p) { + return dnnl_engine_destroy(p); + } +}; +/// @endcond + +/// An execution engine. +struct engine : public handle { + friend struct primitive; + friend struct reorder; + + /// Kinds of engines. + enum class kind { + /// An unspecified engine + any = dnnl_any_engine, + /// CPU engine + cpu = dnnl_cpu, + /// GPU engine + gpu = dnnl_gpu, + }; + + using handle::handle; + + /// Constructs an empty engine. An empty engine cannot be used in any + /// operations. + engine() = default; + + /// Returns the number of engines of a certain kind. + /// + /// @param akind The kind of engines to count. + /// @returns The number of engines of the specified kind. + static size_t get_count(kind akind) { + return dnnl_engine_get_count(convert_to_c(akind)); + } + + /// Constructs an engine. + /// + /// @param akind The kind of engine to construct. + /// @param index The index of the engine. Must be less than the value + /// returned by #get_count() for this particular kind of engine. 
+ engine(kind akind, size_t index) { + dnnl_engine_t engine; + error::wrap_c_api( + dnnl_engine_create(&engine, convert_to_c(akind), index), + "could not create an engine"); + reset(engine); + } + + /// Returns the kind of the engine. + /// @returns The kind of the engine. + kind get_kind() const { + dnnl_engine_kind_t kind; + error::wrap_c_api(dnnl_engine_get_kind(get(), &kind), + "could not get kind of an engine"); + return static_cast(kind); + } + +private: + static dnnl_engine_kind_t convert_to_c(kind akind) { + return static_cast(akind); + } +}; + +/// Converts engine kind enum value from C++ API to C API type. +/// +/// @param akind C++ API engine kind enum value. +/// @returns Corresponding C API engine kind enum value. +inline dnnl_engine_kind_t convert_to_c(engine::kind akind) { + return static_cast(akind); +} + +/// @} dnnl_api_engine + +/// @addtogroup dnnl_api_stream Stream +/// +/// An encapsulation of execution context tied to a particular engine. +/// +/// @sa @ref dev_guide_basic_concepts +/// +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_stream_t p) { + return dnnl_stream_destroy(p); + } +}; +/// @endcond + +/// An execution stream. +struct stream : public handle { + using handle::handle; + + /// Stream flags. Can be combined using the bitwise OR operator. + enum class flags : unsigned { + /// In-order execution. + in_order = dnnl_stream_in_order, + /// Out-of-order execution. + out_of_order = dnnl_stream_out_of_order, + /// Default stream configuration. + default_flags = dnnl_stream_default_flags, +#ifdef DNNL_EXPERIMENTAL_PROFILING + /// Enables profiling capabilities. + profiling = dnnl_stream_profiling, +#endif + }; + + /// Constructs an empty stream. An empty stream cannot be used in any + /// operations. + stream() = default; + + /// Constructs a stream for the specified engine and with behavior + /// controlled by the specified flags. 
+ /// + /// @param aengine Engine to create the stream on. + /// @param aflags Flags controlling stream behavior. + explicit stream( + const engine &aengine, flags aflags = flags::default_flags) { + dnnl_stream_t stream; + error::wrap_c_api(dnnl_stream_create(&stream, aengine.get(), + static_cast(aflags)), + "could not create a stream"); + reset(stream); + } + + /// Returns the associated engine. + engine get_engine() const { + dnnl_engine_t c_engine; + error::wrap_c_api(dnnl_stream_get_engine(get(), &c_engine), + "could not get an engine from a stream object"); + return engine(c_engine, true); + } + + /// Waits for all primitives executing in the stream to finish. + /// @returns The stream itself. + stream &wait() { + error::wrap_c_api( + dnnl_stream_wait(get()), "could not wait on a stream"); + return *this; + } +}; + +#define DNNL_DEFINE_BITMASK_OPS(enum_name) \ + inline enum_name operator|(enum_name lhs, enum_name rhs) { \ + return static_cast( \ + static_cast(lhs) | static_cast(rhs)); \ + } \ +\ + inline enum_name operator&(enum_name lhs, enum_name rhs) { \ + return static_cast( \ + static_cast(lhs) & static_cast(rhs)); \ + } \ +\ + inline enum_name operator^(enum_name lhs, enum_name rhs) { \ + return static_cast( \ + static_cast(lhs) ^ static_cast(rhs)); \ + } \ +\ + inline enum_name &operator|=(enum_name &lhs, enum_name rhs) { \ + lhs = static_cast( \ + static_cast(lhs) | static_cast(rhs)); \ + return lhs; \ + } \ +\ + inline enum_name &operator&=(enum_name &lhs, enum_name rhs) { \ + lhs = static_cast( \ + static_cast(lhs) & static_cast(rhs)); \ + return lhs; \ + } \ +\ + inline enum_name &operator^=(enum_name &lhs, enum_name rhs) { \ + lhs = static_cast( \ + static_cast(lhs) ^ static_cast(rhs)); \ + return lhs; \ + } \ +\ + inline enum_name operator~(enum_name rhs) { \ + return static_cast(~static_cast(rhs)); \ + } + +DNNL_DEFINE_BITMASK_OPS(stream::flags) + +/// @} dnnl_api_stream + +/// @addtogroup dnnl_api_fpmath_mode Floating-point Math Mode +/// @{ + 
+/// Floating-point math mode +enum class fpmath_mode { + /// Default behavior, no downconversions allowed + strict = dnnl_fpmath_mode_strict, + /// Implicit f32->bf16 conversions allowed + bf16 = dnnl_fpmath_mode_bf16, + /// Implicit f32->f16 conversions allowed + f16 = dnnl_fpmath_mode_f16, + /// Implicit f32->tf32 conversions allowed + tf32 = dnnl_fpmath_mode_tf32, + /// Implicit f32->f16, f32->tf32 or f32->bf16 conversions allowed + any = dnnl_fpmath_mode_any +}; + +/// Converts an fpmath mode enum value from C++ API to C API type. +/// +/// @param mode C++ API fpmath mode enum value. +/// @returns Corresponding C API fpmath mode enum value. +inline dnnl_fpmath_mode_t convert_to_c(fpmath_mode mode) { + return static_cast(mode); +} + +/// @} dnnl_api_fpmath_mode + +/// @addtogroup dnnl_api_accumulation_mode Accumulation Mode +/// @{ + +/// Accumulation mode +enum class accumulation_mode { + /// Default behavior, f32 for floating point computation, s32 for integer + strict = dnnl_accumulation_mode_strict, + /// same as strict except some partial accumulators can be rounded to + /// src/dst datatype in memory. + relaxed = dnnl_accumulation_mode_relaxed, + /// uses fastest implementation, could use src/dst datatype or + /// wider datatype for accumulators + any = dnnl_accumulation_mode_any, + /// use s32 accumulators during computation + s32 = dnnl_accumulation_mode_s32, + /// use f32 accumulators during computation + f32 = dnnl_accumulation_mode_f32, + /// use f16 accumulators during computation + f16 = dnnl_accumulation_mode_f16 +}; + +/// Converts an accumulation mode enum value from C++ API to C API type. +/// +/// @param mode C++ API accumulation mode enum value. +/// @returns Corresponding C API accumulation mode enum value. 
+inline dnnl_accumulation_mode_t convert_to_c(accumulation_mode mode) { + return static_cast(mode); +} + +/// @} dnnl_api_accumulation_mode + +/// @} dnnl_api_common + +} // namespace dnnl + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common_types.h new file mode 100644 index 0000000000000000000000000000000000000000..116dd79eae86acfc0f153d0dc11970f55fb38571 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common_types.h @@ -0,0 +1,268 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2022-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/// @file +/// C API common types definitions + +#ifndef ONEAPI_DNNL_DNNL_COMMON_TYPES_H +#define ONEAPI_DNNL_DNNL_COMMON_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include + +#include "oneapi/dnnl/dnnl_config.h" + +/// @endcond + +/// @addtogroup dnnl_api oneDNN API +/// @{ + +/// @addtogroup dnnl_api_common Common API +/// @{ + +/// @addtogroup dnnl_api_utils +/// @{ + +/// Status values returned by the library functions. +typedef enum { + /// The operation was successful + dnnl_success = 0, + /// The operation failed due to an out-of-memory condition + dnnl_out_of_memory = 1, + /// The operation failed because of incorrect function arguments + dnnl_invalid_arguments = 2, + /// The operation failed because requested functionality is not implemented + dnnl_unimplemented = 3, + /// The last available implementation is reached + dnnl_last_impl_reached = 4, + /// Primitive or engine failed on execution + dnnl_runtime_error = 5, + /// Queried element is not required for given primitive + dnnl_not_required = 6, + /// The graph is not legitimate + dnnl_invalid_graph = 7, + /// The operation is not legitimate according to op schema + dnnl_invalid_graph_op = 8, + /// The shape cannot be inferred or compiled + dnnl_invalid_shape = 9, + /// The data type cannot be inferred or compiled + dnnl_invalid_data_type = 10, +} dnnl_status_t; + +/// @} dnnl_api_utils + +/// @addtogroup dnnl_api_data_types Data types +/// @{ + +/// Data type specification +typedef enum { + /// Undefined data type, used for empty memory descriptors. + dnnl_data_type_undef = 0, + /// 16-bit/half-precision floating point. + dnnl_f16 = 1, + /// non-standard 16-bit (bfloat16 w/ 7 bit mantissa) floating point. + dnnl_bf16 = 2, + /// 32-bit/single-precision floating point. + dnnl_f32 = 3, + /// 32-bit signed integer. + dnnl_s32 = 4, + /// 8-bit signed integer. 
+ dnnl_s8 = 5, + /// 8-bit unsigned integer. + dnnl_u8 = 6, + /// 64-bit/double-precision floating point. + dnnl_f64 = 7, + /// Boolean data type. Size is C++ implementation defined. + dnnl_boolean = 8, + /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) + /// with a 5-bit exponent and a 2-bit mantissa. + dnnl_f8_e5m2 = 9, + /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) + /// with a 4-bit exponent and a 3-bit mantissa. + dnnl_f8_e4m3 = 10, + /// 4-bit signed integer. + dnnl_s4 = 11, + /// 4-bit unsigned integer. + dnnl_u4 = 12, + /// [MX-compliant 8-bit compliant scale data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 8-bit exponent. + dnnl_e8m0 = 13, + /// [MX-compliant 4-bit float data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 2-bit exponent and 1 bit mantissa. + dnnl_f4_e2m1 = 14, + /// 4-bit float data type with 3-bit exponent and 0 bit mantissa. + dnnl_f4_e3m0 = 15, + + /// Parameter to allow internal only data_types without undefined behavior. + /// This parameter is chosen to be valid for so long as sizeof(int) >= 2. + dnnl_data_type_max = 0x7fff, +} dnnl_data_type_t; + +/// Maximum number of dimensions a tensor can have. Only restricts the amount +/// of space used for the tensor description. Individual computational +/// primitives may support only tensors of certain dimensions. +#define DNNL_MAX_NDIMS 12 + +/// A type to describe tensor dimension. +typedef int64_t dnnl_dim_t; + +/// A type to describe tensor dimensions. 
+typedef dnnl_dim_t dnnl_dims_t[DNNL_MAX_NDIMS]; + +/// @} dnnl_api_data_types + +/// @addtogroup dnnl_api_fpmath_mode Floating-point Math Mode +/// @{ + +/// Floating-point math mode +typedef enum { + /// Default behavior, no downconversions allowed + dnnl_fpmath_mode_strict, + /// Implicit f32->bf16 conversions allowed + dnnl_fpmath_mode_bf16, + /// Implicit f32->f16 conversions allowed + dnnl_fpmath_mode_f16, + /// Implicit f32->f16, f32->tf32 or f32->bf16 conversions allowed + dnnl_fpmath_mode_any, + /// Implicit f32->tf32 conversions allowed + dnnl_fpmath_mode_tf32, +} dnnl_fpmath_mode_t; + +/// @} dnnl_api_fpmath_mode + +/// @addtogroup dnnl_api_accumulation_mode Accumulation Mode +/// @{ + +/// Accumulation mode +typedef enum { + /// Default behavior, f32/f64 for floating point computation, s32 + /// for integer + dnnl_accumulation_mode_strict, + /// Same as strict but allows some partial accumulators to be + /// rounded to src/dst datatype in memory. + dnnl_accumulation_mode_relaxed, + /// uses fastest implementation, could use src/dst datatype or + /// wider datatype for accumulators + dnnl_accumulation_mode_any, + /// use s32 accumulators during computation + dnnl_accumulation_mode_s32, + /// use f32 accumulators during computation + dnnl_accumulation_mode_f32, + /// use f16 accumulators during computation + dnnl_accumulation_mode_f16 +} dnnl_accumulation_mode_t; + +/// @} dnnl_api_accumulation_mode + +/// @addtogroup dnnl_api_engine Engine +/// @{ + +/// @brief Kinds of engines. +typedef enum { + /// An unspecified engine. + dnnl_any_engine, + /// CPU engine. + dnnl_cpu, + /// GPU engine. + dnnl_gpu, +} dnnl_engine_kind_t; + +/// @struct dnnl_engine +/// @brief An opaque structure to describe an engine. +struct dnnl_engine; +/// @brief An engine handle. +typedef struct dnnl_engine *dnnl_engine_t; +#if 0 +// FIXME: looks like this never happens +/// @brief A constant engine handle. 
+typedef const struct dnnl_engine *const_dnnl_engine_t; +#endif + +/// @} dnnl_api_engine + +/// @addtogroup dnnl_api_stream Stream +/// @{ + +/// @brief Stream flags. +typedef enum { + // In-order execution. + dnnl_stream_in_order = 0x1U, + /// Out-of-order execution. + dnnl_stream_out_of_order = 0x2U, + /// Default stream configuration. + dnnl_stream_default_flags = dnnl_stream_in_order, +#ifdef DNNL_EXPERIMENTAL_PROFILING + /// Enables profiling capabilities. + dnnl_stream_profiling = 0x4U, +#endif +} dnnl_stream_flags_t; + +/// @struct dnnl_stream +/// An opaque structure to describe an execution stream. +struct dnnl_stream; +/// An execution stream handle. +typedef struct dnnl_stream *dnnl_stream_t; +/// A constant execution stream handle. +typedef const struct dnnl_stream *const_dnnl_stream_t; + +/// @} dnnl_api_stream + +/// @addtogroup dnnl_api_service +/// @{ + +/// Structure containing version information as per [Semantic +/// Versioning](https://semver.org) +typedef struct { + int major; ///< Major version + int minor; ///< Minor version + int patch; ///< Patch version + const char *hash; ///< Git hash of the sources (may be absent) + unsigned cpu_runtime; ///< CPU runtime + unsigned gpu_runtime; ///< GPU runtime +} dnnl_version_t; + +/// @} dnnl_api_service + +/// @addtogroup dnnl_api_memory +/// @{ + +/// Special pointer value that indicates that a memory object should not have +/// an underlying buffer. +#define DNNL_MEMORY_NONE (NULL) + +/// Special pointer value that indicates that the library needs to allocate an +/// underlying buffer for a memory object. +#define DNNL_MEMORY_ALLOCATE ((void *)(size_t)-1) + +/// @} dnnl_api_memory + +/// @} dnnl_api_common + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_config.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_config.h new file mode 100644 index 0000000000000000000000000000000000000000..700cf6ce936c072fb56a50d1dbf83431a9a5ad5c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_config.h @@ -0,0 +1,237 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2019-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_CONFIG_H +#define ONEAPI_DNNL_DNNL_CONFIG_H + +/// @cond DO_NOT_DOCUMENT_THIS + +// All symbols shall be internal unless marked as DNNL_API +#if defined _WIN32 || defined __CYGWIN__ +#define DNNL_HELPER_DLL_IMPORT __declspec(dllimport) +#define DNNL_HELPER_DLL_EXPORT __declspec(dllexport) +#else +#if __GNUC__ >= 4 +#define DNNL_HELPER_DLL_IMPORT __attribute__((visibility("default"))) +#define DNNL_HELPER_DLL_EXPORT __attribute__((visibility("default"))) +#else +#define DNNL_HELPER_DLL_IMPORT +#define DNNL_HELPER_DLL_EXPORT +#endif +#endif + +#ifdef DNNL_DLL +#ifdef DNNL_DLL_EXPORTS +#define DNNL_API DNNL_HELPER_DLL_EXPORT +#else +#define DNNL_API DNNL_HELPER_DLL_IMPORT +#endif +#else +#define DNNL_API +#endif + +#if defined(__GNUC__) +#define DNNL_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define DNNL_DEPRECATED __declspec(deprecated) +#else +#define DNNL_DEPRECATED +#endif + +/// @endcond + +// clang-format off + +/// @addtogroup dnnl_api_service +/// @{ + +/// No runtime (disabled) +#define DNNL_RUNTIME_NONE 0u + +/// Sequential runtime (CPU only) +#define DNNL_RUNTIME_SEQ 1u + +/// OpenMP runtime (CPU only) +#define DNNL_RUNTIME_OMP 2u + +/// TBB runtime (CPU only) +#define DNNL_RUNTIME_TBB 4u + +/// Threadpool runtime (CPU only) +#define DNNL_RUNTIME_THREADPOOL 8u + +/// OpenCL runtime +#define DNNL_RUNTIME_OCL 256u + +/// SYCL runtime +#define DNNL_RUNTIME_SYCL 512u + +/// DPC++ runtime +#define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL + +/// No vendor (corresponding runtime is disabled) +#define DNNL_VENDOR_NONE 0u + +/// Intel vendor +#define DNNL_VENDOR_INTEL 1u + +/// NVIDIA vendor +#define DNNL_VENDOR_NVIDIA 2u + +/// AMD vendor +#define DNNL_VENDOR_AMD 4u + +/// Generic vendor +#define DNNL_VENDOR_GENERIC 8u + +/// @} dnnl_api_service + +// oneDNN CPU threading runtime +#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP + +// 
oneDNN CPU engine runtime +#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP + +// oneDNN GPU engine runtime +#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE + +// oneDNN GPU vendor +#define DNNL_GPU_VENDOR DNNL_VENDOR_NONE + +// clang-format on + +#if defined(DNNL_CPU_RUNTIME) && defined(DNNL_GPU_RUNTIME) +#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_OCL) +#error "Unexpected DNNL_CPU_RUNTIME" +#endif +#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \ + && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \ + && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) +#error "Unexpected DNNL_GPU_RUNTIME" +#endif +#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \ + && DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE) +#error "At least one runtime must be specified" +#endif +#else +#error "BOTH DNNL_CPU_RUNTIME and DNNL_GPU_RUNTIME must be defined" +#endif + +// For SYCL CPU, a primitive may be created and executed in different threads +// hence the global scratchpad does not work. This enables concurrent execution +// when CPU runtime is SYCL to avoid the issue. +#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL +#ifndef DNNL_ENABLE_CONCURRENT_EXEC +#define DNNL_ENABLE_CONCURRENT_EXEC +#endif +#endif + +// When defined, primitive cache stores runtime objects. +/* #undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE */ + +// When defined, DPCPP is supported. +/* #undef DNNL_WITH_SYCL */ + +// When defined, Level Zero is supported. +/* #undef DNNL_WITH_LEVEL_ZERO */ + +// When defined, SYCL CUDA backend is used. +/* #undef DNNL_SYCL_CUDA */ + +// When defined, SYCL HIP backend is used. +/* #undef DNNL_SYCL_HIP */ + +// When defined, SYCL Generic backend is used. +/* #undef DNNL_SYCL_GENERIC */ + +// When defined, stack checker is enabled. +/* #undef DNNL_ENABLE_STACK_CHECKER */ + +// When defined, experimental features are enabled. +/* #undef DNNL_EXPERIMENTAL */ + +// When defined, experimental functionality for sparse domain is enabled. +/* #undef DNNL_EXPERIMENTAL_SPARSE */ + +// When defined, experimental functionality for ukernels is enabled. 
+#define DNNL_EXPERIMENTAL_UKERNEL + +// When defined, graph component is enabled. +#define ONEDNN_BUILD_GRAPH + +// When defined, experimental profiling capabilities are enabled. +/* #undef DNNL_EXPERIMENTAL_PROFILING */ + +// When defined, experimental logging capabilities are enabled. +/* #undef DNNL_EXPERIMENTAL_LOGGING */ +// When defined, it disables GPU compute reference kernels. +/* #undef DNNL_DISABLE_GPU_REF_KERNELS */ + +// List of configurating build controls +// Workload controls +#define BUILD_TRAINING 1 +#define BUILD_INFERENCE 0 +// Primitive controls +#define BUILD_PRIMITIVE_ALL 1 +#define BUILD_BATCH_NORMALIZATION 0 +#define BUILD_BINARY 0 +#define BUILD_CONCAT 0 +#define BUILD_CONVOLUTION 0 +#define BUILD_DECONVOLUTION 0 +#define BUILD_ELTWISE 0 +#define BUILD_GROUP_NORMALIZATION 0 +#define BUILD_INNER_PRODUCT 0 +#define BUILD_LAYER_NORMALIZATION 0 +#define BUILD_LRN 0 +#define BUILD_MATMUL 0 +#define BUILD_POOLING 0 +#define BUILD_PRELU 0 +#define BUILD_REDUCTION 0 +#define BUILD_REORDER 0 +#define BUILD_RESAMPLING 0 +#define BUILD_RNN 0 +#define BUILD_SDPA 0 +#define BUILD_SHUFFLE 0 +#define BUILD_SOFTMAX 0 +#define BUILD_SUM 0 +// Primitives CPU ISA controls +#define BUILD_PRIMITIVE_CPU_ISA_ALL 1 +#define BUILD_SSE41 0 +#define BUILD_AVX2 0 +#define BUILD_AVX512 0 +#define BUILD_AMX 0 +// Primitives GPU ISA controls +#define BUILD_PRIMITIVE_GPU_ISA_ALL 1 +#define BUILD_GEN9 0 +#define BUILD_GEN11 0 +#define BUILD_XELP 0 +#define BUILD_XEHP 0 +#define BUILD_XEHPG 0 +#define BUILD_XEHPC 0 +#define BUILD_XE2 0 +#define BUILD_XE3 0 +// GeMM kernels ISA controls +#define BUILD_GEMM_KERNELS_ALL 1 +#define BUILD_GEMM_KERNELS_NONE 0 +#define BUILD_GEMM_SSE41 0 +#define BUILD_GEMM_AVX2 0 +#define BUILD_GEMM_AVX512 0 +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_debug.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_debug.h new file mode 100644 index 0000000000000000000000000000000000000000..6447ade13989ceceff4ead69f7a8c11d27c13cd2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_debug.h @@ -0,0 +1,66 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2018-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +// DO NOT EDIT, AUTO-GENERATED +// Use this script to update the file: scripts/generate_dnnl_debug.py + +// clang-format off + +#ifndef ONEAPI_DNNL_DNNL_DEBUG_H +#define ONEAPI_DNNL_DNNL_DEBUG_H + +/// @file +/// Debug capabilities + +#include "oneapi/dnnl/dnnl_config.h" +#include "oneapi/dnnl/dnnl_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const char DNNL_API *dnnl_status2str(dnnl_status_t v); +const char DNNL_API *dnnl_dt2str(dnnl_data_type_t v); +const char DNNL_API *dnnl_fpmath_mode2str(dnnl_fpmath_mode_t v); +const char DNNL_API *dnnl_accumulation_mode2str(dnnl_accumulation_mode_t v); +const char DNNL_API *dnnl_engine_kind2str(dnnl_engine_kind_t v); +#ifdef DNNL_EXPERIMENTAL_SPARSE +const char DNNL_API *dnnl_sparse_encoding2str(dnnl_sparse_encoding_t v); +#endif +const char DNNL_API *dnnl_fmt_tag2str(dnnl_format_tag_t v); +const char DNNL_API *dnnl_prop_kind2str(dnnl_prop_kind_t v); +const char DNNL_API *dnnl_prim_kind2str(dnnl_primitive_kind_t v); +const char DNNL_API *dnnl_alg_kind2str(dnnl_alg_kind_t v); +const char DNNL_API *dnnl_rnn_flags2str(dnnl_rnn_flags_t v); +const char DNNL_API *dnnl_rnn_direction2str(dnnl_rnn_direction_t v); +const char DNNL_API *dnnl_scratchpad_mode2str(dnnl_scratchpad_mode_t v); +const char DNNL_API *dnnl_rounding_mode2str(dnnl_rounding_mode_t v); +const char DNNL_API *dnnl_cpu_isa2str(dnnl_cpu_isa_t v); +const char DNNL_API *dnnl_cpu_isa_hints2str(dnnl_cpu_isa_hints_t v); + +const char DNNL_API *dnnl_runtime2str(unsigned v); +const char DNNL_API *dnnl_fmt_kind2str(dnnl_format_kind_t v); + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.h new file mode 100644 index 0000000000000000000000000000000000000000..dc34d2713c00890d3de8a5a87153c0d81a8dfb6d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.h @@ -0,0 +1,777 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// Graph C API + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_H +#define ONEAPI_DNNL_DNNL_GRAPH_H + +#include "oneapi/dnnl/dnnl_common.h" +#include "oneapi/dnnl/dnnl_config.h" +#include "oneapi/dnnl/dnnl_graph_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_graph_api +/// @{ + +/// @addtogroup dnnl_graph_api_allocator +/// @{ + +/// Creates a host allocator with the given allocation and deallocation +/// call-back function pointers. +/// +/// @param allocator Output allocator. +/// @param host_malloc A pointer to malloc function for host. 
+/// @param host_free A pointer to free function for host. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_allocator_create( + dnnl_graph_allocator_t *allocator, + dnnl_graph_host_allocate_f host_malloc, + dnnl_graph_host_deallocate_f host_free); + +/// Destroys an allocator. +/// +/// @param allocator The allocator to be destroyed. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_allocator_destroy( + dnnl_graph_allocator_t allocator); + +/// @} dnnl_graph_api_allocator + +/// @addtogroup dnnl_graph_api_engine +/// @{ + +/// This API is a supplement for existing onednn engine API. +dnnl_status_t DNNL_API dnnl_graph_make_engine_with_allocator( + dnnl_engine_t *engine, dnnl_engine_kind_t kind, size_t index, + const_dnnl_graph_allocator_t alloc); + +/// @} dnnl_graph_api_engine + +/// @addtogroup dnnl_graph_api_logical_tensor +/// @{ + +/// Initializes a logical tensor with id, data type, number of dimensions, +/// layout type, and property. The logical tensor's dims are unknown with this +/// interface. +/// +/// @param logical_tensor Output logical tensor. +/// @param tid The unique id of the output logical tensor. +/// @param dtype Elements data type. +/// @param ndims Number of dimensions. +/// @param ltype Layout type of the underlying tensor buffer. +/// @param ptype Tensor property type. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_logical_tensor_init( + dnnl_graph_logical_tensor_t *logical_tensor, size_t tid, + dnnl_data_type_t dtype, int32_t ndims, dnnl_graph_layout_type_t ltype, + dnnl_graph_tensor_property_t ptype); + +/// Initializes a logical tensor with basic information and dims. The logical +/// tensor's dimensions and layout will be initialized according to the input +/// arguments. 
+/// +/// @note +/// If dims contains all valid values and layout type is +/// #dnnl_graph_layout_type_strided. The strides field in +/// #dnnl_graph_logical_tensor_t will be calculated in a row major and +/// contiguous way. Otherwise, Accessing the strides field is an undefined +/// behavior. +/// +/// Eg. dims (2, 3, 4, 5) will get strides (60, 20, 5, 1) +/// +/// @param logical_tensor Output logical tensor. +/// @param tid The unique id of output logical tensor. +/// @param dtype Elements data type. +/// @param ndims Number of dimensions. +/// @param dims Array of dimensions. +/// @param ltype Layout type of the underlying tensor memory. +/// @param ptype Tensor property type. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_logical_tensor_init_with_dims( + dnnl_graph_logical_tensor_t *logical_tensor, size_t tid, + dnnl_data_type_t dtype, int32_t ndims, const dnnl_dims_t dims, + dnnl_graph_layout_type_t ltype, dnnl_graph_tensor_property_t ptype); + +/// Initializes a logical tensor with dimensions and strides provided by user. +/// +/// @note +/// Once strides are explicitly provided through the API, the `layout_type` +/// in #dnnl_graph_logical_tensor_t can only be +/// #dnnl_graph_layout_type_strided or #dnnl_graph_layout_type_any. +/// +/// @param logical_tensor Output logical tensor. +/// @param tid The unique id of output logical tensor. +/// @param dtype Elements data type. +/// @param ndims Number of dimensions. +/// @param dims Array of dimensions. +/// @param strides Array of strides. +/// @param ptype Tensor property type. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_init_with_strides( + dnnl_graph_logical_tensor_t *logical_tensor, size_t tid, + dnnl_data_type_t dtype, int32_t ndims, const dnnl_dims_t dims, + const dnnl_dims_t strides, dnnl_graph_tensor_property_t ptype); + +/// Returns the memory size described by the logical tensor. If it's a strided +/// layout, the size will be calculated by `dims` and `strides`. If it's an +/// opaque layout, the size will be decided by `layout_id`. +/// +/// @param logical_tensor Logical tensor. +/// @param size Output memory size in bytes. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_logical_tensor_get_mem_size( + const dnnl_graph_logical_tensor_t *logical_tensor, size_t *size); + +/// Compares if two logical tensors are equal. Users can decide accordingly +/// if layout reordering is needed for two logical tensors. The method will +/// return true for below two circumstances: +/// +/// 1. the two logical tensors are equal regarding each field in the struct, +/// eg. id, ndims, dims, layout type, property, etc. +/// 2. If all other fields are equal but the layout types in two logical +/// tensors are different, the method will return true when the underlying +/// memory layout is the same. For example, one logical tensor has strided +/// layout type while the other one has opaque layout type, but underneath, +/// both layouts are NHWC, the method will still return true for this case. +/// +/// @param lt1 The handle of first logical tensor. +/// @param lt2 The handle of second logical tensor. +/// @param is_equal 1 if these two logical tensors are equal, 0 otherwise. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_is_equal( + const dnnl_graph_logical_tensor_t *lt1, + const dnnl_graph_logical_tensor_t *lt2, uint8_t *is_equal); + +/// @} dnnl_graph_api_logical_tensor + +/// @addtogroup dnnl_graph_api_tensor +/// @{ + +/// Creates a tensor with logical tensor, engine, and data handle. +/// +/// @param tensor Output tensor. +/// @param logical_tensor Description for this tensor. +/// @param engine Engine to use. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the tensor. In this case the library +/// owns the buffer. +/// - DNNL_MEMORY_NONE to create tensor without an underlying buffer. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_tensor_create(dnnl_graph_tensor_t *tensor, + const dnnl_graph_logical_tensor_t *logical_tensor, dnnl_engine_t engine, + void *handle); + +/// Destroys a tensor. +/// +/// @param tensor The tensor to be destroyed. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_tensor_destroy(dnnl_graph_tensor_t tensor); + +/// Gets the data handle of a tensor. +/// +/// @param tensor The input tensor. +/// @param handle Pointer to the data of input tensor. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_tensor_get_data_handle( + const_dnnl_graph_tensor_t tensor, void **handle); + +/// Set data handle for a tensor. +/// +/// @param tensor The input tensor. +/// @param handle New data handle for tensor. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_tensor_set_data_handle( + dnnl_graph_tensor_t tensor, void *handle); + +/// Returns the engine of a tensor object. +/// +/// @param tensor The input tensor. +/// @param engine Output engine on which the tensor is located. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_tensor_get_engine( + const_dnnl_graph_tensor_t tensor, dnnl_engine_t *engine); + +/// Returns the logical tensor of a tensor object. +/// +/// @param tensor The input tensor. +/// @param logical_tensor Output logical tensor of the tensor object. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_tensor_get_logical_tensor( + const_dnnl_graph_tensor_t tensor, + dnnl_graph_logical_tensor_t *logical_tensor); + +/// @} dnnl_graph_api_tensor + +/// @addtogroup dnnl_graph_api_op +/// @{ + +/// Initializes an op with unique id, kind, and name. +/// +/// @param op Output op +/// @param id The unique id of the output op. +/// @param kind The op kind. +/// @param verbose_name The string added as the op name. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_create(dnnl_graph_op_t *op, size_t id, + dnnl_graph_op_kind_t kind, const char *verbose_name); + +/// Destroys an op. +/// +/// @param op The op to be destroyed. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_destroy(dnnl_graph_op_t op); + +/// Adds input logical tensor to the op. +/// +/// @param op Input op. +/// @param input The input logical tensor to be added. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_add_input( + dnnl_graph_op_t op, const dnnl_graph_logical_tensor_t *input); + +/// Adds output logical tensor to the op. 
+/// +/// @param op Input op. +/// @param output The output logical tensor to be added. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_add_output( + dnnl_graph_op_t op, const dnnl_graph_logical_tensor_t *output); + +/// Sets floating point attribute to an op. +/// +/// @param op Input op. +/// @param name The attribute's name. +/// @param value The attribute's value. +/// @param value_len The number of value element. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_set_attr_f32(dnnl_graph_op_t op, + dnnl_graph_op_attr_t name, const float *value, size_t value_len); + +/// Sets boolean attribute to an op. +/// +/// @param op Input op. +/// @param name The attribute's name. +/// @param value The attribute's value. +/// @param value_len The number of value element. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_set_attr_bool(dnnl_graph_op_t op, + dnnl_graph_op_attr_t name, const uint8_t *value, size_t value_len); + +/// Sets integer attribute to an op. +/// +/// @param op Input op. +/// @param name The attribute's name. +/// @param value The attribute's value. +/// @param value_len The number of value element. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_set_attr_s64(dnnl_graph_op_t op, + dnnl_graph_op_attr_t name, const int64_t *value, size_t value_len); + +/// Sets string attribute to an op. +/// +/// @param op Input op. +/// @param name The attribute's name. +/// @param value The attribute's value. +/// @param value_len The length of the string value. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_op_set_attr_str(dnnl_graph_op_t op, + dnnl_graph_op_attr_t name, const char *value, size_t value_len); + +/// Returns the unique id of an op. +/// +/// @param op Input op. +/// @param id Output the unique id. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_get_id( + const_dnnl_graph_op_t op, size_t *id); + +/// Returns the kind of an op. +/// +/// @param op Input op. +/// @param kind Output op kind. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_op_get_kind( + const_dnnl_graph_op_t op, dnnl_graph_op_kind_t *kind); + +/// @} dnnl_graph_api_op + +/// @addtogroup dnnl_graph_api_partition +/// @{ + +/// Creates a new partition with a given operator and engine kind. The API is +/// used to create a partition from an operation directly without creating the +/// graph and calling `get_partitions()`. The output partition contains only one +/// operation specified by the parameter. The output partition instance should +/// be destroyed via #dnnl_graph_partition_destroy after use. +/// +/// @param partition The handle of output partition. +/// @param op The operation used to create partition. +/// @param ekind The engine kind used to create partition. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_create_with_op( + dnnl_graph_partition_t *partition, const_dnnl_graph_op_t op, + dnnl_engine_kind_t ekind); + +/// Destroys a partition. +/// +/// @param partition The partition to be destroyed. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_destroy( + dnnl_graph_partition_t partition); + +/// Returns the number of operations in a partition. +/// +/// @param partition The target partition. 
+/// @param num Output the number of operations. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_op_num( + const_dnnl_graph_partition_t partition, size_t *num); + +/// Returns the list of op IDs of the partition. +/// +/// @param partition The target partition. +/// @param num The number of ops. +/// @param ids Output the op IDs. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_ops( + dnnl_graph_partition_t partition, size_t num, size_t *ids); + +/// Returns the ID of a partition. +/// +/// @param partition The target partition. +/// @param id Output the ID of the partition. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_id( + const_dnnl_graph_partition_t partition, size_t *id); + +/// Compiles a partition with given input and output logical tensors. The output +/// logical tensors can contain unknown dimensions. For this case, the +/// compilation will deduce the output shapes according to input shapes. The +/// output logical tensors can also have layout type `any`. The compilation will +/// choose the optimal layout for output tensors. The optimal layout will be +/// represented as an opaque layout ID saved in the output logical tensor. +/// +/// @param partition The target partition. +/// @param compiled_partition Output compiled partition. +/// @param in_num The number of input logical tensors. +/// @param inputs A list of input logical tensors. +/// @param out_num The number of output logical tensors. +/// @param outputs A list of output logical tensors. +/// @param engine The target engine of the compilation. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_partition_compile( + dnnl_graph_partition_t partition, + dnnl_graph_compiled_partition_t compiled_partition, size_t in_num, + const dnnl_graph_logical_tensor_t **inputs, size_t out_num, + const dnnl_graph_logical_tensor_t **outputs, dnnl_engine_t engine); + +/// Returns the number of input logical tensors of a partition. +/// +/// @param partition The target partition. +/// @param num Output the number of input logical tensors. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_input_ports_num( + const_dnnl_graph_partition_t partition, size_t *num); + +/// Returns a list of input logical tensors from a partition. +/// +/// @param partition The target partition. +/// @param num The number of input logical tensors. +/// @param inputs The list of input logical tensors. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_input_ports( + const_dnnl_graph_partition_t partition, size_t num, + dnnl_graph_logical_tensor_t *inputs); + +/// Returns the number of output logical tensors of a partition. +/// +/// @param partition The target partition. +/// @param num Output the number of output logical tensors. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_output_ports_num( + const_dnnl_graph_partition_t partition, size_t *num); + +/// Returns a list of output logical tensors from a partition. +/// +/// @param partition The target partition. +/// @param num The number of output logical tensors. +/// @param outputs The list of output logical tensors. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_partition_get_output_ports( + const_dnnl_graph_partition_t partition, size_t num, + dnnl_graph_logical_tensor_t *outputs); + +/// Returns the supporting status of a partition. Some operations may not be +/// supported by the library under certain circumstances. During partitioning +/// stage, unsupported partitions will be returned to users with each containing +/// an unsupported operation. Users should check the supporting status of a +/// partition before transforming the computation graph or compiling the +/// partition. +/// +/// @param partition The target partition. +/// @param is_supported Output flag to indicate the supporting status. 0 means +/// unsupported while 1 means supported. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_is_supported( + const_dnnl_graph_partition_t partition, uint8_t *is_supported); + +/// Returns the engine kind of a partition. +/// +/// @param partition The target partition. +/// @param kind The output engine kind. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_partition_get_engine_kind( + const_dnnl_graph_partition_t partition, dnnl_engine_kind_t *kind); + +/// @} dnnl_graph_api_partition + +/// @addtogroup dnnl_graph_api_compiled_partition +/// @{ + +/// Creates a new compiled partition handle. +/// +/// @param compiled_partition The handle of output compiled partition. +/// @param partition The handle of input partition. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_compiled_partition_create( + dnnl_graph_compiled_partition_t *compiled_partition, + dnnl_graph_partition_t partition); + +/// Executes a compiled partition. +/// +/// @param compiled_partition The handle of target compiled partition. +/// @param stream The stream used for execution. 
+/// @param num_inputs The number of input tensors. +/// @param inputs A list of input tensors. +/// @param num_outputs The number of output tensors. +/// @param outputs A non-empty list of output tensors. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_compiled_partition_execute( + const_dnnl_graph_compiled_partition_t compiled_partition, + dnnl_stream_t stream, size_t num_inputs, + const_dnnl_graph_tensor_t *inputs, size_t num_outputs, + const_dnnl_graph_tensor_t *outputs); + +/// Destroys a compiled partition. +/// +/// @param compiled_partition The compiled partition to be destroyed. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_compiled_partition_destroy( + dnnl_graph_compiled_partition_t compiled_partition); + +/// Queries an input or output logical tensor according to tensor ID. If the +/// tensor ID doesn't belong to any input or output of the compiled partition, +/// an error status #dnnl_invalid_arguments will be returned by the API. +/// +/// @param compiled_partition The handle of target compiled_partition. +/// @param tid The unique id of required tensor. +/// @param lt The output logical tensor. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_compiled_partition_query_logical_tensor( + const_dnnl_graph_compiled_partition_t compiled_partition, size_t tid, + dnnl_graph_logical_tensor_t *lt); + +/// Returns the hint of in-place pairs from a compiled partition. It indicates +/// that an input and an output of the partition can share the same memory +/// buffer for computation. In-place computation helps to reduce the memory +/// footprint and improves cache locality. 
But since the library may not have a +/// global view of user's application, it's possible that the tensor with +/// `input_id` is used at other places in user's computation graph. In this +/// case, the user should take the in-place pair as a hint and pass a different +/// memory buffer for output tensor to avoid overwriting the input memory buffer +/// which will probably cause unexpected incorrect results. +/// +/// @param compiled_partition The handle of target compiled_partition. +/// @param num_inplace_pairs The number of in-place pairs. +/// @param inplace_pairs The handle of in-place pairs. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_compiled_partition_get_inplace_ports( + const_dnnl_graph_compiled_partition_t compiled_partition, + size_t *num_inplace_pairs, + const dnnl_graph_inplace_pair_t **inplace_pairs); + +/// @} dnnl_graph_api_compiled_partition + +/// @addtogroup dnnl_graph_api_graph +/// @{ + +/// Creates a new empty graph. A graph is associated to a specific engine kind. +/// The partitions returned from the graph will inherit the engine kind of the +/// graph. +/// +/// @param graph The handle of output graph. +/// @param engine_kind The target engine kind. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_create( + dnnl_graph_graph_t *graph, dnnl_engine_kind_t engine_kind); + +/// Creates a new empty graph with an engine kind and a floating-point math +/// mode. All partitions returned from the graph will inherit the engine kind +/// and floating-point math mode. +/// +/// @param graph The handle of output graph. +/// @param engine_kind The kind for engine. +/// @param mode The floating-point math mode. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_graph_graph_create_with_fpmath_mode( + dnnl_graph_graph_t *graph, dnnl_engine_kind_t engine_kind, + dnnl_fpmath_mode_t mode); + +/// Destroys a graph. +/// +/// @param graph The graph to be destroyed. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_destroy(dnnl_graph_graph_t graph); + +/// Set the floating point math mode for a graph. +/// +/// @param graph The target graph. +/// @param mode The floating-point math mode. +/// @param apply_to_int The flag that controls whether to use floating-point +/// arithmetic for integral operations. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_set_fpmath_mode( + dnnl_graph_graph_t graph, dnnl_fpmath_mode_t mode, int apply_to_int); + +/// Get the floating point math mode for a graph. +/// +/// @param graph The target graph. +/// @param mode The floating-point math mode. +/// @param apply_to_int The flag that controls whether to use floating-point +/// arithmetic for integral operations. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_get_fpmath_mode( + dnnl_graph_graph_t graph, dnnl_fpmath_mode_t *mode, int *apply_to_int); + +/// Adds an operation into a graph. The API will return failure if the operator +/// has already been added to the graph or the operation cannot pass the schema +/// check in the library (eg. input and output numbers and data types, the +/// attributes of the operation, etc.). +/// +/// @param graph The target graph. +/// @param op The operation to be added. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_add_op( + dnnl_graph_graph_t graph, dnnl_graph_op_t op); + +/// Finalizes a graph. 
It means users have finished adding operations into the +/// graph and the graph is ready for partitioning. Adding a new operation into a +/// finalized graph will return failures. Similarly, partitioning on an +/// un-finalized graph will also return failures. +/// +/// @param graph The target graph to be finalized. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_finalize(dnnl_graph_graph_t graph); + +/// Checks if a graph is finalized. +/// +/// @param graph The target graph to be checked. +/// @param finalized Output the finalization status. 0 means the graph is not +/// finalized. Other values mean the graph is finalized. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_is_finalized( + dnnl_graph_graph_t graph, uint8_t *finalized); + +/// Filters a graph. Partitions will be claimed internally according to the +/// capability of the library, the engine kind, and the policy. +/// +/// @param graph The target graph. +/// @param policy The partition policy. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_filter( + dnnl_graph_graph_t graph, dnnl_graph_partition_policy_t policy); + +/// Returns the number of partitions of a graph. The API should be called after +/// the graph is already filtered. Otherwise, the output number is zero. +/// +/// @param graph The graph. +/// @param num Output the number of partitions. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_get_partition_num( + const_dnnl_graph_graph_t graph, size_t *num); + +/// Returns the partitions from a filtered graph. Output partition instances +/// will be written into the parameter `partitions`. 
Users need to make sure +/// `partitions` is valid and has enough space to accept the partition +/// instances. Each output partition instance should be destroyed via +/// #dnnl_graph_partition_destroy explicitly after use. +/// +/// @param graph The target graph. +/// @param num The number of partitions. +/// @param partitions Output the partitions. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_get_partitions(dnnl_graph_graph_t graph, + size_t num, dnnl_graph_partition_t *partitions); + +/// @} dnnl_graph_api_graph + +/// @addtogroup dnnl_graph_api_compiled_partition_cache +/// @{ + +/// Returns the number of compiled partitions that can be held in the compiled +/// partition cache at the same time. +/// +/// @param capacity Compiled partition cache capacity to query. Concurrently +/// accessing @p capacity is safe. +/// @returns #dnnl_invalid_arguments if the @p capacity value +/// is invalid, and #dnnl_success on success. +dnnl_status_t DNNL_API dnnl_graph_get_compiled_partition_cache_capacity( + int *capacity); + +/// Sets a number of compiled partitions that can be held in the compiled +/// partition cache at the same time. The default capacity of compiled partition +/// cache is 1024. +/// +/// @param capacity Compiled partition cache capacity to set. The default cache +/// capacity is 1024. If a new @p capacity is less than a number of compiled +/// partition that the compiled partition cache already has, then the excess +/// entries will be evicted. Setting the @p capacity to 0 clears the compiled +/// partition cache and disables it. Concurrently modifying @p capacity is safe. +/// @returns #dnnl_invalid_arguments if the @p capacity value +/// is invalid, and #dnnl_success on success. 
+dnnl_status_t DNNL_API dnnl_graph_set_compiled_partition_cache_capacity( + int capacity); + +/// @} dnnl_graph_api_compiled_partition_cache + +/// @addtogroup dnnl_graph_api_constant_tensor_cache +/// @{ + +/// Control the enabling or disabling of constant tensor cache. This API must +/// be called once before compilation stage. By default, constant tensor cache is +/// disabled in the library. +/// +/// @param flag Set to positive value to enable the cache and set to 0 to +/// disable the cache. Negative values are invalid. +/// @returns #dnnl_invalid_arguments if the @p flag value is +/// invalid, and #dnnl_success on success. +/// @note This API is deprecated and will be removed in a future release; please +/// use the dnnl_graph_set_constant_tensor_cache_capacity API to disable +/// constant tensor cache by setting its capacity to zero. +dnnl_status_t DNNL_API dnnl_graph_set_constant_tensor_cache(int flag); + +/// Return the enabling or disabling status of constant tensor cache. +/// +/// @param flag The constant tensor cache enabling status to query. +/// @returns #dnnl_invalid_arguments if the @p flag value is +/// nullptr, and #dnnl_success on success. +/// @note This API is deprecated and will be removed in a future release; please +/// use the dnnl_graph_get_constant_tensor_cache_capacity API to check the +/// enabling status by checking its capacity. +dnnl_status_t DNNL_API dnnl_graph_get_constant_tensor_cache(int *flag); + +/// Control the capacity for the constant tensor cache that is used for a specific +/// engine kind. This API is thread safe and can be called multiple times at +/// runtime. The capacity is set to zero by default which means the cache is +/// disabled. When calling this API, the corresponding cache will be flushed. +/// Setting capacity to 0 means to clear all cached tensors and disable cache. +/// Once the capacity limit is reached, no new tensors will be cached. 
If there +/// are multiple devices for an engine kind, the capacity set here is for each +/// device. +/// +/// @param eng_kind The engine kind that the constant tensor cache is used for. +/// @param size The constant tensor cache capacity size to set. +/// @returns #dnnl_invalid_arguments if the @p eng_kind value is invalid, and +/// #dnnl_success on success. +dnnl_status_t DNNL_API dnnl_graph_set_constant_tensor_cache_capacity( + dnnl_engine_kind_t eng_kind, size_t size); + +/// Return the current capacity of constant tensor cache. +/// +/// @param eng_kind The engine kind that the constant tensor cache is used for. +/// @param size The constant tensor cache capacity size to query. +/// @returns #dnnl_invalid_arguments if the @p eng_kind value is +/// invalid or the @p size is nullptr, and #dnnl_success on success. +dnnl_status_t DNNL_API dnnl_graph_get_constant_tensor_cache_capacity( + dnnl_engine_kind_t eng_kind, size_t *size); + +/// @} dnnl_graph_api_constant_tensor_cache + +/// @} dnnl_graph_api + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c9db7c9bbb0742d3c564951c6f60aa80f80967e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.hpp @@ -0,0 +1,1639 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// Graph C++ API + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_HPP +#define ONEAPI_DNNL_DNNL_GRAPH_HPP + +#include "oneapi/dnnl/dnnl_common.hpp" +#include "oneapi/dnnl/dnnl_graph.h" + +#include +#include +#include +#include +#include + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_graph_api Graph API +/// oneDNN Graph API +/// @{ + +/// oneDNN Graph namespace +namespace graph { + +/// @cond DO_NOT_DOCUMENT_THIS + +// Alias for common engine and stream API. 
+using engine = dnnl::engine; +using stream = dnnl::stream; +using fpmath_mode = dnnl::fpmath_mode; + +/// @endcond + +/// @addtogroup dnnl_graph_api_utils Utilities +/// Utility types and definitions +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS + +/// A class that provides the destructor for a oneDNN graph C API handle. +template +struct graph_handle_traits : public dnnl::handle_traits {}; + +template <> +struct graph_handle_traits { + static dnnl_status_t destructor(dnnl_graph_op_t p) { + return dnnl_graph_op_destroy(p); + } +}; + +template <> +struct graph_handle_traits { + static dnnl_status_t destructor(dnnl_graph_graph_t p) { + return dnnl_graph_graph_destroy(p); + } +}; + +template <> +struct graph_handle_traits { + static dnnl_status_t destructor(dnnl_graph_tensor_t p) { + return dnnl_graph_tensor_destroy(p); + } +}; + +template <> +struct graph_handle_traits { + static dnnl_status_t destructor(dnnl_graph_partition_t p) { + return dnnl_graph_partition_destroy(p); + } +}; + +template <> +struct graph_handle_traits { + static dnnl_status_t destructor(dnnl_graph_compiled_partition_t p) { + return dnnl_graph_compiled_partition_destroy(p); + } +}; + +template <> +struct graph_handle_traits { + static dnnl_status_t destructor(dnnl_graph_allocator_t p) { + return dnnl_graph_allocator_destroy(p); + } +}; + +#define DNNL_GRAPH_HANDLE_ALIAS(type) \ + using type##_handle = dnnl::handle> + +DNNL_GRAPH_HANDLE_ALIAS(allocator); +DNNL_GRAPH_HANDLE_ALIAS(graph); +DNNL_GRAPH_HANDLE_ALIAS(op); +DNNL_GRAPH_HANDLE_ALIAS(tensor); +DNNL_GRAPH_HANDLE_ALIAS(compiled_partition); +DNNL_GRAPH_HANDLE_ALIAS(partition); + +#undef DNNL_GRAPH_HANDLE_ALIAS + +template +using req = typename std::enable_if::type; + +/// @endcond + +/// @} dnnl_graph_api_utils + +/// @addtogroup dnnl_graph_api_status Status +/// Definitions of status values returned by the library functions. +/// @{ + +/// Status values returned by the library functions. 
+enum class status { + /// The operation was successful + success = dnnl_success, + /// The operation failed due to an out-of-memory condition + out_of_memory = dnnl_out_of_memory, + /// The operation failed because of incorrect function arguments + invalid_arguments = dnnl_invalid_arguments, + /// The operation failed because requested functionality is not implemented + unimplemented = dnnl_unimplemented, + /// The last available implementation is reached + last_impl_reached = dnnl_last_impl_reached, + /// Primitive or engine failed on execution + runtime_error = dnnl_runtime_error, + /// Queried element is not required for given primitive + not_required = dnnl_not_required, + /// The graph is not legitimate + invalid_graph = dnnl_invalid_graph, + /// The operation is not legitimate according to op schema + invalid_graph_op = dnnl_invalid_graph_op, + /// The shape cannot be inferred or compiled + invalid_shape = dnnl_invalid_shape, + /// The data type cannot be inferred or compiled + invalid_data_type = dnnl_invalid_data_type, +}; + +/// @} dnnl_graph_api_status + +/// @addtogroup dnnl_graph_api_allocator Allocator +/// +/// Definitions of allocator which is used to acquire memory resources in +/// partition compilation and execution. SYCL allocator +/// (#dnnl::graph::sycl_interop::make_allocator) should be used for SYCL runtime +/// and host allocator should be used for non-SYCL. 
+/// +/// @{ + +/// Allocator +class allocator : public allocator_handle { +public: + using allocator_handle::handle; + + /// Constructs an allocator according to given function pointers + /// + /// @param host_malloc A pointer to malloc function for CPU + /// @param host_free A pointer to free function for CPU + allocator(dnnl_graph_host_allocate_f host_malloc, + dnnl_graph_host_deallocate_f host_free) { + dnnl_graph_allocator_t a = nullptr; + error::wrap_c_api( + dnnl_graph_allocator_create(&a, host_malloc, host_free), + "could not create allocator for cpu"); + reset(a); + } + + /// Default constructor + allocator() { + dnnl_graph_allocator_t a = nullptr; + error::wrap_c_api(dnnl_graph_allocator_create(&a, nullptr, nullptr), + "could not create allocator"); + reset(a); + } +}; + +/// @} dnnl_graph_api_allocator + +/// @addtogroup dnnl_graph_api_engine Engine +/// @{ + +/// This API is a supplement for existing onednn engine API. +inline engine make_engine_with_allocator( + engine::kind kind, size_t index, const allocator &alloc) { + dnnl_engine_t c_engine; + error::wrap_c_api( + dnnl_graph_make_engine_with_allocator(&c_engine, + static_cast(kind), index, alloc.get()), + "could not make an engine with allocator"); + return engine(c_engine); +} + +/// @} dnnl_graph_api_engine + +/// @addtogroup dnnl_graph_api_logical_tensor Logical Tensor +/// +/// Logical tensor describes the meta-data of the input or output tensor, like +/// elements data type, number of dimensions, size for each dimension (shape), +/// layout, and the property of the tensor. +/// +/// Each logical tensor has an unique ID. The library uses logical tensor IDs to +/// build up the connections between operations if the output of one operation +/// has the same ID as the input of another operation. The meta-data in a +/// logical tensor may be enriched in the framework graph as it progresses +/// toward final execution. 
For example, the library doesn't require detailed +/// shape information at the operation and graph creation stage. But shape +/// information of input logical tensor will be required at partition +/// compilation stage. Logical tensor is not mutable. Users must create a new +/// logical tensor with the same ID to pass any new additional information to +/// oneDNN Graph API. Please note that the library also has unique IDs for +/// operations. The ID should be unique among different logical tensors, but it +/// can have the same value between a logical tensor and an operation. +/// +/// @{ + +/// Logical tensor object +class logical_tensor { + friend class op; + friend class tensor; + friend class partition; + friend class compiled_partition; + + dnnl_graph_logical_tensor_t data; + +public: + /// Integer type for representing dimension sizes and indices. + using dim = dnnl_dim_t; + /// Vector of dimensions. Implementations are free to force a limit on the + /// vector's length. + using dims = std::vector; + + /// Data Type + enum class data_type { + undef = dnnl_data_type_undef, + /// 16-bit/half-precision floating point. + f16 = dnnl_f16, + /// non-standard 16-bit (bfloat16 w/ 7 bit mantissa) floating point. + bf16 = dnnl_bf16, + /// 32-bit/single-precision floating point. + f32 = dnnl_f32, + /// 32-bit signed integer. + s32 = dnnl_s32, + /// 8-bit signed integer. + s8 = dnnl_s8, + /// 8-bit unsigned integer. + u8 = dnnl_u8, + /// Boolean data type. Size is C++ implementation defined. + boolean = dnnl_boolean, + /// [OFP8 standard 8-bit + /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) + /// with a 5-bit exponent and a 2-bit mantissa. + f8_e5m2 = dnnl_f8_e5m2, + /// [OFP8 standard 8-bit + /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) + /// with a 4-bit exponent and a 3-bit mantissa. 
+ f8_e4m3 = dnnl_f8_e4m3, + /// 4-bit signed integer. + s4 = dnnl_s4, + /// 4-bit unsigned integer. + u4 = dnnl_u4, + }; + + /// Layout type + enum class layout_type { + /// Undefined layout type. + undef = dnnl_graph_layout_type_undef, + /// Any means to let the library to decide the layout for a tensor + /// during partition compilation. + any = dnnl_graph_layout_type_any, + /// Strided means that the layout of a tensor is determined by the + /// strides field in the logical tensor. + strided = dnnl_graph_layout_type_strided, + /// Opaque means that the layout of a tensor is the library specific. + /// Usually, an opaque layout is generated by a partition which is + /// compiled with layout type any. + opaque = dnnl_graph_layout_type_opaque, + }; + + /// Tensor property + enum class property_type { + /// Undefined tensor property. + undef = dnnl_graph_tensor_property_undef, + /// Variable means the tensor may be changed during computation or + /// between different iterations. + variable = dnnl_graph_tensor_property_variable, + /// Constant means the tensor will keep unchanged during computation and + /// between different iterations. It's useful for the library to apply + /// optimizations for constant tensors or cache constant tensors inside + /// the library. For example, constant weight tensors in inference + /// scenarios. + constant = dnnl_graph_tensor_property_constant, + }; + + /// default constructor + /// construct an empty object + logical_tensor() = default; + + /// Constructs a logical tensor object + explicit logical_tensor(const dnnl_graph_logical_tensor_t &c_data) + : data(c_data) {} + + /// Copy + logical_tensor(const logical_tensor &other) = default; + + /// Assign + logical_tensor &operator=(const logical_tensor &other) = default; + + /// Constructs a logical tensor object with ID, data type, ndims, layout + /// type, and property type. + /// + /// @param tid Logical tensor ID. + /// @param dtype Elements data type. 
+ /// @param ndims Number of dimensions. -1 means unknown (see + /// #DNNL_GRAPH_UNKNOWN_NDIMS) and 0 means a scalar tensor. + /// @param ltype Layout type. + /// @param ptype Property type. + logical_tensor(size_t tid, data_type dtype, int32_t ndims, + layout_type ltype, property_type ptype = property_type::undef) { + dnnl_graph_logical_tensor_t val; + error::wrap_c_api( + dnnl_graph_logical_tensor_init(&val, tid, convert_to_c(dtype), + ndims, convert_to_c(ltype), convert_to_c(ptype)), + "could not create logical_tensor with property"); + data = val; + } + + /// Delegated constructor. + /// + /// @param tid Logical tensor ID. + /// @param dtype Elements data type. + /// @param ltype Layout type. + logical_tensor( + size_t tid, data_type dtype, layout_type ltype = layout_type::undef) + : logical_tensor(tid, dtype, DNNL_GRAPH_UNKNOWN_NDIMS, ltype) {} + + /// Constructs a logical tensor object with basic information and detailed + /// dims. + /// + /// @param tid Logical tensor ID. + /// @param dtype Elements data type. + /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means + /// the size of that dimension is unknown. 0 is used to define + /// zero-dimension tensor. + /// @param ltype Layout type. If it's strided, the strides field in the + /// output logical tensor will be deduced accordingly. + /// @param ptype Property type. 
+ logical_tensor(size_t tid, data_type dtype, const dims &adims, + layout_type ltype, property_type ptype = property_type::undef) { + dnnl_graph_logical_tensor_t val; + // if dimension size equals to 0, it's a scalar + if (adims.size() == 0) + error::wrap_c_api(dnnl_graph_logical_tensor_init(&val, tid, + convert_to_c(dtype), 0, + convert_to_c(ltype), convert_to_c(ptype)), + "could not create logical_tensor with property"); + else + error::wrap_c_api( + dnnl_graph_logical_tensor_init_with_dims(&val, tid, + convert_to_c(dtype), + static_cast(adims.size()), adims.data(), + convert_to_c(ltype), convert_to_c(ptype)), + "could not create logical_tensor with dims and property"); + data = val; + } + + /// Constructs a logical tensor object with detailed dims and strides. The + /// layout_type of the output logical tensor object will always be strided. + /// + /// @param tid Logical tensor ID. + /// @param dtype Elements data type. + /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means + /// the size of that dimension is unknown. 0 is used to define + /// zero-dimension tensor. + /// @param strides Logical tensor strides. #DNNL_GRAPH_UNKNOWN_DIM means + /// the stride of the dimension is unknown. The library currently + /// doesn't support other negative stride values. + /// @param ptype Property type. + logical_tensor(size_t tid, data_type dtype, const dims &adims, + const dims &strides, property_type ptype = property_type::undef) { + dnnl_graph_logical_tensor_t val; + // TODO(lvtao): check the size of adims and strides. + // They should be same. + error::wrap_c_api( + dnnl_graph_logical_tensor_init_with_strides(&val, tid, + convert_to_c(dtype), static_cast(adims.size()), + adims.data(), strides.data(), convert_to_c(ptype)), + "could not create logical_tensor with strides and property"); + data = val; + } + + /// Constructs a logical tensor object with detailed dims and an opaque + /// layout ID. 
layout_type of the output logical tensor object will always + /// be opaque. + /// + /// @param tid Logical tensor ID. + /// @param dtype Elements data type. + /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means + /// the size of that dimension is unknown. 0 is used to define + /// zero-dimension tensor. + /// @param lid Opaque layout id. + /// @param ptype Property type + logical_tensor(size_t tid, data_type dtype, const dims &adims, size_t lid, + property_type ptype = property_type::undef) { + dnnl_graph_logical_tensor_t val; + + if (adims.size() == 0) { + error::wrap_c_api(dnnl_graph_logical_tensor_init(&val, tid, + convert_to_c(dtype), 0, + convert_to_c(layout_type::opaque), + convert_to_c(ptype)), + "could not create logical_tensor"); + } else { + error::wrap_c_api( + dnnl_graph_logical_tensor_init_with_dims(&val, tid, + convert_to_c(dtype), + static_cast(adims.size()), adims.data(), + convert_to_c(layout_type::opaque), + convert_to_c(ptype)), + "could not create logical_tensor with dims"); + } + + val.layout.layout_id = lid; + data = val; + } + + /// Returns dimensions of a logical tensor. + /// + /// @returns A vector describing the size of each dimension. + dims get_dims() const { + if (data.ndims < 0) { + error::wrap_c_api(dnnl_invalid_arguments, + "cannot return dims when ndims < 0"); + } + + return {data.dims, data.dims + data.ndims}; + } + + /// Returns the unique id of a logical tensor. + /// + /// @returns An integer value describing the ID. + size_t get_id() const { return data.id; } + + /// Returns the data type of a logical tensor. + /// + /// @returns The data type. + data_type get_data_type() const { + return static_cast(data.data_type); + } + + /// Returns the property type of a logical tensor. + /// + /// @returns The property type. + property_type get_property_type() const { + return static_cast(data.property); + } + + /// Returns the layout type of a logical tensor. + /// + /// @returns The layout type. 
+ layout_type get_layout_type() const { + return static_cast(data.layout_type); + } + + /// Returns the layout ID of a logical tensor. The API should be called on a + /// logical tensor with opaque layout type. Otherwise, an exception will be + /// raised. + /// + /// @returns Layout ID. + size_t get_layout_id() const { + if (get_layout_type() != layout_type::opaque) { + error::wrap_c_api( + dnnl_invalid_arguments, "layout type should be opaque"); + } + + return data.layout.layout_id; + } + + /// Returns the strides of a logical tensor. The API should be called on a + /// logical tensor with strided layout type. Otherwise, an exception will be + /// raised. + /// + /// @returns A vector describing the stride size of each dimension. + dims get_strides() const { + if (get_layout_type() != layout_type::strided) { + error::wrap_c_api( + dnnl_invalid_arguments, "layout type should be strided"); + } + + if (data.ndims < 0) { + error::wrap_c_api(dnnl_invalid_arguments, + "cannot return strides when ndims < 0"); + } + + return {data.layout.strides, data.layout.strides + data.ndims}; + } + + /// Returns memory size in bytes required by this logical tensor. + /// + /// @returns The memory size in bytes. + size_t get_mem_size() const { + size_t size = 0; + error::wrap_c_api(dnnl_graph_logical_tensor_get_mem_size(&data, &size), + "could not get memory size from the logical_tensor"); + return size; + } + + /// Compares if two logical tenors are equal. Users can decide accordingly + /// if layout reordering is needed for two logical tensors. The method will + /// return true for below two circumstances: + /// + /// 1. the two logical tensors are equal regarding each field in the struct, + /// eg. id, ndims, dims, layout type, property, etc. + /// 2. If all other fields are equal but the layout types in two logical + /// tensors are different, the method will return true when the underlying + /// memory layout is the same. 
For example, one logical tensor has strided + /// layout type while the other one has opaque layout type, but underneath, + /// both layouts are NHWC, the method will still return true for this case. + /// + /// @param lt The input logical tensor to be compared. + /// @returns @c true if the two logical tensors are equal. @c false otherwise + bool is_equal(const logical_tensor <) const { + uint8_t equal = 0; + error::wrap_c_api( + dnnl_graph_logical_tensor_is_equal(&data, <.data, &equal), + "could not compare between the two logical tensors"); + return equal != 0; + } + +private: + static dnnl_data_type_t convert_to_c(data_type dtype) { + return static_cast(dtype); + } + + static dnnl_graph_layout_type_t convert_to_c(layout_type ltype) { + return static_cast(ltype); + } + + static dnnl_graph_tensor_property_t convert_to_c(property_type ptype) { + return static_cast(ptype); + } +}; + +/// @} dnnl_graph_api_logical_tensor + +/// @addtogroup dnnl_graph_api_tensor Tensor +/// +/// Tensor is an abstraction for multi-dimensional input and output data needed +/// in the execution of a compiled partition. A tensor object encapsulates a +/// handle to a memory buffer allocated on a specific engine and a logical +/// tensor which describes the dimensions, elements data type, and memory +/// layout. +/// +/// @{ + +/// A tensor object +class tensor : public tensor_handle { +public: + /// Default constructor. Constructs an empty object. + tensor() = default; + + /// Constructs a tensor object according to a given logical tensor, an + /// engine, and a memory handle. + /// + /// @param lt The given logical tensor + /// @param aengine Engine to store the data on. + /// @param handle Handle of memory buffer to use as an underlying storage. + /// - A pointer to the user-allocated buffer. In this case the library + /// doesn't own the buffer. + /// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to + /// allocate the buffer for the tensor. 
In this case the library + /// owns the buffer. + /// - DNNL_MEMORY_NONE to create tensor without an underlying buffer. + tensor(const logical_tensor <, const engine &aengine, void *handle) { + dnnl_graph_tensor_t t = nullptr; + error::wrap_c_api( + dnnl_graph_tensor_create(&t, &(lt.data), aengine.get(), handle), + "could not create tensor object with the logical_tensor, " + "engine, and handle"); + reset(t); + } + + /// Constructs a tensor object. + /// The underlying buffer for the memory will be allocated by the library. + /// + /// @param lt The given logical tensor + /// @param aengine Engine to store the data on. + tensor(const logical_tensor <, const engine &aengine) + : tensor(lt, aengine, DNNL_MEMORY_ALLOCATE) {} + + /// Returns the underlying memory buffer. + /// + /// On the CPU engine, or when using USM, this is a pointer to the + /// allocated memory. + void *get_data_handle() const { + void *handle = nullptr; + error::wrap_c_api(dnnl_graph_tensor_get_data_handle(get(), &handle), + "could not get data handle from the tensor"); + return handle; + } + + /// Sets the underlying memory handle. + /// + /// @param handle Memory handle. + void set_data_handle(void *handle) { + error::wrap_c_api(dnnl_graph_tensor_set_data_handle(get(), handle), + "setting data handle to the tensor failed"); + } + + /// Returns the associated engine. + /// + /// @returns An engine object + engine get_engine() const { + dnnl_engine_t c_engine = nullptr; + error::wrap_c_api(dnnl_graph_tensor_get_engine(get(), &c_engine), + "could not get an engine from a tensor object"); + return engine(c_engine, true); + } + + /// Returns the logical tensor of a tensor object. + /// + /// @returns A logical_tensor object. 
+ logical_tensor get_logical_tensor() const { + dnnl_graph_logical_tensor_t lt; + error::wrap_c_api(dnnl_graph_tensor_get_logical_tensor(get(), <), + "could not get logical tensor from a tensor object"); + return logical_tensor(lt); + } +}; + +/// @} dnnl_graph_api_tensor + +/// @addtogroup dnnl_graph_api_compiled_partition Compiled Partition +/// +/// A compiled partition represents the generated kernels specialized for a +/// partition on a target hardware (engine) with input and output information +/// specified by the logical tensors. +/// +/// @{ + +/// A compiled partition object. +class compiled_partition : public compiled_partition_handle { +public: + /// Default constructor. Constructs an empty object. + compiled_partition() = default; + + /// Constructs a compiled partition object + compiled_partition(dnnl_graph_compiled_partition_t compiled_partition) { + reset(compiled_partition, false); + } + + /// Queries an input or output logical tensor according to tensor ID. If the + /// tensor ID doesn't belong to any input or output of the compiled + /// partition, an exception will be raised by the API. + /// + /// @param tid The unique id of required tensor. + /// @returns The logical tensor. + logical_tensor query_logical_tensor(size_t tid) const { + dnnl_graph_logical_tensor_t lt; + error::wrap_c_api(dnnl_graph_compiled_partition_query_logical_tensor( + get(), tid, <), + "query logical tensor from compiled_partition failed"); + return logical_tensor {lt}; + } + + /// Returns the hint of in-place pairs from a compiled partition. It + /// indicates that an input and an output of the partition can share the + /// same memory buffer for computation. In-place computation helps to reduce + /// the memory footprint and improves cache locality. But since the library + /// may not have a global view of user's application, it's possible that the + /// input tensor is used at other places in user's computation graph. 
In + /// this case, the user should take the in-place pair as a hint and pass a + /// different memory buffer for output tensor to avoid overwriting the input + /// memory buffer which will probably cause unexpected incorrect results. + /// + /// @returns A list of pairs of input and output IDs. + std::vector> get_inplace_ports() const { + size_t num = 0; + const dnnl_graph_inplace_pair_t *inplace_pairs; + + error::wrap_c_api(dnnl_graph_compiled_partition_get_inplace_ports( + get(), &num, &inplace_pairs), + "could not get the in-place pairs from a compiled partition"); + if (num == 0) return {}; + + std::vector> inplace_options; + inplace_options.reserve(num); + for (size_t i = 0; i < num; ++i) { + const dnnl_graph_inplace_pair_t *inplace_pair = inplace_pairs + i; + inplace_options.emplace_back( + inplace_pair->input_id, inplace_pair->output_id); + } + return inplace_options; + } + + /// Execute a compiled partition. + /// + /// @param astream Stream object to run over. + /// @param inputs A list of input tensors. + /// @param outputs A list of output tensors. + void execute(stream &astream, const std::vector &inputs, + const std::vector &outputs) const { + std::vector c_inputs; + c_inputs.reserve(inputs.size()); + for (auto &in : inputs) { + c_inputs.push_back(in.get()); + } + std::vector c_outputs; + c_outputs.reserve(outputs.size()); + for (auto &out : outputs) { + c_outputs.push_back(out.get()); + } + + error::wrap_c_api( + dnnl_graph_compiled_partition_execute(get(), astream.get(), + c_inputs.size(), c_inputs.data(), c_outputs.size(), + c_outputs.data()), + "could not execute the compiled_partition"); + } +}; + +/// @} dnnl_graph_api_compiled_partition + +/// @addtogroup dnnl_graph_api_op Op +/// +/// OP is an abstraction of computation logic for deep neural network +/// operations. 
An op object encapsulates an operation kind which describes the +/// computation logic, an unique ID which differentiates operations with the +/// same kind, and logical tensors which describes the input and output of the +/// operation and its connections to other operations in the graph. +/// +/// @{ + +/// An op object. +class op : public op_handle { +public: + /// Kinds of operations + enum class kind { + Abs = dnnl_graph_op_abs, + AbsBackward = dnnl_graph_op_abs_backward, + Add = dnnl_graph_op_add, + AvgPool = dnnl_graph_op_avg_pool, + AvgPoolBackward = dnnl_graph_op_avg_pool_backward, + BatchNormForwardTraining = dnnl_graph_op_batch_norm_forward_training, + BatchNormInference = dnnl_graph_op_batch_norm_inference, + BatchNormTrainingBackward = dnnl_graph_op_batch_norm_backward, + BiasAdd = dnnl_graph_op_bias_add, + BiasAddBackward = dnnl_graph_op_bias_add_backward, + Clamp = dnnl_graph_op_clamp, + ClampBackward = dnnl_graph_op_clamp_backward, + Concat = dnnl_graph_op_concat, + Convolution = dnnl_graph_op_convolution, + ConvolutionBackwardData = dnnl_graph_op_convolution_backward_data, + ConvolutionBackwardWeights = dnnl_graph_op_convolution_backward_weights, + ConvTranspose = dnnl_graph_op_conv_transpose, + ConvTransposeBackwardData = dnnl_graph_op_conv_transpose_backward_data, + ConvTransposeBackwardWeights + = dnnl_graph_op_conv_transpose_backward_weights, + Dequantize = dnnl_graph_op_dequantize, + Divide = dnnl_graph_op_divide, + DynamicDequantize = dnnl_graph_op_dynamic_dequantize, + DynamicQuantize = dnnl_graph_op_dynamic_quantize, + Elu = dnnl_graph_op_elu, + EluBackward = dnnl_graph_op_elu_backward, + End = dnnl_graph_op_end, + Exp = dnnl_graph_op_exp, + GELU = dnnl_graph_op_gelu, + GELUBackward = dnnl_graph_op_gelu_backward, + GroupNorm = dnnl_graph_op_group_norm, + HardSigmoid = dnnl_graph_op_hard_sigmoid, + HardSigmoidBackward = dnnl_graph_op_hard_sigmoid_backward, + HardSwish = dnnl_graph_op_hard_swish, + HardSwishBackward = 
dnnl_graph_op_hard_swish_backward, + Interpolate = dnnl_graph_op_interpolate, + InterpolateBackward = dnnl_graph_op_interpolate_backward, + LayerNorm = dnnl_graph_op_layer_norm, + LayerNormBackward = dnnl_graph_op_layer_norm_backward, + LeakyReLU = dnnl_graph_op_leaky_relu, + Log = dnnl_graph_op_log, + LogSoftmax = dnnl_graph_op_log_softmax, + LogSoftmaxBackward = dnnl_graph_op_log_softmax_backward, + MatMul = dnnl_graph_op_matmul, + Maximum = dnnl_graph_op_maximum, + MaxPool = dnnl_graph_op_max_pool, + MaxPoolBackward = dnnl_graph_op_max_pool_backward, + Minimum = dnnl_graph_op_minimum, + Mish = dnnl_graph_op_mish, + MishBackward = dnnl_graph_op_mish_backward, + Multiply = dnnl_graph_op_multiply, + Pow = dnnl_graph_op_pow, + PReLU = dnnl_graph_op_prelu, + PReLUBackward = dnnl_graph_op_prelu_backward, + Quantize = dnnl_graph_op_quantize, + Reciprocal = dnnl_graph_op_reciprocal, + ReduceL1 = dnnl_graph_op_reduce_l1, + ReduceL2 = dnnl_graph_op_reduce_l2, + ReduceMax = dnnl_graph_op_reduce_max, + ReduceMean = dnnl_graph_op_reduce_mean, + ReduceMin = dnnl_graph_op_reduce_min, + ReduceProd = dnnl_graph_op_reduce_prod, + ReduceSum = dnnl_graph_op_reduce_sum, + ReLU = dnnl_graph_op_relu, + ReLUBackward = dnnl_graph_op_relu_backward, + Reorder = dnnl_graph_op_reorder, + Round = dnnl_graph_op_round, + Select = dnnl_graph_op_select, + Sigmoid = dnnl_graph_op_sigmoid, + SigmoidBackward = dnnl_graph_op_sigmoid_backward, + SoftMax = dnnl_graph_op_softmax, + SoftMaxBackward = dnnl_graph_op_softmax_backward, + SoftPlus = dnnl_graph_op_softplus, + SoftPlusBackward = dnnl_graph_op_softplus_backward, + Sqrt = dnnl_graph_op_sqrt, + SqrtBackward = dnnl_graph_op_sqrt_backward, + Square = dnnl_graph_op_square, + SquaredDifference = dnnl_graph_op_squared_difference, + StaticReshape = dnnl_graph_op_static_reshape, + StaticTranspose = dnnl_graph_op_static_transpose, + Subtract = dnnl_graph_op_subtract, + Tanh = dnnl_graph_op_tanh, + TanhBackward = dnnl_graph_op_tanh_backward, + TypeCast = 
dnnl_graph_op_type_cast, + Wildcard = dnnl_graph_op_wildcard, + GenIndex = dnnl_graph_op_gen_index, + GreaterEqual = dnnl_graph_op_greater_equal, + // Sentinel + LastSymbol = dnnl_graph_op_last_symbol, + }; + + /// Attributes of operations. Different operations support different + /// attributes. Check the document of each operation for what attributes are + /// supported and what are the potential values for them. Missing required + /// attribute or illegal attribute value may lead to failure when adding the + /// operation to a graph. + enum class attr { + /// Undefined op attribute. + undef = dnnl_graph_op_attr_undef, + + // float32 attributes. The value of these attributes can be any single + // float32 number. + + /// Specifies an alpha attribute to an op. + alpha = dnnl_graph_op_attr_alpha, + /// Specifies an beta attribute to an op. + beta = dnnl_graph_op_attr_beta, + /// Specifies an epsilon attribute to an op. + epsilon = dnnl_graph_op_attr_epsilon, + /// Specifies a max attribute to an op. + max = dnnl_graph_op_attr_max, + /// Specifies a min attribute to an op. + min = dnnl_graph_op_attr_min, + /// Specifies a momentum attribute to an op. + momentum = dnnl_graph_op_attr_momentum, + + // float32 vector attributes. The value of these attributes can be a + // vector of float32 numbers. + + /// Specifies a scales attribute to an op. + scales = dnnl_graph_op_attr_scales, + + // int64_t attributes. The value of these attributes can be any single + // int64 number. + + /// Specifies an axis attribute to an op. + axis = dnnl_graph_op_attr_axis, + /// Specifies a begin_norm_axis attribute to an op. + begin_norm_axis = dnnl_graph_op_attr_begin_norm_axis, + /// Specifies a groups attribute to an op. + groups = dnnl_graph_op_attr_groups, + + // int64_t vector attributes. The value of these attributes can be a + // vector of int64 numbers. + + /// Specifies an axes attribute to an op. + axes = dnnl_graph_op_attr_axes, + /// Specifies a dilations attribute to an op. 
+ dilations = dnnl_graph_op_attr_dilations, + /// Specifies an dst_shape attribute to an op. + dst_shape = dnnl_graph_op_attr_dst_shape, + /// Specifies a kernel attribute to an op. + kernel = dnnl_graph_op_attr_kernel, + /// Specifies an order attribute to an op. + order = dnnl_graph_op_attr_order, + /// Specifies an output_padding attribute to an op. + output_padding = dnnl_graph_op_attr_output_padding, + /// Specifies a pads_begin attribute to an op. + pads_begin = dnnl_graph_op_attr_pads_begin, + /// Specifies a pads_end attribute to an op. + pads_end = dnnl_graph_op_attr_pads_end, + /// Specifies a shape attribute to an op. + shape = dnnl_graph_op_attr_shape, + /// Specifies a sizes attribute to an op. + sizes = dnnl_graph_op_attr_sizes, + /// Specifies an src_shape attribute to an op. + src_shape = dnnl_graph_op_attr_src_shape, + /// Specifies a strides attribute to an op. + strides = dnnl_graph_op_attr_strides, + /// Specifies a weight_shape attribute to an op. + weights_shape = dnnl_graph_op_attr_weights_shape, + /// Specifies a zps attribute to an op. + zps = dnnl_graph_op_attr_zps, + /// Specifies the group shape of an op. The size of the vector should + /// match that of the input. For the dimensions where the grouped + /// quantization occurs, the values should correspond to the group + /// size, which indicates the number of elements that will share the + /// same scaling factor. + group_shape = dnnl_graph_op_attr_group_shape, + + // bool attributes. The value of these attributes can be any single bool + // value. + + /// Specifies an exclude_pad attribute to an op. + exclude_pad = dnnl_graph_op_attr_exclude_pad, + /// Specifies a keep_dims attribute to an op. + keep_dims = dnnl_graph_op_attr_keep_dims, + /// Specifies a keep_stats attribute to an op. + keep_stats = dnnl_graph_op_attr_keep_stats, + /// Specifies a per_channel_broadcast attribute to an op. 
+ per_channel_broadcast = dnnl_graph_op_attr_per_channel_broadcast, + /// Specifies a special_zero attribute to an op. + special_zero = dnnl_graph_op_attr_special_zero, + /// Specifies a transpose_a attribute to an op. + transpose_a = dnnl_graph_op_attr_transpose_a, + /// Specifies a transpose_b attribute to an op. + transpose_b = dnnl_graph_op_attr_transpose_b, + /// Specifies an use_affine attribute to an op. + use_affine = dnnl_graph_op_attr_use_affine, + /// Specifies an use_dst attribute to an op. + use_dst = dnnl_graph_op_attr_use_dst, + + // string attributes. The value of these attributes can be a string. + + /// Specifies an auto_broadcast attribute to an op. The value can be + /// "none" or "numpy". + auto_broadcast = dnnl_graph_op_attr_auto_broadcast, + /// Specifies an auto_pad attribute to an op. The value can be "none", + /// "same_upper", "same_lower", or "valid". + auto_pad = dnnl_graph_op_attr_auto_pad, + /// Specifies an coordinate_transformation_mode attribute to an op. The + /// value can be "half_pixel" or "align_corners". The attribute is + /// defined for Interpolate operations. + coordinate_transformation_mode + = dnnl_graph_op_attr_coordinate_transformation_mode, + /// Specifies a data_format of an op. The value can be "NCX" or "NXC". + data_format = dnnl_graph_op_attr_data_format, + /// Specifies a mode attribute of an op. The value can be "nearest", + /// "linear", "bilinear", or "trilinear". The attribute is defined for + /// Interpolate operations. + mode = dnnl_graph_op_attr_mode, + /// Specifies a qtype attribute to an op. The value can be "per_channel" + /// or "per_tensor". The attribute is defined for quantization + /// operations. + qtype = dnnl_graph_op_attr_qtype, + /// Specifies a rounding_type attribute to an op. The value can be + /// "ceil" or "floor". + rounding_type = dnnl_graph_op_attr_rounding_type, + /// Specifies a weights_format of an op. The value can be "OIX", "XIO", + /// "IOX", or "XOI". 
Different operations may support different values. + weights_format = dnnl_graph_op_attr_weights_format, + + /// Specifies the end of all above exteral attributes for check. + end = dnnl_graph_op_attr_end, + }; + + /// Constructs an op object with an unique ID, an operation kind, and a name + /// string. + /// + /// @param id The unique ID of the op. + /// @param akind The op kind specifies which computation is represented by + /// the op, such as Convolution or ReLU. + /// @param verbose_name The string added as the op name. + op(size_t id, kind akind, const std::string &verbose_name = "") { + dnnl_graph_op_t op = nullptr; + error::wrap_c_api(dnnl_graph_op_create(&op, id, convert_to_c(akind), + verbose_name.c_str()), + "could not create op with id and op kind"); + reset(op); + } + + /// Constructs an op object with an unique ID, an operation kind, and + /// input/output logical tensors. + /// + /// @param id The unique ID of this op. + /// @param akind The op kind specifies which computation is represented by + /// this op, such as Convolution or ReLU. + /// @param inputs Input logical tensor to be bound to this op. + /// @param outputs Output logical tensor to be bound to this op. + /// @param verbose_name The string added as the op name. + op(size_t id, kind akind, const std::vector &inputs, + const std::vector &outputs, + const std::string &verbose_name = "") + : op(id, akind, verbose_name) { + for (const auto &input : inputs) { + error::wrap_c_api(dnnl_graph_op_add_input(get(), &(input.data)), + "adding input to the op failed"); + } + for (const auto &output : outputs) { + error::wrap_c_api(dnnl_graph_op_add_output(get(), &(output.data)), + "adding output to the op failed"); + } + } + + /// Adds an input logical tensor to the op. + /// + /// @param t Input logical tensor. 
+ void add_input(const logical_tensor &t) { + error::wrap_c_api(dnnl_graph_op_add_input(get(), &(t.data)), + "adding input to the op failed"); + } + + /// Adds a vector of input logical tensors to the op. + /// + /// @param ts The list of input logical tensors. + void add_inputs(const std::vector &ts) { + for (const auto &t : ts) { + error::wrap_c_api(dnnl_graph_op_add_input(get(), &(t.data)), + "adding input to the op failed"); + } + } + + /// Adds an output logical tensor to the op. + /// + /// @param t Output logical tensor. + void add_output(const logical_tensor &t) { + error::wrap_c_api(dnnl_graph_op_add_output(get(), &(t.data)), + "adding output to the op failed"); + } + + /// Adds a vector of output logical tensors to the op. + /// + /// @param ts The list of output logical tensors. + void add_outputs(const std::vector &ts) { + for (const auto &t : ts) { + error::wrap_c_api(dnnl_graph_op_add_output(get(), &(t.data)), + "adding output to the op failed"); + } + } + + /// Sets the attribute according to the name and type (int64_t). + /// + /// @tparam Type_i Attribute's type. + /// @param name Attribute's name. + /// @param value The attribute's value. + /// @returns The Op self. + template ::value> = true> + op &set_attr(attr name, const Type_i &value) { + dnnl_graph_op_attr_t attr = convert_to_c(name); + error::wrap_c_api(dnnl_graph_op_set_attr_s64(get(), attr, &value, 1), + "could not set attribute to the op"); + return *this; + } + + /// Sets the attribute according to the name and type (float). + /// + /// @tparam Type_f Attribute's type. + /// @param name Attribute's name. + /// @param value The attribute's value. + /// @returns The Op self. 
+ template ::value> = true> + op &set_attr(attr name, const Type_f &value) { + dnnl_graph_op_attr_t attr = convert_to_c(name); + error::wrap_c_api(dnnl_graph_op_set_attr_f32(get(), attr, &value, 1), + "could not set attribute to the op"); + return *this; + } + + /// Sets the attribute according to the name and type (bool). + /// + /// @tparam Type_b Attribute's type. + /// @param name Attribute's name. + /// @param value The attribute's value. + /// @returns The Op self. + template ::value> = true> + op &set_attr(attr name, const Type_b &value) { + dnnl_graph_op_attr_t attr = convert_to_c(name); + const uint8_t val = value; + error::wrap_c_api(dnnl_graph_op_set_attr_bool(get(), attr, &val, 1), + "could not set attribute to the op"); + return *this; + } + + /// Sets the attribute according to the name and type (string). + /// + /// @tparam Type_s Attribute's type. + /// @param name Attribute's name. + /// @param value The attribute's value. + /// @returns The Op self. + template ::value> = true> + op &set_attr(attr name, const Type_s &value) { + dnnl_graph_op_attr_t attr = convert_to_c(name); + error::wrap_c_api(dnnl_graph_op_set_attr_str( + get(), attr, value.c_str(), value.size()), + "could not set attribute to the op"); + return *this; + } + + /// Sets the attribute according to the name and type + /// (std::vector). + /// + /// @tparam Type_is Attribute's type. + /// @param name Attribute's name. + /// @param value The attribute's value. + /// @returns The Op self. + template >::value> = true> + op &set_attr(attr name, const Type_is &value) { + dnnl_graph_op_attr_t attr = convert_to_c(name); + error::wrap_c_api(dnnl_graph_op_set_attr_s64( + get(), attr, value.data(), value.size()), + "could not set attribute to the op"); + return *this; + } + + /// Sets the attribute according to the name and type (std::vector). + /// + /// @tparam Type_fs Attribute's type. + /// @param name Attribute's name. + /// @param value The attribute's value. + /// @returns The Op self. 
+ template >::value> = true> + op &set_attr(attr name, const Type_fs &value) { + dnnl_graph_op_attr_t attr = convert_to_c(name); + error::wrap_c_api(dnnl_graph_op_set_attr_f32( + get(), attr, value.data(), value.size()), + "could not set attribute to the op"); + return *this; + } + +private: + dnnl_graph_op_kind_t convert_to_c(kind akind) { + return static_cast(akind); + } + + dnnl_graph_op_attr_t convert_to_c(attr aattr) { + return static_cast(aattr); + } +}; + +/// @} dnnl_graph_api_op + +/// @addtogroup dnnl_graph_api_partition Partition +/// +/// Partition represents a collection of operations and their input and output +/// logical tensors identified by library as the basic unit for compilation and +/// execution. +/// +/// @{ + +/// A partition object. +class partition : public partition_handle { +public: + /// Policy specifications for partitioning. + enum class policy { + /// Fusion policy returns partitions with typical post-op fusions, eg. + /// Convolution + ReLU or other element-wise operations or a chian of + /// post-ops. + fusion = dnnl_graph_partition_policy_fusion, + /// Debug policy doesn't not apply any fusions. It returns partitions + /// with single operations in each partition. The policy is useful when + /// users notice any bug or correctness issue in fusion policy. + debug = dnnl_graph_partition_policy_debug, + }; + + partition() = default; + + /// Constructs a partition object + /// + /// @param p A raw pointer to the C API handle + partition(dnnl_graph_partition_t p) { reset(p, false); } + + /// Creates a new partition with a given operator and engine kind. The API + /// is used to create a partition from an operation directly without + /// creating the graph and calling `get_partitions()`. The output partition + /// contains only one operation. + /// + /// @param aop An operation used to create the partition. + /// @param ekind Engine kind. 
+ partition(const op &aop, engine::kind ekind) { + dnnl_graph_partition_t p = nullptr; + error::wrap_c_api(dnnl_graph_partition_create_with_op(&p, aop.get(), + static_cast(ekind)), + "could not create a partition with the op and engine kind"); + reset(p); + } + + /// Returns the number of operations contained in the partition. + /// + /// @returns Number of operations. + size_t get_ops_num() const { + size_t num {0}; + error::wrap_c_api(dnnl_graph_partition_get_op_num(get(), &num), + "could not get number of ops from the partition"); + return num; + } + + /// Returns all operation IDs contained in the partition. + /// + /// @returns An unordered set of operation IDs. + std::vector get_ops() const { + auto num = get_ops_num(); + std::vector ops(num); + + error::wrap_c_api(dnnl_graph_partition_get_ops(get(), num, ops.data()), + "could not get op ids from the partition"); + return ops; + } + + /// Returns the unique ID of the partition. Partition ID is generated by the + /// library internally. The ID can be used for debugging purpose or verbose. + /// + /// @returns ID of the partition. + size_t get_id() const { + size_t id {}; + error::wrap_c_api(dnnl_graph_partition_get_id(get(), &id), + "could not get id of the partition"); + return id; + } + + /// Compiles a partition with given input and output logical tensors. The + /// output logical tensors can contain unknown dimensions. For this case, + /// the compilation will deduce the output shapes according to input shapes. + /// The output logical tensors can also have layout type `any`. The + /// compilation will choose the optimal layout for output tensors. The + /// optimal layout will be represented as an opaque layout ID saved in the + /// output logical tensor. + /// + /// @param inputs A list of input logical tensors. + /// @param outputs A list of output logical tensors. + /// @param e The engine used to compile the partition. + /// @returns A compiled partition. 
+ compiled_partition compile(const std::vector &inputs, + const std::vector &outputs, const engine &e) const { + if (!is_supported()) { + error::wrap_c_api(dnnl_invalid_arguments, + "could not compile an unsupported partition"); + } + + return compile_(inputs, outputs, e); + } + + /// Returns the supporting status of a partition. Some operations may not be + /// supported by the library under certain circumstances. During + /// partitioning stage, unsupported partitions will be returned to users + /// with each containing an unsupported operation. Users should check the + /// supporting status of a partition before transforming the computation + /// graph or compiling the partition. + /// + /// @returns @c true if this partition is supported or @c false if this + /// partition isn't supported by the library + bool is_supported() const { + uint8_t supported {0}; + error::wrap_c_api(dnnl_graph_partition_is_supported(get(), &supported), + "could not get supporting status of the partition"); + return supported != 0; + } + + /// Returns a list of input logical tensors from the partition. + /// + /// @returns A list of input logical tensors. + std::vector get_input_ports() const { + size_t num = 0; + error::wrap_c_api(dnnl_graph_partition_get_input_ports_num(get(), &num), + "could not get number of inputs of the partition"); + if (num == 0) return {}; + + std::vector c_inputs(num); + error::wrap_c_api(dnnl_graph_partition_get_input_ports( + get(), num, c_inputs.data()), + "could not get input logical tensors of the partition"); + + std::vector inputs; + inputs.reserve(num); + for (auto &c_lt : c_inputs) + inputs.emplace_back(c_lt); + return inputs; + } + + /// Returns a list of output logical tensors from the partition. + /// + /// @returns A list of output logical tensor. 
+ std::vector get_output_ports() const { + size_t num = 0; + error::wrap_c_api( + dnnl_graph_partition_get_output_ports_num(get(), &num), + "cannot get number of outputs of the partition"); + if (num == 0) return {}; + + std::vector c_outputs(num); + error::wrap_c_api(dnnl_graph_partition_get_output_ports( + get(), num, c_outputs.data()), + "could not get output logical tensors of the partition"); + + std::vector outputs; + outputs.reserve(num); + for (auto &c_lt : c_outputs) + outputs.emplace_back(c_lt); + return outputs; + } + + /// Returns the engine kind of the partition + /// + /// @returns The engine kind + engine::kind get_engine_kind() const { + dnnl_engine_kind_t akind; + error::wrap_c_api(dnnl_graph_partition_get_engine_kind(get(), &akind), + "cannot get the engine kind from the partition"); + + return static_cast(akind); + } + +private: + compiled_partition compile_(const std::vector &inputs, + const std::vector &outputs, const engine &e) const { + std::vector c_inputs; + std::vector c_outputs; + + c_inputs.reserve(inputs.size()); + for (const auto &in : inputs) { + c_inputs.push_back(&(in.data)); + } + + c_outputs.reserve(outputs.size()); + for (const auto &out : outputs) { + c_outputs.push_back(&(out.data)); + } + + dnnl_graph_compiled_partition_t cpartitions = nullptr; + error::wrap_c_api( + dnnl_graph_compiled_partition_create(&cpartitions, get()), + "could not create compiled_partition"); + error::wrap_c_api(dnnl_graph_partition_compile(get(), cpartitions, + c_inputs.size(), c_inputs.data(), + c_outputs.size(), c_outputs.data(), e.get()), + "partition compile failed"); + + return compiled_partition(cpartitions); + } +}; + +/// @} dnnl_graph_api_partition + +/// @addtogroup dnnl_graph_api_graph Graph +/// +/// Graph represents a computational DAG with a set of operations. +/// #dnnl::graph::graph::add_op() adds an operation and its input and output +/// logical tensors into a graph. 
The library accumulates the operations and +/// logical tensors and constructs and validates the graph as an internal state. +/// A graph object is associated to a specific engine kind. The partitions +/// returned from the graph will inherit the engine kind of the graph. +/// +/// @{ + +/// A graph object. +class graph : public graph_handle { +public: + /// Constructs a graph with an engine kind. + /// + /// @param engine_kind Engine kind. + graph(engine::kind engine_kind) { + dnnl_graph_graph_t g = nullptr; + error::wrap_c_api( + dnnl_graph_graph_create(&g, convert_to_c(engine_kind)), + "could not create graph with engine kind"); + reset(g); + } + + /// Creates a new empty graph with an engine kind and a floating-point math + /// mode. All partitions returned from the graph will inherit the engine + /// kind and floating-point math mode. + /// + /// Setting the floating-point math mode enables automatic down-conversion + /// of inputs for the given graph, promoting speedup by using + /// lower-precision data types when available. + /// + /// @param engine_kind Engine kind. + /// @param mode Floating-point math mode. + graph(engine::kind engine_kind, fpmath_mode mode) { + dnnl_graph_graph_t g = nullptr; + error::wrap_c_api( + dnnl_graph_graph_create_with_fpmath_mode( + &g, convert_to_c(engine_kind), convert_to_c(mode)), + "could not create graph with engine kind and math mode"); + reset(g); + } + + /// Set the floating point math mode for a graph. Users can enforce the + /// graph to comply with the mode by specifying a boolean flag with the + /// setter function. + /// + /// @param mode The floating-point math mode. + /// @param apply_to_int The flag that controls whether to use + /// floating-point arithmetic for integral operations. 
+ void set_fpmath_mode(fpmath_mode mode, bool apply_to_int = false) { + error::wrap_c_api(dnnl_graph_graph_set_fpmath_mode( + get(), convert_to_c(mode), apply_to_int), + "could not set fpmath mode graph attribute"); + } + + /// Get the floating point math mode and the boolean flag that specifies + /// whether the graph will be enforced to comply the mode. + /// + /// @param mode The floating-point math mode. + /// @param apply_to_int The flag that controls whether to use + /// floating-point arithmetic for integral operations. + void get_fpmath_mode(fpmath_mode &mode, bool &apply_to_int) const { + dnnl_fpmath_mode_t c_mode; + int c_apply_to_int; + + error::wrap_c_api(dnnl_graph_graph_get_fpmath_mode( + get(), &c_mode, &c_apply_to_int), + "could not get fpmath mode graph attribute"); + + mode = fpmath_mode(c_mode); + apply_to_int = static_cast(c_apply_to_int); + } + + /// Adds an op into the graph to construct a computational DAG. The API will + /// return failure if the operator has already been added to the graph or + /// the operation cannot pass the schema check in the library (eg. input and + /// output numbers and data types, the attributes of the operation, etc.). + /// + /// @param op An operation to be added. + /// @param allow_exception A flag indicating whether the method is allowed + /// to throw an exception if it fails to add the op to the graph. + /// @returns #status::success or a status describing the error otherwise. + status add_op(const op &op, bool allow_exception = true) { + dnnl_status_t ret = dnnl_graph_add_op(get(), op.get()); + + if (allow_exception) { + error::wrap_c_api(ret, "could not add op to the graph"); + } + + return static_cast(ret); + } + + /// Finalizes a graph. It means users have finished adding operations into + /// the graph and the graph is ready for partitioning. Adding a new + /// operation into a finalized graph will return failures. Similarly, + /// partitioning on a un-finalized graph will also return failures. 
+ void finalize() { + error::wrap_c_api(dnnl_graph_graph_finalize(get()), + "could not finalize the graph"); + } + + /// Checks if a graph is finalized. + /// + /// @return True if the graph is finalized or false if the graph is not + /// finalized. + bool is_finalized() const { + uint8_t ret = 0; + error::wrap_c_api(dnnl_graph_graph_is_finalized(get(), &ret), + "could not get the finalization status of the graph"); + + return ret != 0; + } + + /// Gets filtered partitions from a graph. Partitions will be claimed + /// internally according to the capability of the library, the engine kind + /// of the graph, and the policy. + /// + /// @param policy Partition policy, defaults to policy + /// #dnnl::graph::partition::policy::fusion. + /// @return A vector storing the partitions. + std::vector get_partitions( + partition::policy policy = partition::policy::fusion) { + if (!is_finalized()) { + error::wrap_c_api( + dnnl_invalid_graph, "the graph is not finalized yet"); + } + + error::wrap_c_api( + dnnl_graph_graph_filter(get(), + static_cast(policy)), + "could not filter the graph"); + + size_t num = 0; + error::wrap_c_api(dnnl_graph_graph_get_partition_num(get(), &num), + "could not get number of partitions from the graph"); + + // return early if there is no partitions in the graph. 
+ if (num == 0) return {}; + + std::vector out_list; + out_list.reserve(num); + + std::vector partitions(num); + error::wrap_c_api( + dnnl_graph_graph_get_partitions(get(), num, partitions.data()), + "could not get partitions from the graph"); + + for (auto p : partitions) { + out_list.emplace_back(p); + } + + return out_list; + } + +private: + static dnnl_fpmath_mode_t convert_to_c(fpmath_mode mode) { + return static_cast(mode); + } + + static dnnl_engine_kind_t convert_to_c(engine::kind akind) { + return static_cast(akind); + } +}; + +/// @} dnnl_graph_api_graph + +/// @addtogroup dnnl_graph_api_compiled_partition_cache Compiled Partition Cache +/// +/// A set of functions that provide compiled partition cache control. +/// +/// @{ + +/// Returns the number of compiled partition that can be held in the compiled +/// partition cache at the same time. +inline int get_compiled_partition_cache_capacity() { + int result = 0; + error::wrap_c_api(dnnl_graph_get_compiled_partition_cache_capacity(&result), + "could not get compiled partition cache capacity"); + return result; +} + +/// @copydoc dnnl_graph_set_compiled_partition_cache_capacity(int capacity) +inline void set_compiled_partition_cache_capacity(int capacity) { + error::wrap_c_api( + dnnl_graph_set_compiled_partition_cache_capacity(capacity), + "could not set compiled partition cache capacity"); +} + +/// @} dnnl_graph_api_compiled_partition_cache + +/// @addtogroup dnnl_graph_api_constant_tensor_cache Constant Tensor Cache +/// +/// A set of functions that provide constant tensor cache control +/// +/// @{ + +/// Control the enabling or disabling of constant tensor cache. This API must be +/// called once before compilation stage. By default, constant tensor cache is +/// disabled in the library. +/// @note This API is deprecated and will be removed in future release, please +/// use the set_constant_tensor_cache_capacity API to disable +/// constant tensor cache by setting it's capacity to zero. 
+/// +/// @param flag Set to positive value to enable the cache and set to 0 to +/// disable the cache. Negative values are invalid. +inline void set_constant_tensor_cache(int flag) { + error::wrap_c_api(dnnl_graph_set_constant_tensor_cache(flag), + "fail to set constant tensor cache"); +} + +/// Return the enabling status of constant tensor cache. +/// @note This API is deprecated and will be removed in future release, please +/// use the get_constant_tensor_cache_capacity API to check the +/// enabling status by checking it's capacity. +inline int get_constant_tensor_cache() { + int result = 0; + error::wrap_c_api(dnnl_graph_get_constant_tensor_cache(&result), + "fail to get constant tensor cache"); + return result; +} + +/// Control the capacity for the constant tensor cache that used for specific +/// engine kind. This API is thread safe and can be called multiple times at +/// runtime. The capacity is set to zero by default which means the cache is +/// disabled. When calling this API, the corresponding cache will be flushed. +/// Setting capacity to 0 means to clear all cached tensors and disable cache. +/// Once the capacity limit is reached, no new tensors will be cached. If there +/// are multiple devices for an engine kind, the capacity set here is for each +/// device. +/// +/// @param kind The engine kind that the constant tensor cache used for. +/// @param size The constant tensor cache capacity size to set. +inline void set_constant_tensor_cache_capacity(engine::kind kind, size_t size) { + error::wrap_c_api(dnnl_graph_set_constant_tensor_cache_capacity( + static_cast(kind), size), + "fail to set constant tensor cache capacity"); +} + +/// Return the current capacity of constant tensor cache. +/// +/// @param kind The engine kind that the constant tensor cache used for. 
+inline size_t get_constant_tensor_cache_capacity(engine::kind kind) { + size_t size = 0; + error::wrap_c_api(dnnl_graph_get_constant_tensor_cache_capacity( + static_cast(kind), &size), + "fail to get constant tensor cache capacity"); + return size; +} + +/// @} dnnl_graph_api_constant_tensor_cache + +} // namespace graph + +/// @} dnnl_graph_api + +} // namespace dnnl + +/// @cond DO_NOT_DOCUMENT_THIS + +/// oneAPI namespace +// Contains the oneapi::dnnl namespace as an alias to the ::dnnl namespace. +namespace oneapi { +// Note: without this guard, doxygen warns of potentially recursive namespace +#ifndef DOXYGEN_SHOULD_SKIP_THIS +/// oneDNN alias namespace +namespace dnnl = ::dnnl; +#endif +} // namespace oneapi + +/// @endcond + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.h new file mode 100644 index 0000000000000000000000000000000000000000..f33c90c5e5e130c982d3f2fad00559734261c15a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.h @@ -0,0 +1,154 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_OCL_H +#define ONEAPI_DNNL_DNNL_GRAPH_OCL_H + +#include "oneapi/dnnl/dnnl_graph.h" + +/// @cond DO_NOT_DOCUMENT_THIS +// Set target version for OpenCL explicitly to suppress a compiler warning. +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 120 +#endif + +#include +/// @endcond + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_graph_api +/// @{ + +/// @addtogroup dnnl_graph_api_interop +/// @{ + +/// @addtogroup dnnl_graph_api_ocl_interop +/// @{ + +/// Allocation call-back function interface for OpenCL. OpenCL allocator should +/// be used for OpenCL GPU runtime. The call-back should return a USM device +/// memory pointer. +/// +/// @param size Memory size in bytes for requested allocation +/// @param alignment The minimum alignment in bytes for the requested allocation +/// @param device A valid OpenCL device used to allocate +/// @param context A valid OpenCL context used to allocate +/// @returns The memory address of the requested USM allocation. +typedef void *(*dnnl_graph_ocl_allocate_f)( + size_t size, size_t alignment, cl_device_id device, cl_context context); + +/// Deallocation call-back function interface for OpenCL. OpenCL allocator +/// should be used for OpenCL runtime. The call-back should deallocate a USM +/// device memory returned by #dnnl_graph_ocl_allocate_f. The event should be +/// completed before deallocate the USM. 
+/// +/// @param buf The USM allocation to be released +/// @param device A valid OpenCL device the USM associated with +/// @param context A valid OpenCL context used to free the USM allocation +/// @param event A event which the USM deallocation depends on +typedef void (*dnnl_graph_ocl_deallocate_f)( + void *buf, cl_device_id device, cl_context context, cl_event event); + +/// Creates an allocator with the given allocation and deallocation call-back +/// function pointers. +/// +/// @param allocator Output allocator +/// @param ocl_malloc A pointer to OpenCL malloc function +/// @param ocl_free A pointer to OpenCL free function +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_graph_ocl_interop_allocator_create( + dnnl_graph_allocator_t *allocator, dnnl_graph_ocl_allocate_f ocl_malloc, + dnnl_graph_ocl_deallocate_f ocl_free); + +/// This API is a supplement for existing oneDNN engine API: +/// dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create( +/// dnnl_engine_t *engine, cl_device_id device, cl_context context); +/// +/// @param engine Output engine. +/// @param device Underlying OpenCL device to use for the engine. +/// @param context Underlying OpenCL context to use for the engine. +/// @param alloc Underlying allocator to use for the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_ocl_interop_make_engine_with_allocator( + dnnl_engine_t *engine, cl_device_id device, cl_context context, + const_dnnl_graph_allocator_t alloc); + +/// This API is a supplement for existing oneDNN engine API: +/// dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob( +/// dnnl_engine_t *engine, cl_device_id device, cl_context context, +/// size_t size, const uint8_t *cache_blob); +/// +/// @param engine Output engine. +/// @param device The OpenCL device that this engine will encapsulate. 
+/// @param context The OpenCL context (containing the device) that this +/// engine will use for all operations. +/// @param alloc Underlying allocator to use for the engine. +/// @param size Size of the cache blob in bytes. +/// @param cache_blob Cache blob of size @p size. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API +dnnl_graph_ocl_interop_make_engine_from_cache_blob_with_allocator( + dnnl_engine_t *engine, cl_device_id device, cl_context context, + const_dnnl_graph_allocator_t alloc, size_t size, + const uint8_t *cache_blob); + +/// Execute a compiled partition with OpenCL runtime. +/// +/// @param compiled_partition The handle of target compiled_partition. +/// @param stream The stream used for execution +/// @param num_inputs The number of input tensors +/// @param inputs A list of input tensors +/// @param num_outputs The number of output tensors +/// @param outputs A non-empty list of output tensors +/// @param deps Optional handle of list with `cl_event` dependencies. +/// @param ndeps Number of dependencies. +/// @param return_event The handle of cl_event. +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_graph_ocl_interop_compiled_partition_execute( + const_dnnl_graph_compiled_partition_t compiled_partition, + dnnl_stream_t stream, size_t num_inputs, + const_dnnl_graph_tensor_t *inputs, size_t num_outputs, + const_dnnl_graph_tensor_t *outputs, const cl_event *deps, int ndeps, + cl_event *return_event); + +/// @} dnnl_graph_api_ocl_interop + +/// @} dnnl_graph_api_interop + +/// @} dnnl_graph_api + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..be50bd934d46d4cd1ef8178e4435ad405b862675 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.hpp @@ -0,0 +1,161 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// Graph OpenCL interop API + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_OCL_HPP +#define ONEAPI_DNNL_DNNL_GRAPH_OCL_HPP + +/// @cond DO_NOT_DOCUMENT_THIS +#include + +#include + +#include "oneapi/dnnl/dnnl_graph.hpp" +#include "oneapi/dnnl/dnnl_graph_ocl.h" +#include "oneapi/dnnl/dnnl_ocl.hpp" +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_graph_api +/// @{ + +namespace graph { + +/// @addtogroup dnnl_graph_api_interop Runtime interoperability API +/// API extensions to interact with the underlying run-time. 
+/// @{ + +/// @addtogroup dnnl_graph_api_ocl_interop OpenCL interoperability API +/// API extensions to interact with the underlying OpenCL run-time. +/// @{ + +/// OpenCL interoperability namespace +namespace ocl_interop { + +/// Constructs an allocator from OpenCL malloc and free function pointer. OpenCL +/// allocator should be used for OpenCL GPU runtime. Currently, only device USM +/// allocator is supported. +/// +/// @param ocl_malloc The pointer to OpenCL malloc function +/// @param ocl_free The pointer to OpenCL free function +/// @returns Created allocator +inline allocator make_allocator(dnnl_graph_ocl_allocate_f ocl_malloc, + dnnl_graph_ocl_deallocate_f ocl_free) { + dnnl_graph_allocator_t c_allocator = nullptr; + error::wrap_c_api(dnnl_graph_ocl_interop_allocator_create( + &c_allocator, ocl_malloc, ocl_free), + "could not create allocator for opencl device"); + return allocator(c_allocator); +} + +/// Constructs an engine from an OpenCL device, an OpenCL context, and an +/// allocator. +/// +/// @param device A valid OpenCL device to construct the engine +/// @param context A valid OpenCL context to construct the engine +/// @param alloc An allocator to associate with the engine +/// @returns Created engine +inline engine make_engine_with_allocator( + cl_device_id device, cl_context context, const allocator &alloc) { + dnnl_engine_t c_engine; + error::wrap_c_api(dnnl_graph_ocl_interop_make_engine_with_allocator( + &c_engine, device, context, alloc.get()), + "could not make an engine with allocator"); + return engine(c_engine); +} + +/// Constructs an engine from an OpenCL device, an OpenCL context, an +/// allocator, and a serialized engine cache blob. 
+/// +/// @param device A valid OpenCL device to construct the engine +/// @param context A valid OpenCL context to construct the engine +/// @param alloc An allocator to associate with the engine +/// @param cache_blob Cache blob serialized beforehand +/// @returns Created engine +inline engine make_engine_with_allocator(cl_device_id device, + cl_context context, const allocator &alloc, + const std::vector &cache_blob) { + dnnl_engine_t c_engine; + error::wrap_c_api( + dnnl_graph_ocl_interop_make_engine_from_cache_blob_with_allocator( + &c_engine, device, context, alloc.get(), cache_blob.size(), + cache_blob.data()), + "could not make an engine with allocator from cache blob"); + return engine(c_engine); +} + +/// Executes a compiled partition in a specified stream and returns a OpenCL +/// event. +/// +/// @param c_partition Compiled partition to execute. +/// @param astream Stream object to run over +/// @param inputs Arguments map. +/// @param outputs Arguments map. +/// @param deps Optional vector with `cl_event` dependencies. +/// @returns Output event. +inline cl_event execute(compiled_partition &c_partition, stream &astream, + const std::vector &inputs, std::vector &outputs, + const std::vector &deps = {}) { + std::vector c_inputs; + c_inputs.reserve(inputs.size()); + for (auto &in : inputs) { + c_inputs.push_back(in.get()); + } + std::vector c_outputs; + c_outputs.reserve(outputs.size()); + for (auto &out : outputs) { + c_outputs.push_back(out.get()); + } + + const cl_event *c_deps = deps.empty() ? 
nullptr : deps.data(); + + cl_event ocl_event; + error::wrap_c_api( + dnnl_graph_ocl_interop_compiled_partition_execute(c_partition.get(), + astream.get(), c_inputs.size(), c_inputs.data(), + c_outputs.size(), c_outputs.data(), c_deps, + (int)deps.size(), &ocl_event), + "could not execute the compiled_partition on a specified opencl " + "stream"); + return ocl_event; +} + +} // namespace ocl_interop + +/// @} dnnl_graph_api_ocl_interop + +/// @} dnnl_graph_api_interop + +} // namespace graph + +/// @} dnnl_graph_api + +} // namespace dnnl + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h new file mode 100644 index 0000000000000000000000000000000000000000..eaf1380b364a74260f65707ba3082b872a2dce80 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h @@ -0,0 +1,104 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_H +#define ONEAPI_DNNL_DNNL_GRAPH_SYCL_H + +#include "oneapi/dnnl/dnnl_graph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_graph_api +/// @{ + +/// @addtogroup dnnl_graph_api_interop +/// @{ + +/// @addtogroup dnnl_graph_api_sycl_interop +/// @{ + +/// Allocation call-back function interface for SYCL. SYCL allocator should be +/// used for SYCL runtime and host allocator should be used for non-SYCL. The +/// call-back should return a USM device memory pointer. +typedef void *(*dnnl_graph_sycl_allocate_f)( + size_t size, size_t alignment, const void *dev, const void *context); + +/// Deallocation call-back function interface for SYCL. SYCL allocator should be +/// used for SYCL runtime and host allocator should be used for non-SYCL. The +/// call-back should deallocate a USM device memory returned by +/// #dnnl_graph_sycl_allocate_f. +typedef void (*dnnl_graph_sycl_deallocate_f)( + void *buf, const void *dev, const void *context, void *event); + +/// Creates an allocator with the given allocation and deallocation call-back +/// function pointers. +/// +/// @param allocator Output allocator +/// @param sycl_malloc A pointer to SYCL malloc function +/// @param sycl_free A pointer to SYCL free function +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_graph_sycl_interop_allocator_create( + dnnl_graph_allocator_t *allocator, + dnnl_graph_sycl_allocate_f sycl_malloc, + dnnl_graph_sycl_deallocate_f sycl_free); + +/// This API is a supplement for existing onednn engine API. +dnnl_status_t DNNL_API dnnl_graph_sycl_interop_make_engine_with_allocator( + dnnl_engine_t *engine, const void *device, const void *context, + const_dnnl_graph_allocator_t alloc); + +/// Execute a compiled partition with sycl runtime. 
+/// +/// @param compiled_partition The handle of target compiled_partition. +/// @param stream The stream used for execution +/// @param num_inputs The number of input tensors +/// @param inputs A list of input tensors +/// @param num_outputs The number of output tensors +/// @param outputs A non-empty list of output tensors +/// @param deps Optional handle of list with `sycl::event` dependencies. +/// @param sycl_event The handle of sycl event. +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_graph_sycl_interop_compiled_partition_execute( + const_dnnl_graph_compiled_partition_t compiled_partition, + dnnl_stream_t stream, size_t num_inputs, + const_dnnl_graph_tensor_t *inputs, size_t num_outputs, + const_dnnl_graph_tensor_t *outputs, const void *deps, void *sycl_event); + +/// @} dnnl_graph_api_sycl_interop + +/// @} dnnl_graph_api_interop + +/// @} dnnl_graph_api + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2bfa589ed0f739429711eadd36418391d18f2033 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp @@ -0,0 +1,136 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// Graph SYCL interop API + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP +#define ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP + +/// @cond DO_NOT_DOCUMENT_THIS +#include + +#if __has_include() +#include +#else +#error "Unsupported compiler" +#endif + +#include "oneapi/dnnl/dnnl_graph.hpp" +#include "oneapi/dnnl/dnnl_graph_sycl.h" +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_graph_api +/// @{ + +namespace graph { + +/// @addtogroup dnnl_graph_api_interop Runtime interoperability API +/// API extensions to interact with the underlying run-time. 
+/// @{ + +/// @addtogroup dnnl_graph_api_sycl_interop SYCL interoperability API +/// API extensions to interact with the underlying SYCL run-time. +/// @{ + +/// SYCL interoperability namespace +namespace sycl_interop { + +/// Constructs an allocator from SYCL malloc and free function pointer. SYCL +/// allocator should be used for SYCL runtime and host allocator should be used +/// for non-SYCL. Currently, only device USM allocator is supported. +/// +/// @param sycl_malloc The pointer to SYCL malloc function +/// @param sycl_free The pointer to SYCL free function +/// @returns Created allocator +inline allocator make_allocator(dnnl_graph_sycl_allocate_f sycl_malloc, + dnnl_graph_sycl_deallocate_f sycl_free) { + dnnl_graph_allocator_t c_allocator = nullptr; + error::wrap_c_api(dnnl_graph_sycl_interop_allocator_create( + &c_allocator, sycl_malloc, sycl_free), + "could not create allocator for sycl device"); + return allocator(c_allocator); +} + +inline engine make_engine_with_allocator(const sycl::device &adevice, + const sycl::context &acontext, const allocator &alloc) { + dnnl_engine_t c_engine; + error::wrap_c_api( + dnnl_graph_sycl_interop_make_engine_with_allocator(&c_engine, + static_cast(&adevice), + static_cast(&acontext), alloc.get()), + "could not make an engine with allocator"); + return engine(c_engine); +} + +/// Executes a compiled partition in a specified stream and returns a SYCL +/// event. +/// +/// @param c_partition Compiled partition to execute. +/// @param astream Stream object to run over +/// @param inputs Arguments map. +/// @param outputs Arguments map. +/// @param deps Optional vector with `sycl::event` dependencies. +/// @returns Output event. 
+inline sycl::event execute(compiled_partition &c_partition, stream &astream, + const std::vector &inputs, std::vector &outputs, + const std::vector &deps = {}) { + std::vector c_inputs; + c_inputs.reserve(inputs.size()); + for (auto &in : inputs) { + c_inputs.push_back(in.get()); + } + std::vector c_outputs; + c_outputs.reserve(outputs.size()); + for (auto &out : outputs) { + c_outputs.push_back(out.get()); + } + + sycl::event sycl_event; + error::wrap_c_api(dnnl_graph_sycl_interop_compiled_partition_execute( + c_partition.get(), astream.get(), c_inputs.size(), + c_inputs.data(), c_outputs.size(), + c_outputs.data(), &deps, &sycl_event), + "could not execute the compiled_partition on a specified sycl " + "stream"); + return sycl_event; +} + +} // namespace sycl_interop + +/// @} dnnl_graph_api_sycl_interop + +/// @} dnnl_graph_api_interop + +} // namespace graph + +/// @} dnnl_graph_api + +} // namespace dnnl + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h new file mode 100644 index 0000000000000000000000000000000000000000..421b3db10427d3536a4031f363e8af3b7d358269 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h @@ -0,0 +1,480 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* + * Copyright 2020-2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +/// @file +/// C API definitions + +#ifndef ONEAPI_DNNL_DNNL_GRAPH_TYPES_H +#define ONEAPI_DNNL_DNNL_GRAPH_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include + +#include "oneapi/dnnl/dnnl_common_types.h" +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_graph_api +/// @{ + +/// @addtogroup dnnl_graph_api_logical_tensor +/// @{ + +/// A wildcard value for number of dimensions which is unknown at a tensor or +/// operation creation time. +#define DNNL_GRAPH_UNKNOWN_NDIMS -1 + +/// A wildcard value for dimensions that are unknown at a tensor or operation +/// creation time. +#define DNNL_GRAPH_UNKNOWN_DIM INT64_MIN + +/// Layout type specification +typedef enum { + /// Undefined layout type + dnnl_graph_layout_type_undef = 0, + /// Any means to let the library to decide the layout for a tensor during + /// partition compilation. + dnnl_graph_layout_type_any = 1, + /// Strided means that the layout of a tensor is determined by the strides + /// field in the logical tensor. + dnnl_graph_layout_type_strided = 2, + /// Opaque means that the layout of a tensor is the library specific. + /// Usually, an opaque layout is generated by a partition which is compiled + /// with layout type any. 
+ dnnl_graph_layout_type_opaque = 3, +} dnnl_graph_layout_type_t; + +/// Logical tensor property +typedef enum { + /// Undefined tensor property + dnnl_graph_tensor_property_undef = 0, + /// Variable means the tensor may be changed during computation or between + /// different iterations. + dnnl_graph_tensor_property_variable = 1, + /// Constant means the tensor will keep unchanged during computation and + /// between different iterations. It's useful for the library to apply + /// optimizations for constant tensors or cache constant tensors inside the + /// library. For example, constant weight tensors in inference scenarios. + dnnl_graph_tensor_property_constant = 2, +} dnnl_graph_tensor_property_t; + +/// Logical tensor. It is based on an ID, a number of dimensions, dimensions +/// themselves, element data type, tensor property and tensor memory layout. +typedef struct { + /// Unique id of each logical tensor. The library uses logical tensor IDs to + /// build up the connections between operations if the output of one + /// operation has the same ID as the input of another operation. + size_t id; + + /// Number of dimensions. -1 means unknown (DNNL_GRAPH_UNKNOWN_NDIMS). 0 is + /// used to define scalar tensor. + int ndims; + + /// Size of each dimension. #DNNL_GRAPH_UNKNOWN_DIM means the size of that + /// dimension is unknown. 0 is used to define zero-dimension tensor. The + /// library supports to deduce output shapes according to input shapes + /// during compilation. Unlike memory descriptor in oneDNN primitive API, + /// the order of dimensions is not defined in logical tensor. It is defined + /// by the operations which respect the order through the attributes + /// #dnnl_graph_op_attr_data_format or #dnnl_graph_op_attr_weights_format. 
+ /// For example, for a Convolution with `data_format=NXC`, it means the + /// first element of dims of activation tensor is mini-batch size, the last + /// effective element of dims is channel size, and other elements between + /// them are spatial dimensions. + dnnl_dims_t dims; + + /// Data type of the tensor elements. + dnnl_data_type_t data_type; + + /// Property type of the tensor. + dnnl_graph_tensor_property_t property; + + /// Layout type of the tensor. + dnnl_graph_layout_type_t layout_type; + union { + /// The field is valid when `layout_type` is + /// #dnnl_graph_layout_type_strided. #DNNL_GRAPH_UNKNOWN_DIM means the + /// stride of the dimension is unknown. The library currently doesn't + /// support other negative stride values. + dnnl_dims_t strides; + + /// The field is valid when `layout_type` is + /// #dnnl_graph_layout_type_opaque. An opaque layout ID is usually + /// generated by a partition which is compiled with layout type any. + size_t layout_id; + } layout; +} dnnl_graph_logical_tensor_t; + +/// @} dnnl_graph_api_logical_tensor + +/// @addtogroup dnnl_graph_api_partition +/// @{ + +/// Policy specifications for partitioning +typedef enum { + /// Fusion policy returns partitions with typical post-op fusions, eg. + /// Convolution + ReLU or other element-wise operations or a chian of + /// post-ops. + dnnl_graph_partition_policy_fusion = 1, + /// Debug policy doesn't not apply any fusions. It returns partitions with + /// single operation in each partition. The policy is useful when users + /// notice any bug or correctness issue in fusion policy. + dnnl_graph_partition_policy_debug = 2, +} dnnl_graph_partition_policy_t; + +/// An opaque structure to describe a partition. +struct dnnl_graph_partition; + +/// A partition handle. +typedef struct dnnl_graph_partition *dnnl_graph_partition_t; + +/// A constant partition handle. 
+typedef const struct dnnl_graph_partition *const_dnnl_graph_partition_t; + +/// @} dnnl_graph_api_partition + +/// @addtogroup dnnl_graph_api_graph +/// @{ + +/// An opaque structure to describe a graph. +struct dnnl_graph_graph; + +/// A graph handle. +typedef struct dnnl_graph_graph *dnnl_graph_graph_t; + +/// A constant graph handle. +typedef const struct dnnl_graph_graph *const_dnnl_graph_graph_t; + +/// @} dnnl_graph_api_graph + +/// @addtogroup dnnl_graph_api_op +/// @{ + +/// Kinds of operations +typedef enum { + dnnl_graph_op_abs, + dnnl_graph_op_abs_backward, + dnnl_graph_op_add, + dnnl_graph_op_avg_pool, + dnnl_graph_op_avg_pool_backward, + dnnl_graph_op_batch_norm_backward, + dnnl_graph_op_batch_norm_forward_training, + dnnl_graph_op_batch_norm_inference, + dnnl_graph_op_bias_add, + dnnl_graph_op_bias_add_backward, + dnnl_graph_op_clamp, + dnnl_graph_op_clamp_backward, + dnnl_graph_op_concat, + dnnl_graph_op_convolution, + dnnl_graph_op_convolution_backward_data, + dnnl_graph_op_convolution_backward_weights, + dnnl_graph_op_conv_transpose, + dnnl_graph_op_conv_transpose_backward_data, + dnnl_graph_op_conv_transpose_backward_weights, + dnnl_graph_op_dequantize, + dnnl_graph_op_divide, + dnnl_graph_op_dynamic_dequantize, + dnnl_graph_op_dynamic_quantize, + dnnl_graph_op_elu, + dnnl_graph_op_elu_backward, + dnnl_graph_op_end, + dnnl_graph_op_exp, + dnnl_graph_op_gelu, + dnnl_graph_op_gelu_backward, + dnnl_graph_op_hard_swish, + dnnl_graph_op_hard_swish_backward, + dnnl_graph_op_interpolate, + dnnl_graph_op_interpolate_backward, + dnnl_graph_op_layer_norm, + dnnl_graph_op_layer_norm_backward, + dnnl_graph_op_leaky_relu, + dnnl_graph_op_log, + dnnl_graph_op_log_softmax, + dnnl_graph_op_log_softmax_backward, + dnnl_graph_op_matmul, + dnnl_graph_op_maximum, + dnnl_graph_op_max_pool, + dnnl_graph_op_max_pool_backward, + dnnl_graph_op_minimum, + dnnl_graph_op_mish, + dnnl_graph_op_mish_backward, + dnnl_graph_op_multiply, + dnnl_graph_op_prelu, + 
dnnl_graph_op_prelu_backward, + dnnl_graph_op_quantize, + dnnl_graph_op_reciprocal, + dnnl_graph_op_reduce_l1, + dnnl_graph_op_reduce_l2, + dnnl_graph_op_reduce_max, + dnnl_graph_op_reduce_mean, + dnnl_graph_op_reduce_min, + dnnl_graph_op_reduce_prod, + dnnl_graph_op_reduce_sum, + dnnl_graph_op_relu, + dnnl_graph_op_relu_backward, + dnnl_graph_op_reorder, + dnnl_graph_op_round, + dnnl_graph_op_sigmoid, + dnnl_graph_op_sigmoid_backward, + dnnl_graph_op_softmax, + dnnl_graph_op_softmax_backward, + dnnl_graph_op_softplus, + dnnl_graph_op_softplus_backward, + dnnl_graph_op_sqrt, + dnnl_graph_op_sqrt_backward, + dnnl_graph_op_square, + dnnl_graph_op_squared_difference, + dnnl_graph_op_static_reshape, + dnnl_graph_op_static_transpose, + dnnl_graph_op_subtract, + dnnl_graph_op_tanh, + dnnl_graph_op_tanh_backward, + dnnl_graph_op_type_cast, + dnnl_graph_op_wildcard, + dnnl_graph_op_hard_sigmoid, + dnnl_graph_op_hard_sigmoid_backward, + dnnl_graph_op_select, + dnnl_graph_op_pow, + dnnl_graph_op_group_norm, + dnnl_graph_op_gen_index, + dnnl_graph_op_greater_equal, + dnnl_graph_op_last_symbol, +} dnnl_graph_op_kind_t; + +/// Attributes of operations +typedef enum { + /// Undefined op attribute. + dnnl_graph_op_attr_undef = 0, + + // float32 attributes. The value of these attributes can be any single + // float32 number. + + /// Specifies an alpha attribute to an op. + dnnl_graph_op_attr_alpha = 0x1, + /// Specifies an beta attribute to an op. + dnnl_graph_op_attr_beta, + /// Specifies an epsilon attribute to an op. + dnnl_graph_op_attr_epsilon, + /// Specifies a max attribute to an op. + dnnl_graph_op_attr_max, + ///Specifies a min attribute to an op. + dnnl_graph_op_attr_min, + /// Specifies a momentum attribute to an op. + dnnl_graph_op_attr_momentum, + + // float32 vector attributes. The value of these attributes can be a vector + // of float32 numbers. + + /// Specifies a scales attribute to an op. + dnnl_graph_op_attr_scales = 0x20, + + // int64_t attributes. 
The value of these attributes can be any single int64 + // number. + + /// Specifies an axis attribute to an op. + dnnl_graph_op_attr_axis = 0x30, + /// Specifies a begin_norm_axis attribute to an op. + dnnl_graph_op_attr_begin_norm_axis, + /// Specifies a groups attribute to an op. + dnnl_graph_op_attr_groups, + + // int64_t vector attributes. The value of these attributes can be a vector + // of int64 numbers. + + /// Specifies an axes attribute to an op. + dnnl_graph_op_attr_axes = 0x40, + /// Specifies a dilations attribute to an op. + dnnl_graph_op_attr_dilations, + /// Specifies an dst_shape attribute to an op. + dnnl_graph_op_attr_dst_shape, + /// Specifies a kernel attribute to an op. + dnnl_graph_op_attr_kernel, + /// Specifies an order attribute to an op. + dnnl_graph_op_attr_order, + /// Specifies an output_padding attribute to an op. + dnnl_graph_op_attr_output_padding, + /// Specifies a pads_begin attribute to an op. + dnnl_graph_op_attr_pads_begin, + /// Specifies a pads_end attribute to an op. + dnnl_graph_op_attr_pads_end, + /// Specifies a shape attribute to an op. + dnnl_graph_op_attr_shape, + /// Specifies a sizes attribute to an op. + dnnl_graph_op_attr_sizes, + /// Specifies a input_shape attribute to an op. + dnnl_graph_op_attr_src_shape, + /// Specifies a strides attribute to an op. + dnnl_graph_op_attr_strides, + /// Specifies a weight_shape attribute to an op. + dnnl_graph_op_attr_weights_shape, + /// Specifies a zps attribute to an op. + dnnl_graph_op_attr_zps, + /// Specifies a group shape attribute to an op. + dnnl_graph_op_attr_group_shape, + + // bool attributes. The value of these attributes can be any single bool + // value. + + /// Specifies an exclude_pad attribute to an op. + dnnl_graph_op_attr_exclude_pad = 0x60, + /// Specifies a keep_dims attribute to an op. + dnnl_graph_op_attr_keep_dims, + /// Specifies a keep_stats attribute to an op. 
+ dnnl_graph_op_attr_keep_stats, + /// Specifies a per_channel_broadcast attribute to an op. + dnnl_graph_op_attr_per_channel_broadcast, + /// Specifies a special_zero attribute to an op. + dnnl_graph_op_attr_special_zero, + /// Specifies a transpose_a attribute to an op. + dnnl_graph_op_attr_transpose_a, + /// Specifies a transpose_b attribute to an op. + dnnl_graph_op_attr_transpose_b, + /// Specifies an use_affine attribute to an op. + dnnl_graph_op_attr_use_affine, + /// Specifies an use_dst attribute to an op. + dnnl_graph_op_attr_use_dst, + + // string attributes. The value of these attributes can be a string. + + /// Specifies an auto_broadcast attribute to an op. The value can be "none" + /// or "numpy". + dnnl_graph_op_attr_auto_broadcast = 0x80, + /// Specifies an auto_pad attribute to an op. The value can be "none", + /// "same_upper", "same_lower", or "valid". + dnnl_graph_op_attr_auto_pad, + /// Specifies an coordinate_transformation_mode attribute to an op. The + /// value can be "half_pixel" or "align_corners". The attribute is defined + /// for Interpolate operations. + dnnl_graph_op_attr_coordinate_transformation_mode, + /// Specifies a data_format of an op. The value can be "NCX" or "NXC". + dnnl_graph_op_attr_data_format, + /// Specifies a mode attribute of an op. The value can be "nearest", + /// "linear", "bilinear", or "trilinear". The attribute is defined for + /// Interpolate operations. + dnnl_graph_op_attr_mode, + /// Specifies a qtype attribute to an op. The value can be "per_channel" or + /// "per_tensor". The attribute is defined for quantization operations. + dnnl_graph_op_attr_qtype, + /// Specifies a rounding_type attribute to an op. The value can be "ceil" or + /// "floor". + dnnl_graph_op_attr_rounding_type, + /// Specifies a weights_format of an op. The value can be "OIX", "XIO", + /// "IOX", or "XOI". Different operations may support different values. 
+ dnnl_graph_op_attr_weights_format, + + /// Specifies the end of all above exteral attributes for check. + dnnl_graph_op_attr_end = 0xFF, +} dnnl_graph_op_attr_t; + +/// An opaque structure to describe an operation. +struct dnnl_graph_op; + +/// An operation handle. +typedef struct dnnl_graph_op *dnnl_graph_op_t; + +/// A constant operation handle. +typedef const struct dnnl_graph_op *const_dnnl_graph_op_t; + +/// @} dnnl_graph_api_op + +/// @addtogroup dnnl_graph_api_allocator +/// @{ + +/// Allocation call-back function interface for host. For SYCL allocator, see +/// #dnnl_graph_sycl_allocate_f. +typedef void *(*dnnl_graph_host_allocate_f)(size_t size, size_t alignment); + +/// Deallocation call-back function interface for host. For SYCL allocator, see +/// #dnnl_graph_sycl_deallocate_f. +typedef void (*dnnl_graph_host_deallocate_f)(void *); + +/// An opaque structure to describe an allocator. +struct dnnl_graph_allocator; + +/// An allocator handle. +typedef struct dnnl_graph_allocator *dnnl_graph_allocator_t; + +/// A constant allocator handle. +typedef const struct dnnl_graph_allocator *const_dnnl_graph_allocator_t; + +/// @} dnnl_graph_api_allocator + +/// @addtogroup dnnl_graph_api_compiled_partition +/// @{ + +/// In-place pair definition. It can queried from a compiled partition +/// indicating that an input and an output of the partition can share the same +/// memory buffer for computation. In-place computation helps to reduce the +/// memory footprint and improves cache locality. But since the library may not +/// have a global view of user's application, it's possible that the tensor with +/// `input_id` is used at other places in user's computation graph. In this +/// case, the user should take the in-place pair as a hint and pass a different +/// memory buffer for output tensor to avoid overwriting the input memory buffer +/// which will probably cause unexpected incorrect results. 
+typedef struct { + /// The id of input tensor + size_t input_id; + + /// The id of output tensor + size_t output_id; +} dnnl_graph_inplace_pair_t; + +/// An opaque structure to describe a compiled partition. +struct dnnl_graph_compiled_partition; + +/// A compiled partition handle. +typedef struct dnnl_graph_compiled_partition *dnnl_graph_compiled_partition_t; + +/// A constant compiled partition handle. +typedef const struct dnnl_graph_compiled_partition + *const_dnnl_graph_compiled_partition_t; + +/// @} dnnl_graph_api_compiled_partition + +/// @addtogroup dnnl_graph_api_tensor +/// @{ + +/// An opaque structure to describe a tensor. +struct dnnl_graph_tensor; + +/// A tensor handle. +typedef struct dnnl_graph_tensor *dnnl_graph_tensor_t; + +/// A constant tensor handle. +typedef const struct dnnl_graph_tensor *const_dnnl_graph_tensor_t; + +/// @} dnnl_graph_api_tensor + +/// @} dnnl_graph_api + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h new file mode 100644 index 0000000000000000000000000000000000000000..a7e33c54c3d7bcdb98a0908a8a7ceade34a3d485 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h @@ -0,0 +1,281 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_OCL_H +#define ONEAPI_DNNL_DNNL_OCL_H + +#include "oneapi/dnnl/dnnl.h" + +#include "oneapi/dnnl/dnnl_ocl_types.h" + +/// @cond DO_NOT_DOCUMENT_THIS +// Set target version for OpenCL explicitly to suppress a compiler warning. +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 120 +#endif + +#include +/// @endcond + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_ocl_interop +/// @{ + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. In this +/// case, the buffer will be initialized as if: +/// - dnnl_memory_set_data_handle() has been called, if @p memory_kind is equal +/// to dnnl_ocl_interop_usm, or +/// - dnnl_ocl_interop_memory_set_mem_object() has been called, if @p memory_kind +/// is equal to dnnl_ocl_interop_buffer. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param memory_kind Memory allocation kind to specify the type of handle. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl_ocl_interop_usm. +/// - An OpenCL buffer. 
In this case the library doesn't own the buffer. +/// Requires @p memory_kind be equal to be equal to dnnl_ocl_interop_buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + dnnl_ocl_interop_memory_kind_t memory_kind, void *handle); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Creates a memory object with multiple handles. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param memory_kind Memory allocation kind to specify the type of handles. +/// @param nhandles Number of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl_ocl_interop_usm. +/// - An OpenCL buffer. In this case the library doesn't own the buffer. +/// Requires @p memory_kind be equal to be equal to dnnl_ocl_interop_buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + dnnl_ocl_interop_memory_kind_t memory_kind, int nhandles, + void **handles); +#endif + +/// Returns the memory allocation kind associated with a memory object. +/// +/// @param memory Memory to query. +/// @param memory_kind Output underlying memory allocation kind of the memory +/// object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_memory_kind( + const_dnnl_memory_t memory, + dnnl_ocl_interop_memory_kind_t *memory_kind); + +/// Returns an OpenCL memory object associated with a memory object. +/// +/// @param memory Memory object. +/// @param mem_object Output OpenCL memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_mem_object( + const_dnnl_memory_t memory, cl_mem *mem_object); + +/// Sets OpenCL memory object associated with a memory object. +/// +/// For behavioral details, see dnnl_memory_set_data_handle(). +/// +/// @param memory Memory object. +/// @param mem_object OpenCL memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_memory_set_mem_object( + dnnl_memory_t memory, cl_mem mem_object); + +/// Retrieves a cache blob ID for the OpenCL device. +/// +/// @warning +/// This API is intended to be used with +/// #dnnl_ocl_interop_engine_get_cache_blob() and +/// #dnnl_ocl_interop_engine_create_from_cache_blob(). The returned cache +/// blob ID can only be used as an ID of the cache blob returned by +/// #dnnl_ocl_interop_engine_get_cache_blob(). 
+/// +/// @note The cache blob ID can be empty (@p size will be 0 and +/// @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to +/// put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will +/// return an empty cache blob). +/// +/// @param device An OpenCL device. +/// @param size Size of the cache blob ID in bytes. +/// @param cache_blob_id Cache blob id of size @p size. If +/// the @p cache_blob_id is nullptr then the size of the cache blob ID is +/// returned in @p size. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob_id( + cl_device_id device, size_t *size, uint8_t *cache_blob_id); + +/// Retrieves a cache blob associated with the given engine. +/// +/// @note The cache blob can be empty (@p size will be 0 and @p cache_blob +/// will be nullptr) if oneDNN doesn't have anything to put in the cache +/// blob. It's the user's responsibility to check whether it's empty +/// prior to passing it to +/// #dnnl_ocl_interop_engine_create_from_cache_blob(). +/// +/// @param engine Engine to query for the cache blob. +/// @param size Size of the cache blob in bytes. +/// @param cache_blob Cache blob of size @p size. If the @p cache_blob is +/// nullptr then the size of the cache blob is returned in @p size. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob( + dnnl_engine_t engine, size_t *size, uint8_t *cache_blob); + +/// Creates an engine from the given cache blob. +/// +/// @param engine Output engine. +/// @param device The OpenCL device that this engine will encapsulate. +/// @param context The OpenCL context (containing the device) that this +/// engine will use for all operations. +/// @param size Size of the cache blob in bytes. +/// @param cache_blob Cache blob of size @p size. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob( + dnnl_engine_t *engine, cl_device_id device, cl_context context, + size_t size, const uint8_t *cache_blob); + +/// Creates an engine associated with an OpenCL device and an OpenCL context. +/// +/// @param engine Output engine. +/// @param device Underlying OpenCL device to use for the engine. +/// @param context Underlying OpenCL context to use for the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create( + dnnl_engine_t *engine, cl_device_id device, cl_context context); + +/// Returns the OpenCL context associated with an engine. +/// +/// @param engine Engine to query. +/// @param context Output underlying OpenCL context of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_context( + dnnl_engine_t engine, cl_context *context); + +/// Returns the OpenCL device associated with an engine. +/// +/// @param engine Engine to query. +/// @param device Output underlying OpenCL device of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_get_device( + dnnl_engine_t engine, cl_device_id *device); + +/// Creates an execution stream for a given engine associated with +/// an OpenCL command queue. +/// +/// @param stream Output execution stream. +/// @param engine Engine to create the execution stream on. +/// @param queue OpenCL command queue to use. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_create( + dnnl_stream_t *stream, dnnl_engine_t engine, cl_command_queue queue); + +/// Returns the OpenCL command queue associated with an execution stream. +/// +/// @param stream Execution stream to query. +/// @param queue Output OpenCL command queue. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_stream_get_command_queue( + dnnl_stream_t stream, cl_command_queue *queue); + +/// Executes computations specified by the primitive in a specified stream and +/// returns an OpenCL event. +/// +/// @param primitive Primitive to execute. +/// @param stream Stream to use. +/// @param nargs Number of arguments. +/// @param args Array of arguments. Each argument is an +/// pair. The index is one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see +/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory +/// descriptor as that returned by +/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index). +/// @param deps A pointer to a vector of size @p ndeps that contains +/// dependencies. +/// @param ndeps Number of dependencies. +/// @param return_event Output event. It's the user's responsibility to +/// manage lifetime of the event. Can be NULL. When @p stream is in-order +/// NULL will be returned. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ocl_interop_primitive_execute( + const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs, + const dnnl_exec_arg_t *args, const cl_event *deps, int ndeps, + cl_event *return_event); + +/// @} dnnl_api_ocl_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bc8258fe2420873476069ec78bdc13dc53ffef9c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp @@ -0,0 +1,450 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_OCL_HPP +#define ONEAPI_DNNL_DNNL_OCL_HPP + +#include "oneapi/dnnl/dnnl.hpp" + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include +#include +#include +#include +#include +#include + +#include "oneapi/dnnl/dnnl_ocl.h" + +#include +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_api_interop Runtime interoperability API +/// API extensions to interact with the underlying run-time. +/// @{ + +/// @addtogroup dnnl_api_ocl_interop OpenCL interoperability API +/// API extensions to interact with the underlying OpenCL run-time. 
+/// +/// @sa @ref dev_guide_opencl_interoperability in developer guide +/// @{ + +/// OpenCL interoperability namespace +namespace ocl_interop { + +/// Memory allocation kind. +enum class memory_kind { + /// USM (device, shared, host, or unknown) memory allocation kind. + usm = dnnl_ocl_interop_usm, + /// Buffer memory allocation kind - default. + buffer = dnnl_ocl_interop_buffer, +}; + +/// Converts a memory allocation kind enum value from C++ API to C API type. +/// +/// @param akind C++ API memory allocation kind enum value. +/// @returns Corresponding C API memory allocation kind enum value. +inline dnnl_ocl_interop_memory_kind_t convert_to_c(memory_kind akind) { + return static_cast(akind); +} + +/// Returns the cache blob ID of the OpenCL device. +/// +/// @warning +/// This API is intended to be used with +/// #dnnl::ocl_interop::get_engine_cache_blob() and +/// #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector &). +/// The returned cache blob ID can only be used as an ID of the cache blob +/// returned by #dnnl::ocl_interop::get_engine_cache_blob(). +/// +/// @note The cache blob ID can be empty (@p size will be 0 and +/// @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to +/// put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will +/// return an empty cache blob). +/// +/// @param device An OpenCL device. +/// @returns A vector containing the cache blob ID. +inline std::vector get_engine_cache_blob_id(cl_device_id device) { + size_t size = 0; + error::wrap_c_api( + dnnl_ocl_interop_engine_get_cache_blob_id(device, &size, nullptr), + "could not get an engine cache blob id size"); + + std::vector cache_blob_id(size); + error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob_id( + device, &size, cache_blob_id.data()), + "could not get an engine cache blob id"); + return cache_blob_id; +} + +/// Returns a cache blob for the engine. 
+/// +/// @note The cache blob vector can be empty if oneDNN doesn't have anything +/// to put in the cache blob. It's the user's responsibility to check +/// whether it's empty prior to passing it to +/// #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector &) +/// +/// @param aengine Engine to query for the cache blob. +/// @returns Vector containing the cache blob. +inline std::vector get_engine_cache_blob(const engine &aengine) { + size_t size = 0; + error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob( + aengine.get(), &size, nullptr), + "could not get an engine cache blob size"); + + std::vector cache_blob(size); + error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob( + aengine.get(), &size, cache_blob.data()), + "could not get an engine cache blob"); + return cache_blob; +} + +/// Constructs an engine from the given cache blob. +/// +/// @param device The OpenCL device that this engine will encapsulate. +/// @param context The OpenCL context (containing the device) that this +/// engine will use for all operations. +/// @param cache_blob Cache blob. +/// @returns An engine. +inline engine make_engine(cl_device_id device, cl_context context, + const std::vector &cache_blob) { + dnnl_engine_t c_engine; + error::wrap_c_api( + dnnl_ocl_interop_engine_create_from_cache_blob(&c_engine, device, + context, cache_blob.size(), cache_blob.data()), + "could not create an engine from cache blob"); + return engine(c_engine); +} + +/// Constructs an engine from OpenCL device and context objects. +/// +/// @param device The OpenCL device that this engine will encapsulate. +/// @param context The OpenCL context (containing the device) that this +/// engine will use for all operations. +/// @returns An engine. 
+inline engine make_engine(cl_device_id device, cl_context context) { + dnnl_engine_t c_engine; + error::wrap_c_api( + dnnl_ocl_interop_engine_create(&c_engine, device, context), + "could not create an engine"); + return engine(c_engine); +} + +/// Returns OpenCL context associated with the engine. +/// +/// @param aengine An engine. +/// @returns Underlying OpenCL context. +inline cl_context get_context(const engine &aengine) { + cl_context context = nullptr; + error::wrap_c_api( + dnnl_ocl_interop_engine_get_context(aengine.get(), &context), + "could not get an OpenCL context from an engine"); + return context; +} + +/// Returns OpenCL device associated with the engine. +/// +/// @param aengine An engine. +/// @returns Underlying OpenCL device. +inline cl_device_id get_device(const engine &aengine) { + cl_device_id device = nullptr; + error::wrap_c_api(dnnl_ocl_interop_get_device(aengine.get(), &device), + "could not get an OpenCL device from an engine"); + return device; +} + +/// Constructs an execution stream for the specified engine and OpenCL queue. +/// +/// @param aengine Engine to create the stream on. +/// @param queue OpenCL queue to use for the stream. +/// @returns An execution stream. +inline stream make_stream(const engine &aengine, cl_command_queue queue) { + dnnl_stream_t c_stream; + error::wrap_c_api( + dnnl_ocl_interop_stream_create(&c_stream, aengine.get(), queue), + "could not create a stream"); + return stream(c_stream); +} + +/// Returns OpenCL queue object associated with the execution stream. +/// +/// @param astream An execution stream. +/// @returns Underlying OpenCL queue. +inline cl_command_queue get_command_queue(const stream &astream) { + cl_command_queue queue = nullptr; + error::wrap_c_api( + dnnl_ocl_interop_stream_get_command_queue(astream.get(), &queue), + "could not get an OpenCL command queue from a stream"); + return queue; +} + +/// Returns the OpenCL memory object associated with the memory object. 
+/// +/// @param amemory A memory object. +/// @returns Underlying OpenCL memory object. +inline cl_mem get_mem_object(const memory &amemory) { + cl_mem mem_object; + error::wrap_c_api( + dnnl_ocl_interop_memory_get_mem_object(amemory.get(), &mem_object), + "could not get OpenCL buffer object from a memory object"); + return mem_object; +} + +/// Sets the OpenCL memory object associated with the memory object. +/// +/// For behavioral details see memory::set_data_handle(). +/// +/// @param amemory A memory object. +/// @param mem_object OpenCL cl_mem object to use as the underlying +/// storage. It must have at least get_desc().get_size() bytes +/// allocated. +inline void set_mem_object(memory &amemory, cl_mem mem_object) { + error::wrap_c_api( + dnnl_ocl_interop_memory_set_mem_object(amemory.get(), mem_object), + "could not set OpenCL buffer object from a memory object"); +} + +/// Returns the memory allocation kind associated with a memory object. +/// +/// @param amemory A memory object. +/// +/// @returns The underlying memory allocation kind of the memory object. +inline memory_kind get_memory_kind(const memory &amemory) { + dnnl_ocl_interop_memory_kind_t ckind; + error::wrap_c_api( + dnnl_ocl_interop_memory_get_memory_kind(amemory.get(), &ckind), + "could not get memory kind"); + return static_cast(ckind); +} + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Creates a memory object with multiple handles. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param kind Memory allocation kind to specify the type of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl_ocl_interop_usm. +/// - An OpenCL buffer. In this case the library doesn't own the buffer. 
+/// Requires @p memory_kind be equal to be equal to dnnl_ocl_interop_buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// If the @p handles vector is not provided the library will allocate all +/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE. +/// +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, memory_kind kind, + std::vector handles = {}) { + if (handles.empty()) { + const int nhandles = memory_desc.get_num_handles(); + handles.resize(nhandles, DNNL_MEMORY_ALLOCATE); + } + + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_ocl_interop_memory_create_v2(&c_memory, memory_desc.get(), + aengine.get(), convert_to_c(kind), (int)handles.size(), + handles.data()), + "could not create a memory"); + return memory(c_memory); +} + +/// Constructs a memory object with multiple OpenCL buffers. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param mem_objects A vector of OpenCL buffers to use. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, std::vector mem_objects) { + const int nhandles = memory_desc.get_num_handles(); + std::vector handles(nhandles, DNNL_MEMORY_NONE); + memory amemory(memory_desc, aengine, handles); + for (int i = 0; i < nhandles; i++) + amemory.set_data_handle(mem_objects[i], i); + return amemory; +} + +/// Creates a memory object. 
+/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. In this +/// case, the buffer will be initialized as if: +/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is +/// equal to dnnl::ocl_interop::memory_kind::usm, or +/// - dnnl::ocl_interop::set_mem_object() has been called, if @p memory_kind is +/// equal to dnnl::ocl_interop::memory_kind::buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param kind Memory allocation kind to specify the type of handle. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::ocl_interop::memory_kind::usm. +/// - An OpenCL buffer. In this case the library doesn't own the buffer. +/// Requires @p memory_kind be equal to be equal to +/// dnnl::ocl_interop::memory_kind::buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, memory_kind kind, void *handle) { + return make_memory( + memory_desc, aengine, kind, std::vector {handle}); +} + +/// Constructs a memory object from an OpenCL buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param mem_object An OpenCL buffer to use. +/// +/// @returns Created memory object. 
+inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, cl_mem mem_object) { + return make_memory(memory_desc, aengine, std::vector {mem_object}); +} +#else + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. In this +/// case, the buffer will be initialized as if: +/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is +/// equal to dnnl::ocl_interop::memory_kind::usm, or +/// - dnnl::ocl_interop::set_mem_object() has been called, if @p memory_kind is +/// equal to dnnl::ocl_interop::memory_kind::buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param kind Memory allocation kind to specify the type of handle. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::ocl_interop::memory_kind::usm. +/// - An OpenCL buffer. In this case the library doesn't own the buffer. +/// Requires @p memory_kind be equal to be equal to +/// dnnl::ocl_interop::memory_kind::buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// @returns Created memory object. 
+inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, memory_kind kind, + void *handle = DNNL_MEMORY_ALLOCATE) { + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_ocl_interop_memory_create(&c_memory, memory_desc.get(), + aengine.get(), convert_to_c(kind), handle), + "could not create a memory"); + return memory(c_memory); +} + +/// Constructs a memory object from an OpenCL buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param mem_object An OpenCL buffer to use. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, cl_mem mem_object) { + memory amemory(memory_desc, aengine, DNNL_MEMORY_NONE); + set_mem_object(amemory, mem_object); + return amemory; +} +#endif + +/// Executes computations specified by the primitive in a specified stream and +/// returns a SYCL event. +/// +/// Arguments are passed via an arguments map containing +/// pairs. The index must be one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor +/// matching the one returned by +/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using +/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL). +/// +/// @param aprimitive Primitive to execute. +/// @param astream Stream object. The stream must belong to the same engine +/// as the primitive. +/// @param args Arguments map. +/// @param deps Optional vector with `cl_event` dependencies. +/// +/// @returns Output event. It's the user's responsibility to manage lifetime +/// of the event. +inline cl_event execute(const dnnl::primitive &aprimitive, + const stream &astream, const std::unordered_map &args, + const std::vector &deps = {}) { + std::vector c_args; + c_args.reserve(args.size()); + for (const auto &a : args) + c_args.push_back({a.first, a.second.get()}); + + const cl_event *c_deps = deps.empty() ? 
nullptr : deps.data(); + + cl_event return_event; + error::wrap_c_api(dnnl_ocl_interop_primitive_execute(aprimitive.get(), + astream.get(), (int)c_args.size(), c_args.data(), + c_deps, (int)deps.size(), &return_event), + "could not execute a primitive"); + return return_event; +} + +} // namespace ocl_interop + +/// @} dnnl_api_ocl_interop + +/// @} dnnl_api_interop + +} // namespace dnnl + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h new file mode 100644 index 0000000000000000000000000000000000000000..4b8e3ab7be1897a19a4db44393e0cfdf4f9a36d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_OCL_TYPES_H +#define ONEAPI_DNNL_DNNL_OCL_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_ocl_interop +/// @{ + +/// Memory allocation kind. +typedef enum { + /// USM (device, shared, host, or unknown) memory allocation kind. + dnnl_ocl_interop_usm, + /// Buffer memory allocation kind - default. + dnnl_ocl_interop_buffer, +} dnnl_ocl_interop_memory_kind_t; + +/// @} dnnl_api_ocl_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h new file mode 100644 index 0000000000000000000000000000000000000000..443ff82bb93536da911a8c6dbb5be838c6f0e9ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h @@ -0,0 +1,204 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_SYCL_H +#define ONEAPI_DNNL_DNNL_SYCL_H + +#include "oneapi/dnnl/dnnl.h" + +#include "oneapi/dnnl/dnnl_sycl_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_sycl_interop +/// @{ + +/// Creates an engine associated with a SYCL device and a SYCL context. +/// +/// @param engine Output engine. +/// @param device Pointer to the SYCL device to use for the engine. +/// @param context Pointer to the SYCL context to use for the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_engine_create( + dnnl_engine_t *engine, const void *device, const void *context); + +/// Returns the SYCL context associated with an engine. +/// +/// @param engine Engine to query. +/// @param context Pointer to the underlying SYCL context of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_engine_get_context( + dnnl_engine_t engine, void **context); + +/// Returns the SYCL device associated with an engine. +/// +/// @param engine Engine to query. +/// @param device Pointer to the underlying SYCL device of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_engine_get_device( + dnnl_engine_t engine, void **device); + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. 
In this +/// case, the buffer will be initialized as if: +/// - dnnl_memory_set_data_handle() had been called, if @p memory_kind is equal +/// to dnnl_sycl_interop_usm, or +/// - dnnl_sycl_interop_memory_set_buffer() has been called, if @p memory_kind +/// is equal to dnnl_sycl_interop_buffer. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param memory_kind Memory allocation kind to specify the type of handle. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl_sycl_interop_usm. +/// - A pointer to SYCL buffer. In this case the library doesn't own the +/// buffer. Requires @p memory_kind be equal to be equal to +/// dnnl_sycl_interop_buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + dnnl_sycl_interop_memory_kind_t memory_kind, void *handle); + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Creates a memory object with multiple handles. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param memory_kind Memory allocation kind to specify the type of handles. +/// @param nhandles Number of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. 
+/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl_sycl_interop_usm. +/// - A pointer to SYCL buffer. In this case the library doesn't own the +/// buffer. Requires @p memory_kind be equal to be equal to +/// dnnl_sycl_interop_buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + dnnl_sycl_interop_memory_kind_t memory_kind, int nhandles, + void **handles); +#endif + +/// Returns the memory allocation kind associated with a memory object. +/// +/// @param memory Memory to query. +/// @param memory_kind Output underlying memory allocation kind of the memory +/// object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_memory_get_memory_kind( + const_dnnl_memory_t memory, + dnnl_sycl_interop_memory_kind_t *memory_kind); + +/// Sets a SYCL buffer for a memory object. +/// +/// @param memory Memory object. +/// @param buffer SYCL buffer to be set in the memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_memory_set_buffer( + dnnl_memory_t memory, void *buffer); + +/// Creates an execution stream for a given engine associated with a SYCL +/// queue. 
+/// +/// @param stream Output execution stream. +/// @param engine Engine to create the execution stream on. +/// @param queue SYCL queue to use. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_stream_create( + dnnl_stream_t *stream, dnnl_engine_t engine, void *queue); + +/// Returns the SYCL queue associated with an execution stream. +/// +/// @param stream Execution stream to query. +/// @param queue Output SYCL command queue. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_stream_get_queue( + dnnl_stream_t stream, void **queue); + +/// Executes computations specified by the primitive in a specified stream and +/// returns a SYCL event. +/// +/// @param primitive Primitive to execute. +/// @param stream Stream to use. +/// @param nargs Number of arguments. +/// @param args Array of arguments. Each argument is an +/// pair. The index is one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see +/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory +/// descriptor as that returned by +/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index). +/// @param deps A pointer to std::vector that contains +/// dependencies. +/// @param return_event Output event. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_sycl_interop_primitive_execute( + const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs, + const dnnl_exec_arg_t *args, const void *deps, void *return_event); + +/// @} dnnl_api_sycl_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ab95e4f0ea13eaecb6daa934178d887a4207998c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp @@ -0,0 +1,389 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_SYCL_HPP +#define ONEAPI_DNNL_DNNL_SYCL_HPP + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include +#include +#include +#include +#include +#include + +#if __has_include() +#include +#else +#error "Unsupported compiler" +#endif + +#include "oneapi/dnnl/dnnl.hpp" +#include "oneapi/dnnl/dnnl_sycl.h" + +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_sycl_interop SYCL interoperability API +/// API extensions to interact with the underlying SYCL run-time. 
+/// +/// @sa @ref dev_guide_dpcpp_interoperability in developer guide +/// @{ + +/// SYCL interoperability namespace +namespace sycl_interop { + +/// Memory allocation kind. +enum class memory_kind { + /// USM (device, shared, host, or unknown) memory allocation kind - default. + usm = dnnl_sycl_interop_usm, + /// Buffer memory allocation kind. + buffer = dnnl_sycl_interop_buffer, +}; + +/// Converts a memory allocation kind enum value from C++ API to C API type. +/// +/// @param akind C++ API memory allocation kind enum value. +/// @returns Corresponding C API memory allocation kind enum value. +inline dnnl_sycl_interop_memory_kind_t convert_to_c(memory_kind akind) { + return static_cast(akind); +} + +/// Constructs an engine from SYCL device and context objects. +/// +/// @param adevice SYCL device. +/// @param acontext SYCL context. +/// +/// @returns Created engine. +inline engine make_engine( + const sycl::device &adevice, const sycl::context &acontext) { + dnnl_engine_t aengine; + error::wrap_c_api(dnnl_sycl_interop_engine_create(&aengine, + static_cast(&adevice), + static_cast(&acontext)), + "could not create an engine"); + return engine(aengine); +} + +/// Returns the SYCL context associated with an engine. +/// +/// @param aengine Engine to query. +/// +/// @returns The underlying SYCL device of the engine. +inline sycl::context get_context(const engine &aengine) { + void *ctx_ptr; + error::wrap_c_api( + dnnl_sycl_interop_engine_get_context(aengine.get(), &ctx_ptr), + "could not get a context handle"); + auto ctx = *static_cast(ctx_ptr); + return ctx; +} + +/// Returns the SYCL device associated with an engine. +/// +/// @param aengine Engine to query. +/// +/// @returns The underlying SYCL context of the engine. 
+inline sycl::device get_device(const engine &aengine) { + void *dev_ptr; + error::wrap_c_api( + dnnl_sycl_interop_engine_get_device(aengine.get(), &dev_ptr), + "could not get a device handle"); + auto dev = *static_cast(dev_ptr); + return dev; +} + +/// Creates an execution stream for a given engine associated with a SYCL +/// queue. +/// +/// @param aengine Engine object to use for the stream. +/// @param aqueue SYCL queue to use for the stream. +/// +/// @returns An execution stream. +inline stream make_stream(const engine &aengine, sycl::queue &aqueue) { + dnnl_stream_t astream; + error::wrap_c_api( + dnnl_sycl_interop_stream_create(&astream, aengine.get(), &aqueue), + "could not create a stream"); + return stream(astream); +} + +/// Returns the SYCL queue associated with an execution stream. +/// +/// @param astream Execution stream to query. +/// +/// @returns SYCL queue object. +inline sycl::queue get_queue(const stream &astream) { + void *queue_ptr; + error::wrap_c_api( + dnnl_sycl_interop_stream_get_queue(astream.get(), &queue_ptr), + "could not get a stream handle"); + auto queue = *static_cast(queue_ptr); + return queue; +} + +/// Returns the SYCL buffer associated with a memory object. +/// +/// Throws an exception if the memory allocation kind associated with the +/// memory object is not equal to dnnl::sycl_interop::memory_kind::buffer. +/// +/// @tparam T Type of the requested buffer. +/// @tparam ndims Number of dimensions of the requested buffer. +/// @param amemory Memory object. +/// +/// @returns SYCL buffer associated with the memory object. +template +sycl::buffer get_buffer(const memory &amemory) { + static_assert(ndims == 1, "only 1D buffers supported"); + + // XXX: workaround: when CPU runtime is not SYCL and amemory was created + // for CPU engine `get_buffer` should return an error. Use interop API to + // implement the check. 
+ dnnl_sycl_interop_memory_kind_t ckind; + error::wrap_c_api( + dnnl_sycl_interop_memory_get_memory_kind(amemory.get(), &ckind), + "could not get SYCL buffer object"); + + void *handle_ptr; + error::wrap_c_api(dnnl_memory_get_data_handle(amemory.get(), &handle_ptr), + "could not get SYCL buffer object"); + + // XXX: workaround: zero-range buffer cannot be constructed. + if (!handle_ptr) return sycl::buffer(sycl::range<1>(1)); + + auto &buf_u8 = *static_cast *>(handle_ptr); + + auto range = sycl::range<1>(buf_u8.byte_size() / sizeof(T)); + return buf_u8.reinterpret(range); +} + +/// Sets SYCL buffer associated with a memory object. +/// +/// @tparam T Type of the buffer. +/// @tparam ndims Number of dimensions of the buffer. +/// @param amemory Memory object to change. +/// @param abuffer SYCL buffer. +template +void set_buffer(memory &amemory, sycl::buffer &abuffer) { + auto range = sycl::range<1>(abuffer.byte_size()); + auto buf_u8 = abuffer.template reinterpret(range); + error::wrap_c_api(dnnl_sycl_interop_memory_set_buffer( + amemory.get(), static_cast(&buf_u8)), + "could not set SYCL buffer object"); +} + +/// Returns the memory allocation kind associated with a memory object. +/// +/// @param amemory A memory object. +/// +/// @returns The underlying memory allocation kind of the memory object. +inline memory_kind get_memory_kind(const memory &amemory) { + dnnl_sycl_interop_memory_kind_t ckind; + error::wrap_c_api( + dnnl_sycl_interop_memory_get_memory_kind(amemory.get(), &ckind), + "could not get memory kind"); + return static_cast(ckind); +} + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Creates a memory object with multiple handles. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param kind Memory allocation kind to specify the type of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. 
+/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::sycl_interop::memory_kind::usm. +/// - A pointer to SYCL buffer. In this case the library doesn't own the +/// buffer. Requires @p memory_kind be equal to be equal to +/// dnnl::sycl_interop::memory_kind::buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// If the @p handles vector is not provided the library will allocate all +/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, memory_kind kind, + std::vector handles = {}) { + if (handles.empty()) { + const int nhandles = memory_desc.get_num_handles(); + handles.resize(nhandles, DNNL_MEMORY_ALLOCATE); + } + + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_sycl_interop_memory_create_v2(&c_memory, memory_desc.get(), + aengine.get(), convert_to_c(kind), (int)handles.size(), + handles.data()), + "could not create a memory"); + return memory(c_memory); +} + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. 
In this +/// case, the buffer will be initialized as if: +/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is +/// equal to dnnl::sycl_interop::memory_kind::usm, or +/// - dnnl::sycl_interop::set_buffer() has been called, if @p memory_kind is +/// equal to dnnl::sycl_interop::memory_kind::buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param kind Memory allocation kind to specify the type of handle. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::sycl_interop::memory_kind::usm. +/// - A pointer to SYCL buffer. In this case the library doesn't own the +/// buffer. Requires @p memory_kind be equal to be equal to +/// dnnl::sycl_interop::memory_kind::buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, memory_kind kind, void *handle) { + return make_memory( + memory_desc, aengine, kind, std::vector {handle}); +} +#else + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. 
In this +/// case, the buffer will be initialized as if: +/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is +/// equal to dnnl::sycl_interop::memory_kind::usm, or +/// - dnnl::sycl_interop::set_buffer() has been called, if @p memory_kind is +/// equal to dnnl::sycl_interop::memory_kind::buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param kind Memory allocation kind to specify the type of handle. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::sycl_interop::memory_kind::usm. +/// - A pointer to SYCL buffer. In this case the library doesn't own the +/// buffer. Requires @p memory_kind be equal to be equal to +/// dnnl::sycl_interop::memory_kind::buffer. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer that corresponds to the memory allocation kind +/// @p memory_kind for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, memory_kind kind, + void *handle = DNNL_MEMORY_ALLOCATE) { + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_sycl_interop_memory_create(&c_memory, memory_desc.get(), + aengine.get(), convert_to_c(kind), handle), + "could not create a memory"); + return memory(c_memory); +} +#endif + +/// Constructs a memory object from a SYCL buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param abuffer A SYCL buffer to use. +/// +/// @returns Created memory object. 
+template +memory make_memory(const memory::desc &memory_desc, const engine &aengine, + sycl::buffer &abuffer) { + memory amemory(memory_desc, aengine, DNNL_MEMORY_NONE); + set_buffer(amemory, abuffer); + return amemory; +} + +/// Executes computations specified by the primitive in a specified stream and +/// returns a SYCL event. +/// +/// Arguments are passed via an arguments map containing +/// pairs. The index must be one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor +/// matching the one returned by +/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using +/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL). +/// +/// @param aprimitive Primitive to execute. +/// @param astream Stream object. The stream must belong to the same engine +/// as the primitive. +/// @param args Arguments map. +/// @param deps Optional vector with `sycl::event` dependencies. +/// +/// @returns Output event. +inline sycl::event execute(const dnnl::primitive &aprimitive, + const stream &astream, const std::unordered_map &args, + const std::vector &deps = {}) { + std::vector c_args; + c_args.reserve(args.size()); + for (const auto &a : args) + c_args.push_back({a.first, a.second.get()}); + + sycl::event return_event; + error::wrap_c_api( + dnnl_sycl_interop_primitive_execute(aprimitive.get(), astream.get(), + (int)c_args.size(), c_args.data(), &deps, &return_event), + "could not execute a primitive"); + return return_event; +} + +} // namespace sycl_interop + +/// @} dnnl_api_sycl_interop + +/// @} dnnl_api_interop + +} // namespace dnnl + +/// @} dnnl_api + +#endif // DNNL_SYCL_HPP + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h new file mode 100644 index 0000000000000000000000000000000000000000..d137c666508351ef0a9aad39f38cd6f439b74c83 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_SYCL_TYPES_H +#define ONEAPI_DNNL_DNNL_SYCL_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_sycl_interop +/// @{ + +/// Memory allocation kind. +typedef enum { + /// USM (device, shared, host, or unknown) memory allocation kind - default. + dnnl_sycl_interop_usm, + /// Buffer memory allocation kind. 
+ dnnl_sycl_interop_buffer, +} dnnl_sycl_interop_memory_kind_t; + +/// @} dnnl_api_sycl_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..586624e53a5775535136fefd83ef81feb0f6e527 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h @@ -0,0 +1,123 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2022 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_H +#define ONEAPI_DNNL_DNNL_THREADPOOL_H + +#include "oneapi/dnnl/dnnl_config.h" +#include "oneapi/dnnl/dnnl_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_threadpool_interop +/// @{ + +/// Creates an execution stream with specified threadpool. +/// +/// @sa @ref dev_guide_threadpool +/// +/// @param stream Output execution stream. +/// @param engine Engine to create the execution stream on. +/// @param threadpool Pointer to an instance of a C++ class that implements +/// dnnl::threapdool_iface interface. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_create( + dnnl_stream_t *stream, dnnl_engine_t engine, void *threadpool); + +/// Returns a threadpool to be used by the execution stream. +/// +/// @sa @ref dev_guide_threadpool +/// +/// @param astream Execution stream. +/// @param threadpool Output pointer to an instance of a C++ class that +/// implements dnnl::threapdool_iface interface. Set to NULL if the +/// stream was created without threadpool. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_get_threadpool( + dnnl_stream_t astream, void **threadpool); + +/// Sets the maximum concurrency assumed by oneDNN when outside a +/// parallel call. +/// +/// @param max_concurrency The maximum concurrency assumed by oneDNN +/// when outside a parallel call. This is a threadlocal setting. +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. 
+dnnl_status_t DNNL_API dnnl_threadpool_interop_set_max_concurrency( + int max_concurrency); + +/// Gets the maximum concurrency assumed by oneDNN when outside a +/// parallel call. +/// +/// @param max_concurrency The maximum concurrency assumed by oneDNN +/// when outside a parallel call. This is a threadlocal setting. +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_threadpool_interop_get_max_concurrency( + int *max_concurrency); + +/// @copydoc dnnl_sgemm() +/// @param threadpool A pointer to a threadpool interface (only when built with +/// the THREADPOOL CPU runtime). +dnnl_status_t DNNL_API dnnl_threadpool_interop_sgemm(char transa, char transb, + dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const float *A, + dnnl_dim_t lda, const float *B, dnnl_dim_t ldb, float beta, float *C, + dnnl_dim_t ldc, void *threadpool); + +/// @copydoc dnnl_gemm_u8s8s32() +/// @param threadpool A pointer to a threadpool interface (only when built with +/// the THREADPOOL CPU runtime). +dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_u8s8s32(char transa, + char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, + float alpha, const uint8_t *A, dnnl_dim_t lda, uint8_t ao, + const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C, + dnnl_dim_t ldc, const int32_t *co, void *threadpool); + +/// @copydoc dnnl_gemm_s8s8s32() +/// @param threadpool A pointer to a threadpool interface (only when built with +/// the THREADPOOL CPU runtime). 
+dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_s8s8s32(char transa, + char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, + float alpha, const int8_t *A, dnnl_dim_t lda, int8_t ao, + const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C, + dnnl_dim_t ldc, const int32_t *co, void *threadpool); + +/// @} dnnl_api_threadpool_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0f483868c6cc55d48131881dcc46d1a9c5e104cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp @@ -0,0 +1,118 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_HPP +#define ONEAPI_DNNL_DNNL_THREADPOOL_HPP + +#include "oneapi/dnnl/dnnl.hpp" +#include "oneapi/dnnl/dnnl_threadpool.h" + +#include "oneapi/dnnl/dnnl_threadpool_iface.hpp" + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_threadpool_interop Threadpool interoperability API +/// API extensions to interact with the underlying Threadpool run-time. +/// @{ + +/// Threadpool interoperability namespace +namespace threadpool_interop { + +/// Constructs an execution stream for the specified engine and threadpool. +/// +/// @sa @ref dev_guide_threadpool +/// +/// @param aengine Engine to create the stream on. +/// @param threadpool Pointer to an instance of a C++ class that implements +/// dnnl::threapdool_iface interface. +/// @returns An execution stream. +inline dnnl::stream make_stream( + const dnnl::engine &aengine, threadpool_iface *threadpool) { + dnnl_stream_t c_stream; + dnnl::error::wrap_c_api(dnnl_threadpool_interop_stream_create( + &c_stream, aengine.get(), threadpool), + "could not create stream"); + return dnnl::stream(c_stream); +} + +/// Returns the pointer to a threadpool that is used by an execution stream. +/// +/// @sa @ref dev_guide_threadpool +/// +/// @param astream An execution stream. +/// @returns Output pointer to an instance of a C++ class that implements +/// dnnl::threapdool_iface interface or NULL if the stream was created +/// without threadpool. 
+inline threadpool_iface *get_threadpool(const dnnl::stream &astream) { + void *tp; + dnnl::error::wrap_c_api( + dnnl_threadpool_interop_stream_get_threadpool(astream.get(), &tp), + "could not get stream threadpool"); + return static_cast(tp); +} + +/// @copydoc dnnl_threadpool_interop_sgemm() +inline status sgemm(char transa, char transb, dnnl_dim_t M, dnnl_dim_t N, + dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda, + const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc, + threadpool_iface *threadpool) { + return static_cast(dnnl_threadpool_interop_sgemm(transa, transb, M, + N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool)); +} +/// @copydoc dnnl_threadpool_interop_gemm_u8s8s32() +inline status gemm_u8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A, + dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo, + float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co, + threadpool_iface *threadpool) { + return static_cast(dnnl_threadpool_interop_gemm_u8s8s32(transa, + transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, beta, C, + ldc, co, threadpool)); +} + +/// @copydoc dnnl_threadpool_interop_gemm_s8s8s32() +inline status gemm_s8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A, + dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo, + float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co, + threadpool_iface *threadpool) { + return static_cast(dnnl_threadpool_interop_gemm_s8s8s32(transa, + transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, beta, C, + ldc, co, threadpool)); +} + +} // namespace threadpool_interop + +/// @} dnnl_api_threadpool_interop + +/// @} dnnl_api_interop + +} // namespace dnnl + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp new file mode 100644 index 0000000000000000000000000000000000000000..124b8cfa34ee87e2b090bbdc57085e0541cfc3ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2020-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP +#define ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP + +#include +#include + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_threadpool_interop +/// @{ + +namespace threadpool_interop { + +/// Abstract threadpool interface. The users are expected to subclass this +/// interface and pass an object to the library during CPU stream creation or +/// directly in case of BLAS functions. +struct threadpool_iface { + /// Returns the number of worker threads. 
+ virtual int get_num_threads() const = 0; + + /// Returns true if the calling thread belongs to this threadpool. + virtual bool get_in_parallel() const = 0; + + /// Submits n instances of a closure for execution in parallel: + /// + /// for (int i = 0; i < n; i++) fn(i, n); + /// + virtual void parallel_for(int n, const std::function &fn) + = 0; + + /// Returns threadpool behavior flags bit mask (see below). + virtual uint64_t get_flags() const = 0; + + /// If set, parallel_for() returns immediately and oneDNN needs implement + /// waiting for the submitted closures to finish execution on its own. + static constexpr uint64_t ASYNCHRONOUS = 1; + + virtual ~threadpool_iface() {} +}; + +} // namespace threadpool_interop + +/// @} dnnl_api_threadpool_interop + +/// @} dnnl_api_interop + +} // namespace dnnl + +/// @} dnnl_api + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_types.h new file mode 100644 index 0000000000000000000000000000000000000000..7809085bf53ae01821921d9917c3b6abaf9ea243 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_types.h @@ -0,0 +1,2941 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2016-2025 Intel Corporation +* Copyright 2024 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// C API types definitions + +#ifndef ONEAPI_DNNL_DNNL_TYPES_H +#define ONEAPI_DNNL_DNNL_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include +/// @endcond + +#include "oneapi/dnnl/dnnl_config.h" + +#include "oneapi/dnnl/dnnl_common_types.h" + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_memory +/// @{ + +/// Memory format kind +typedef enum { + /// Undefined memory format kind, used for empty memory descriptors. + dnnl_format_kind_undef = 0, + /// A special format kind that indicates that the actual format will be + /// selected by a primitive automatically. + dnnl_format_kind_any, + /// A tensor in a generic format described by the stride and blocking + /// values in each dimension. + dnnl_blocked, + /// A special format kind that indicates that tensor format is opaque. + dnnl_format_kind_opaque, +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Format kind for sparse tensors. + dnnl_format_kind_sparse, +#endif + /// Parameter to allow internal only format kinds without undefined + /// behavior. This parameter is chosen to be valid for so long as + /// sizeof(int) >= 2. + dnnl_format_kind_max = 0x7fff, +} dnnl_format_kind_t; + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Sparse encodings. +typedef enum { + /// Undefined sparse encoding kind, used for empty memory descriptors. + dnnl_sparse_encoding_undef = 0, + /// Compressed Sparse Row (CSR) encoding. 
+ dnnl_csr, + /// An encoding that is used for an opaque storage schema for + /// tensors with unstructured sparsity. A memory descriptor with the + /// packed encoding cannot be used to create a memory object. It can + /// only be used to create a primitive descriptor to query the + /// actual memory descriptor (similar to the format tag `any`). + dnnl_packed, + /// Coordinate Sparse Encoding (COO). + dnnl_coo, +} dnnl_sparse_encoding_t; +#endif + +#ifdef DNNL_EXPERIMENTAL_PROFILING +/// Profiling data kind. +typedef enum { + /// Undefined profiling data kind. + dnnl_profiling_data_kind_undef = 0, + /// Data kind to query an execution time in nanoseconds. + dnnl_profiling_data_kind_time, +} dnnl_profiling_data_kind_t; + +#endif + +/// Memory format tag specification. +/// +/// oneDNN formats describe physical data layout. The physical layout +/// is described as a sequence of the dimensions as they are laid out in the +/// memory (from the outer-most to the inner-most). Note that this order +/// doesn't affect the logical order of the dimensions that is kept in the +/// `dims` field of the dnnl_memory_desc_t structure. The logical order of the +/// dimensions is specified by the primitive that uses the tensor. +/// +/// For example, CNN 5D tensor always has its logical dimensions in the order +/// `(batch, channels, depth, height, width)`, while the physical layout might be +/// `NCDHW` (corresponds to #dnnl_ncdhw format tag) or +/// `NDHWC` (corresponds to #dnnl_ndhwc format tag). 
+/// +/// ~~~cpp +/// int batch = 2, channels = 16, depth = 13, height = 13, width = 13; +/// +/// int ndims = 5; // 5D tensor +/// dnnl_dims_t dims = {batch, channels, depth, height, width}; +/// dnnl_memory_desc_t data_in_ncdhw; +/// dnnl_memory_desc_create_with_tag( +/// &data_in_ncdhw, 5, dims, dnnl_f32, dnnl_ncdhw); +/// +/// // note that in both cases dims passed are the same +/// dnnl_memory_desc_t data_in_ndhwc; +/// dnnl_memory_desc_create_with_tag( +/// &data_in_ndhwc, 5, dims, dnnl_f32, dnnl_ndhwc); +/// +/// dnnl_memory_desc_destroy(data_in_ncdhw); +/// dnnl_memory_desc_destroy(data_in_ndhwc); +/// ~~~ +/// +/// Memory format tags can be further divided into two categories: +/// - Domain-agnostic names, i.e. names the do not depend on the tensor usage +/// in the specific primitive. These names use letters from `a` to `l` to +/// denote logical dimension from 1 to 12, and form the order in which the +/// dimensions are laid in memory. For instance, #dnnl_ab is used to denote +/// 2D tensor where the second logical dimension (aka `b`) is the innermost, +/// i.e. has stride = 1, and the first logical dimension (`a`) laid out in +/// memory with stride equal to the size of second dimension. On the other +/// hand, #dnnl_ba is just transposed version of the same tensor: the +/// first dimension (`a`) becomes the innermost one. +/// - Domain-specific names, i.e. names that make sense only in the context of +/// a certain domain, such as CNN. This names are just aliases to the +/// corresponding domain-agnostic tags and used mostly for the convenience. +/// For example, #dnnl_nc is used to denote 2D CNN activations tensor +/// memory format, where channels are the innermost dimension and batch is an +/// outermost one. Moreover, #dnnl_nc is just an alias to #dnnl_ab, +/// since for oneDNN CNN primitives the logical dimensions of +/// activations tensors come in order: batch, channels, spatial. 
+/// In other words, batch corresponds to the first logical dimension (`a`), +/// channels correspond to the second one (`b`). +/// +/// The following domain-specific notation applies to memory format tags: +/// - @c 'n' denotes the mini-batch dimension +/// - @c 'c' denotes a channels dimension +/// - When there are multiple channel dimensions (for example, in convolution +/// weights tensor), @c 'i' and @c 'o' denote dimensions of input and output +/// channels +/// - @c 'd', @c 'h', and @c 'w' denote spatial depth, height, and width +/// respectively +/// +/// Upper-case letters indicate that the data is laid out in blocks for a +/// particular dimension. In such cases, the format name contains both upper- +/// and lower-case letters for that dimension with a lower-case letter preceded +/// by the block size. For example: #dnnl_nChw8c describes a format where the +/// outermost dimension is mini-batch, followed by the channel block number, +/// followed by the spatial height and width, and finally followed by 8-element +/// channel blocks. +/// +/// @sa @ref dev_guide_understanding_memory_formats +typedef enum { + /// Undefined memory format tag + dnnl_format_tag_undef = 0, + /// Undefined memory format tag. + /// The primitive selects a format automatically. + dnnl_format_tag_any, + + // Semantic agnostic section + // The physical order of dimensions is defined by the permutation of the + // characters, assuming that ab..z defines the natural order. 
+ + // Plain formats + + dnnl_a, ///< plain 1D tensor + dnnl_ab, ///< plain 2D tensor + dnnl_abc, ///< plain 3D tensor + dnnl_abcd, ///< plain 4D tensor + dnnl_abcde, ///< plain 5D tensor + dnnl_abcdef, ///< plain 6D tensor + dnnl_abcdefg, ///< plain 7D tensor + dnnl_abcdefgh, ///< plain 8D tensor + dnnl_abcdefghi, ///< plain 9D tensor + dnnl_abcdefghij, ///< plain 10D tensor + dnnl_abcdefghijk, ///< plain 11D tensor + dnnl_abcdefghijkl, ///< plain 12D tensor + + // Permuted plain formats + + dnnl_ba, ///< permuted 2D tensor + dnnl_acb, ///< permuted 3D tensor + dnnl_bac, ///< permuted 3D tensor + dnnl_bca, ///< permuted 3D tensor + dnnl_cab, ///< permuted 3D tensor + dnnl_cba, ///< permuted 3D tensor + dnnl_abdc, ///< permuted 4D tensor + dnnl_acbd, ///< permuted 4D tensor + dnnl_acdb, ///< permuted 4D tensor + dnnl_adbc, ///< permuted 4D tensor + dnnl_adcb, ///< permuted 4D tensor + dnnl_bacd, ///< permuted 4D tensor + dnnl_bcda, ///< permuted 4D tensor + dnnl_cdab, ///< permuted 4D tensor + dnnl_cdba, ///< permuted 4D tensor + dnnl_dcab, ///< permuted 4D tensor + dnnl_abced, ///< permuted 5D tensor + dnnl_abdec, ///< permuted 5D tensor + dnnl_acbde, ///< permuted 5D tensor + dnnl_acdeb, ///< permuted 5D tensor + dnnl_adecb, ///< permuted 5D tensor + dnnl_bacde, ///< permuted 5D tensor + dnnl_bcdea, ///< permuted 5D tensor + dnnl_cdeab, ///< permuted 5D tensor + dnnl_cdeba, ///< permuted 5D tensor + dnnl_decab, ///< permuted 5D tensor + dnnl_abcdfe, ///< permuted 6D tensor + dnnl_abdefc, ///< permuted 6D tensor + dnnl_abdfce, ///< permuted 6D tensor + dnnl_acbdef, ///< permuted 6D tensor + dnnl_adefcb, ///< permuted 6D tensor + dnnl_defcab, ///< permuted 6D tensor + dnnl_abcdegf, ///< permuted 7D tensor + dnnl_abcdefhg, ///< permuted 8D tensor + dnnl_abcdefgih, ///< permuted 9D tensor + dnnl_abcdefghji, ///< permuted 10D tensor + dnnl_abcdefghikj, ///< permuted 11D tensor + dnnl_abcdefghijlk, ///< permuted 12D tensor + + // Opaque blocked formats + + dnnl_Abc16a, 
+ dnnl_ABc16a16b, + dnnl_ABc32a32b, + dnnl_ABc4a4b, + /// 3D tensor blocked by 2nd dimension with block size 16 + dnnl_aBc16b, + dnnl_ABc16b16a, + dnnl_Abc4a, + /// 3D tensor blocked by 2nd dimension with block size 32 + dnnl_aBc32b, + /// 3D tensor blocked by 2nd dimension with block size 4 + dnnl_aBc4b, + dnnl_ABc4b16a4b, + dnnl_ABc2b8a4b, + dnnl_ABc16b16a4b, + dnnl_ABc16b16a2b, + dnnl_ABc4b4a, + dnnl_ABc8a16b2a, + dnnl_ABc8a8b, + dnnl_ABc8a4b, + /// 3D tensor blocked by 2nd dimension with block size 8 + dnnl_aBc8b, + dnnl_ABc8b16a2b, + dnnl_BAc8a16b2a, + dnnl_ABc8b8a, + dnnl_Abcd16a, + dnnl_Abcd8a, + dnnl_ABcd16a16b, + dnnl_Abcd32a, + dnnl_ABcd32a32b, + /// 4D tensor blocked by 2nd dimension with block size 16 + dnnl_aBcd16b, + dnnl_ABcd16b16a, + dnnl_aBCd16b16c, + dnnl_aBCd16c16b, + dnnl_Abcd4a, + /// 4D tensor blocked by 2nd dimension with block size 32 + dnnl_aBcd32b, + /// 4D tensor blocked by 2nd dimension with block size 4 + dnnl_aBcd4b, + dnnl_ABcd4b16a4b, + dnnl_ABcd16b16a4b, + dnnl_ABcd16b16a2b, + dnnl_ABcd4b4a, + dnnl_ABcd4a4b, + dnnl_aBCd2c4b2c, + dnnl_aBCd4b8c2b, + dnnl_aBCd4c16b4c, + dnnl_aBCd2c8b4c, + dnnl_aBCd16c16b4c, + dnnl_aBCd16c16b2c, + dnnl_aBCd4c4b, + dnnl_aBCd4b4c, + dnnl_ABcd8a16b2a, + dnnl_ABcd2b8a4b, + dnnl_ABcd8a8b, + dnnl_ABcd8a4b, + /// 4D tensor blocked by 2nd dimension with block size 8 + dnnl_aBcd8b, + dnnl_aBCd4c8b2c, + dnnl_ABcd8b16a2b, + dnnl_aBCd8b16c2b, + dnnl_BAcd8a16b2a, + /// 4D tensor blocked by 1st and 2nd dimension with block size 8 + dnnl_ABcd8b8a, + dnnl_aBCd8b8c, + dnnl_aBCd8b4c, + dnnl_aBCd8c16b2c, + dnnl_ABcde8a16b2a, + dnnl_aCBd8b16c2b, + dnnl_aBCd8c8b, + dnnl_Abcde16a, + dnnl_Abcde32a, + dnnl_ABcde16a16b, + dnnl_BAcde8a16b2a, + /// 4D tensor blocked by 3rd dimension with block size 4 + dnnl_aBCd2b4c2b, + /// 5D tensor blocked by 1st dimension with block size 16 + dnnl_ABcde4b16a4b, + /// 5D tensor blocked by 1st dimension with block size 8 + dnnl_ABcde2b8a4b, + /// 5D tensor blocked by 2nd dimension with block 
size 16 + dnnl_aBcde16b, + dnnl_ABcde16b16a, + dnnl_aBCde16b16c, + dnnl_aBCde16c16b, + dnnl_aBCde2c8b4c, + dnnl_Abcde4a, + /// 5D tensor blocked by 2nd dimension with block size 32 + dnnl_aBcde32b, + /// 5D tensor blocked by 2nd dimension with block size 4 + dnnl_aBcde4b, + dnnl_ABcde4b4a, + dnnl_ABcde4a4b, + dnnl_aBCde4b4c, + dnnl_aBCde2c4b2c, + dnnl_aBCde4b8c2b, + dnnl_aBCde4c16b4c, + dnnl_aBCde16c16b4c, + dnnl_aBCde16c16b2c, + dnnl_aBCde4c4b, + dnnl_Abcde8a, + dnnl_ABcde8a8b, + dnnl_ABcde8a4b, + dnnl_BAcde16b16a, + /// 5D tensor blocked by 2nd dimension with block size 8 + dnnl_aBcde8b, + dnnl_ABcde8b16a2b, + dnnl_aBCde8b16c2b, + dnnl_aBCde4c8b2c, + dnnl_aCBde8b16c2b, + dnnl_ABcde8b8a, + dnnl_ABcde32a32b, + dnnl_aBCde8b8c, + dnnl_aBCde8b4c, + dnnl_ABc4a8b8a4b, + dnnl_ABcd4a8b8a4b, + dnnl_ABcde4a8b8a4b, + dnnl_BAc4b8a8b4a, + dnnl_BAcd4b8a8b4a, + dnnl_BAcde4b8a8b4a, + dnnl_ABcd2a8b8a2b, + dnnl_aBCd4b8c8b4c, + dnnl_aBCde4b8c8b4c, + dnnl_aBCde2b8c8b2c, + dnnl_aBCde8c16b2c, + dnnl_aBCde8c8b, + /// 5D tensor blocked by 3rd dimension with block size 4 + dnnl_aBCde2b4c2b, + /// 6D tensor blocked by 2nd dimension with block size 16 + dnnl_aBcdef16b, + dnnl_aBCdef16b16c, + dnnl_aBCdef16c16b, + dnnl_aBCdef4c16b4c, + /// 6D tensor blocked by 2nd dimension with block size 8 + dnnl_aBCdef2c8b4c, + dnnl_aBCdef4c8b2c, + /// 6D tensor blocked by 3rd dimension with block size 4 + dnnl_aBCdef2b4c2b, + /// 6D tensor blocked by 2nd dimension with block size 4 + dnnl_aBcdef4b, + dnnl_aBCdef4c4b, + dnnl_aBCdef4b4c, + dnnl_aBCdef2c4b2c, + dnnl_aBCdef4b8c2b, + dnnl_aBCdef8b8c, + dnnl_aBCdef8b4c, + dnnl_aBCdef8c16b2c, + dnnl_aBCdef4b8c8b4c, + dnnl_aBCdef8b16c2b, + dnnl_aCBdef8b16c2b, + dnnl_aBCdef8c8b, + dnnl_aBdc16b, + dnnl_aBdC16b2c, + dnnl_aBdC16b4c, + dnnl_aBdc4b, + dnnl_aBdc8b, + dnnl_aBdec16b, + dnnl_aBdeC16b2c, + dnnl_aBdeC16b4c, + dnnl_aBdec32b, + dnnl_aBdec4b, + dnnl_aBdec8b, + dnnl_aBdefc16b, + dnnl_aBdefC16b2c, + dnnl_aCBdef16c16b, + dnnl_aBdefc4b, + dnnl_aBdefc8b, + 
dnnl_Abcdef16a, + dnnl_Abcdef32a, + dnnl_aBedc16b, + dnnl_Acb16a, + dnnl_AcB16a2b, + dnnl_AcB16a4b, + dnnl_Acb4a, + dnnl_Acb8a, + dnnl_aCBd16b16c, + dnnl_aCBd16c16b, + dnnl_aCBde16b16c, + dnnl_aCBde16c16b, + dnnl_Acdb16a, + dnnl_AcdB16a2b, + dnnl_AcdB16a4b, + dnnl_Acdb32a, + dnnl_Acdb4a, + dnnl_Acdb8a, + dnnl_Acdeb16a, + dnnl_AcdeB16a2b, + dnnl_Acdeb4a, + dnnl_Acdeb8a, + dnnl_Adcb16a, + dnnl_BAc16a16b, + dnnl_BAc16b16a, + dnnl_BAcd16a16b, + dnnl_BAcd16b16a, + dnnl_aCBd4c8b8c4b, + dnnl_aCBde4c8b8c4b, + dnnl_aCBdef4c8b8c4b, + dnnl_BAcde16a16b, + dnnl_aCBdef16b16c, + dnnl_ABc16b32a, + dnnl_ABc16b64a, + dnnl_ABc4b32a4b, + dnnl_ABc4b64a4b, + dnnl_ABc8b32a2b, + dnnl_ABc8b64a2b, + dnnl_AB16b16a, + dnnl_AB16b32a, + dnnl_AB16b64a, + dnnl_AB8b16a2b, + dnnl_AB8b32a2b, + dnnl_AB8b64a2b, + dnnl_AB4b16a4b, + dnnl_AB4b32a4b, + dnnl_AB4b64a4b, + dnnl_AB16b16a4b, + dnnl_ABcd16b32a, + dnnl_ABcd16b64a, + dnnl_ABcd4b32a4b, + dnnl_ABcd4b64a4b, + dnnl_ABcd8b32a2b, + dnnl_ABcd8b64a2b, + dnnl_ABcde4b32a4b, + dnnl_ABcde4b64a4b, + dnnl_ABcde16b16a4b, + dnnl_ABcde16b16a2b, + dnnl_ABcde16b32a, + dnnl_ABcde16b64a, + dnnl_ABcde8b32a2b, + dnnl_ABcde8b64a2b, + dnnl_aBCdef16c16b4c, + dnnl_aBCdef16c16b2c, + dnnl_AB32a32b8a4b, + dnnl_AB8a4b, + dnnl_AB32a32b8a2b, + dnnl_AB8a2b, + dnnl_abDc32d, + dnnl_abDC32d4c, + dnnl_abdEc32e, + dnnl_abdEC32e2c, + dnnl_abdEC32e4c, + dnnl_aBdefC16b4c, + dnnl_AcdeB16a4b, + dnnl_ABcd16a16b2a, + dnnl_ABc16a16b2a, + dnnl_aBCd16b16c2b, + dnnl_aBCde16b16c2b, + dnnl_Acb32a, + dnnl_AcB32a2b, + dnnl_AcB32a4b, + dnnl_Acb48a, + dnnl_AcB48a2b, + dnnl_AcB48a4b, + dnnl_Acb64a, + dnnl_AcB64a2b, + dnnl_AcB64a4b, + dnnl_cBa2b, + dnnl_cBa4b, + dnnl_aBdc32b, + dnnl_aBdC32b2c, + dnnl_aBdC32b4c, + dnnl_aBdc48b, + dnnl_aBdC48b2c, + dnnl_aBdC48b4c, + dnnl_aBdc64b, + dnnl_aBdC64b2c, + dnnl_aBdC64b4c, + dnnl_adCb2c, + dnnl_adCb4c, + dnnl_AcdB32a2b, + dnnl_AcdB32a4b, + dnnl_Acdb48a, + dnnl_AcdB48a2b, + dnnl_AcdB48a4b, + dnnl_Acdb64a, + dnnl_AcdB64a2b, + dnnl_AcdB64a4b, + dnnl_cdBa2b, + 
dnnl_cdBa4b, + dnnl_aBdeC32b2c, + dnnl_aBdeC32b4c, + dnnl_aBdec48b, + dnnl_aBdeC48b2c, + dnnl_aBdeC48b4c, + dnnl_aBdec64b, + dnnl_aBdeC64b2c, + dnnl_aBdeC64b4c, + dnnl_adeCb2c, + dnnl_adeCb4c, + dnnl_Acdeb32a, + dnnl_AcdeB32a2b, + dnnl_AcdeB32a4b, + dnnl_Acdeb48a, + dnnl_AcdeB48a2b, + dnnl_AcdeB48a4b, + dnnl_Acdeb64a, + dnnl_AcdeB64a2b, + dnnl_AcdeB64a4b, + dnnl_cdeBa2b, + dnnl_cdeBa4b, + dnnl_aBdefc32b, + dnnl_aBdefC32b2c, + dnnl_aBdefC32b4c, + dnnl_aBdefc48b, + dnnl_aBdefC48b2c, + dnnl_aBdefC48b4c, + dnnl_aBdefc64b, + dnnl_aBdefC64b2c, + dnnl_aBdefC64b4c, + dnnl_adefCb2c, + dnnl_adefCb4c, + dnnl_AB16b32a4b, + dnnl_AB16b48a4b, + dnnl_AB16b64a4b, + dnnl_AB16b16a2b, + dnnl_AB16b32a2b, + dnnl_AB16b48a2b, + dnnl_AB16b64a2b, + dnnl_ABc16b32a4b, + dnnl_ABc16b48a4b, + dnnl_ABc16b64a4b, + dnnl_ABc16b32a2b, + dnnl_ABc16b48a2b, + dnnl_ABc16b64a2b, + dnnl_ABcd16b32a4b, + dnnl_ABcd16b48a4b, + dnnl_ABcd16b64a4b, + dnnl_ABcd16b32a2b, + dnnl_ABcd16b48a2b, + dnnl_ABcd16b64a2b, + dnnl_ABcde16b32a4b, + dnnl_ABcde16b48a4b, + dnnl_ABcde16b64a4b, + dnnl_ABcde16b32a2b, + dnnl_ABcde16b48a2b, + dnnl_ABcde16b64a2b, + dnnl_ABc32a16b, + dnnl_ABcd32a16b, + dnnl_ABcde32a16b, + dnnl_AB48a16b, + dnnl_AB48a32b, + dnnl_ABc40a16b, + dnnl_ABc40a32b, + dnnl_aBC48b16c, + dnnl_aBC48b32c, + dnnl_ABcd40a16b, + dnnl_ABcd40a32b, + dnnl_abCd32c, + dnnl_abdCe32c, + dnnl_abdCE32c2e, + dnnl_BA16a16b2a, + dnnl_BA16a32b2a, + dnnl_BA16a48b2a, + dnnl_BA16a64b2a, + dnnl_BA16a16b4a, + dnnl_BA16a32b4a, + dnnl_BA16a48b4a, + dnnl_BA16a64b4a, + dnnl_ABcd8a2b, + dnnl_aBdeC16c16b2c, + dnnl_aBdeC16c16b4c, + dnnl_aBdefC16c16b2c, + dnnl_AcB16b16a2b, + dnnl_AcB16b16a4b, + dnnl_AcdB16b16a2b, + dnnl_AcdB16b16a4b, + dnnl_AcdeB16b16a2b, + dnnl_aBdefC16c16b4c, + dnnl_AcdeB16b16a4b, + dnnl_AcB16b32a2b, + dnnl_AcB16b32a4b, + dnnl_AcB16b48a2b, + dnnl_AcB16b48a4b, + dnnl_AcB16b64a2b, + dnnl_AcB16b64a4b, + dnnl_aBdC16c16b2c, + dnnl_aBdC16c16b4c, + dnnl_aBdC16c32b2c, + dnnl_aBdC16c32b4c, + dnnl_aBdC16c48b2c, + dnnl_aBdC16c48b4c, + 
dnnl_aBdC16c64b2c, + dnnl_aBdC16c64b4c, + dnnl_AcdB16b32a2b, + dnnl_AcdB16b32a4b, + dnnl_AcdB16b48a2b, + dnnl_AcdB16b48a4b, + dnnl_AcdB16b64a2b, + dnnl_AcdB16b64a4b, + dnnl_aBdeC16c32b2c, + dnnl_aBdeC16c32b4c, + dnnl_aBdeC16c48b2c, + dnnl_aBdeC16c48b4c, + dnnl_aBdeC16c64b2c, + dnnl_aBdeC16c64b4c, + dnnl_AcdeB16b32a2b, + dnnl_AcdeB16b32a4b, + dnnl_AcdeB16b48a2b, + dnnl_AcdeB16b48a4b, + dnnl_AcdeB16b64a2b, + dnnl_AcdeB16b64a4b, + dnnl_aBdefC16c32b2c, + dnnl_aBdefC16c32b4c, + dnnl_aBdefC16c48b2c, + dnnl_aBdefC16c48b4c, + dnnl_aBdefC16c64b2c, + dnnl_aBdefC16c64b4c, + dnnl_decbA16a, + dnnl_ABc4a2b, + dnnl_ABc8a2b, + dnnl_aBCd8b2c, + dnnl_ABcde4a2b, + dnnl_ABcde8a2b, + dnnl_ABcde40a16b, + dnnl_ABcde40a32b, + dnnl_aBCde8b2c, + dnnl_ABcde4a8b8a2b, + dnnl_ABcd4a8b8a2b, + dnnl_ABc4a8b8a2b, + dnnl_aBCdef4b8c8b2c, + dnnl_aBCde4b8c8b2c, + dnnl_aBCd4b8c8b2c, + dnnl_BAcde4b8a8b2a, + dnnl_BAcd4b8a8b2a, + dnnl_BAc4b8a8b2a, + dnnl_aCBdef4c8b8c2b, + dnnl_aCBde4c8b8c2b, + dnnl_aCBd4c8b8c2b, + dnnl_aBCdef8b2c, + dnnl_AB32a16b, + dnnl_AB32a32b, + dnnl_BA4b8a8b2a, + dnnl_BA4b8a8b4a, + dnnl_aBC32b16c, + dnnl_aBC32b32c, + dnnl_aCB4c8b8c2b, + dnnl_aCB4c8b8c4b, + dnnl_ABcd4a2b, + dnnl_ABc2b8a16b4a, + dnnl_ABcd2b8a16b4a, + dnnl_ABcde2b8a16b4a, + dnnl_ABc2a8b16a4b, + dnnl_ABc2a8b16a2b, + dnnl_ABc2b32a8b, + dnnl_ABcd2a8b16a4b, + dnnl_ABcd2a8b16a2b, + dnnl_aCBd2c8b16c2b, + dnnl_ABcd2b32a8b, + dnnl_aBCd2c8b16c2b, + dnnl_ABcde2a8b16a4b, + dnnl_ABcde2a8b16a2b, + dnnl_aCBde2c8b16c2b, + dnnl_ABcde2b32a8b, + dnnl_aBC2b8c16b2c, + dnnl_aBCd2b8c16b2c, + dnnl_aBCde2b8c16b2c, + dnnl_aBCdef2b8c16b2c, + dnnl_BAcde2b8a16b4a, + dnnl_BAcd2b8a16b4a, + dnnl_BAc2b8a16b4a, + dnnl_BAcde2b8a16b2a, + dnnl_BAcd2b8a16b2a, + dnnl_BAc2b8a16b2a, + dnnl_aBCde2c8b16c2b, + dnnl_aBCdef2c8b16c2b, + dnnl_aCBdef2c8b16c2b, + dnnl_aBCd2b8c16b4c, + dnnl_aBCde2b8c16b4c, + dnnl_BA4b8a16b2a, + dnnl_BA4b8a16b4a, + dnnl_aCB4c8b16c2b, + dnnl_aCB4c8b16c4b, + dnnl_BA16a16b, + dnnl_BA16a32b, + dnnl_BA16a48b, + dnnl_BA16a64b, + dnnl_aCB16c2b, 
+ dnnl_aCB16c4b, + dnnl_BA16b2a, + dnnl_BA16b4a, + dnnl_aBC16b16c, + dnnl_aBC16b32c, + dnnl_AB16a16b, + dnnl_AB16a32b, + dnnl_ABcde16a16b2a, + dnnl_aBCdef16b16c2b, + dnnl_Acedb16a, + dnnl_aBdfec16b, + dnnl_abdEC64e2c, + dnnl_abdEC64e4c, + dnnl_aCB16b16c, + dnnl_aCB16b32c, + dnnl_aCB16b48c, + dnnl_aCB16b64c, + dnnl_aCB16b16c2b, + dnnl_aCB16b32c2b, + dnnl_aCB16b48c2b, + dnnl_aCB16b64c2b, + dnnl_aCB16b16c4b, + dnnl_aCB16b32c4b, + dnnl_aCB16b48c4b, + dnnl_aCB16b64c4b, + dnnl_abCd4c, + dnnl_abCde4c, + dnnl_abCdef4c, + dnnl_abCde32c, + dnnl_abCdef32c, + dnnl_ABcd16a32b, + dnnl_decbA8a, + dnnl_aCdefB16b32c2b, + dnnl_aCdefB16b32c4b, + dnnl_aCdefB16b48c2b, + dnnl_aCdefB16b48c4b, + dnnl_aCdefB16b64c2b, + dnnl_aCdefB16b64c4b, + dnnl_BcdeA16a32b2a, + dnnl_BcdeA16a32b4a, + dnnl_BcdeA16a48b2a, + dnnl_BcdeA16a48b4a, + dnnl_BcdeA16a64b2a, + dnnl_BcdeA16a64b4a, + dnnl_aCdefb32c, + dnnl_aCdefB32c2b, + dnnl_aCdefB32c4b, + dnnl_aCdefb48c, + dnnl_aCdefB48c2b, + dnnl_aCdefB48c4b, + dnnl_aCdefb64c, + dnnl_aCdefB64c2b, + dnnl_aCdefB64c4b, + dnnl_Bcdea32b, + dnnl_BcdeA32b2a, + dnnl_BcdeA32b4a, + dnnl_Bcdea48b, + dnnl_BcdeA48b2a, + dnnl_BcdeA48b4a, + dnnl_Bcdea64b, + dnnl_BcdeA64b2a, + dnnl_BcdeA64b4a, + dnnl_Bca32b, + dnnl_BcA32b2a, + dnnl_BcA32b4a, + dnnl_Bca48b, + dnnl_BcA48b2a, + dnnl_BcA48b4a, + dnnl_Bca64b, + dnnl_BcA64b2a, + dnnl_BcA64b4a, + dnnl_aCdb32c, + dnnl_aCdB32c2b, + dnnl_aCdB32c4b, + dnnl_aCdb48c, + dnnl_aCdB48c2b, + dnnl_aCdB48c4b, + dnnl_aCdb64c, + dnnl_aCdB64c2b, + dnnl_aCdB64c4b, + dnnl_BcA16a16b2a, + dnnl_BcA16a16b4a, + dnnl_BcdA16a16b2a, + dnnl_BcdA16a16b4a, + dnnl_BcdeA16a16b2a, + dnnl_BcdeA16a16b4a, + dnnl_aCdB16b16c2b, + dnnl_aCdB16b16c4b, + dnnl_aCdeB16b16c2b, + dnnl_aCdeB16b16c4b, + dnnl_aCdefB16b16c2b, + dnnl_aCdefB16b16c4b, + dnnl_BcA16a32b2a, + dnnl_BcA16a32b4a, + dnnl_BcA16a48b2a, + dnnl_BcA16a48b4a, + dnnl_BcA16a64b2a, + dnnl_BcA16a64b4a, + dnnl_aCdB16b32c2b, + dnnl_aCdB16b32c4b, + dnnl_aCdB16b48c2b, + dnnl_aCdB16b48c4b, + dnnl_aCdB16b64c2b, + 
dnnl_aCdB16b64c4b, + dnnl_BcdA16a32b2a, + dnnl_BcdA16a32b4a, + dnnl_BcdA16a48b2a, + dnnl_BcdA16a48b4a, + dnnl_BcdA16a64b2a, + dnnl_BcdA16a64b4a, + dnnl_aCdeB16b32c2b, + dnnl_aCdeB16b32c4b, + dnnl_aCdeB16b48c2b, + dnnl_aCdeB16b48c4b, + dnnl_aCdeB16b64c2b, + dnnl_aCdeB16b64c4b, + dnnl_Bca16b, + dnnl_BcA16b2a, + dnnl_BcA16b4a, + dnnl_Bcda16b, + dnnl_BcdA16b2a, + dnnl_BcdA16b4a, + dnnl_Bcdea16b, + dnnl_BcdeA16b2a, + dnnl_BcdeA16b4a, + dnnl_aCdb16c, + dnnl_aCdB16c2b, + dnnl_aCdB16c4b, + dnnl_aCdeb16c, + dnnl_aCdeB16c2b, + dnnl_aCdeB16c4b, + dnnl_aCdefb16c, + dnnl_aCdefB16c2b, + dnnl_aCdefB16c4b, + dnnl_Bcda32b, + dnnl_BcdA32b2a, + dnnl_BcdA32b4a, + dnnl_Bcda48b, + dnnl_BcdA48b2a, + dnnl_BcdA48b4a, + dnnl_Bcda64b, + dnnl_BcdA64b2a, + dnnl_BcdA64b4a, + dnnl_aCdeb32c, + dnnl_aCdeB32c2b, + dnnl_aCdeB32c4b, + dnnl_aCdeb48c, + dnnl_aCdeB48c2b, + dnnl_aCdeB48c4b, + dnnl_aCdeb64c, + dnnl_aCdeB64c2b, + dnnl_aCdeB64c4b, + dnnl_Acb24a, + dnnl_Acdb24a, + dnnl_Acdeb24a, + dnnl_aBdc24b, + dnnl_aBdec24b, + dnnl_aBdefc24b, + dnnl_abDc16d, + dnnl_abdEc16e, + dnnl_abdCe16c, + dnnl_AcB24a2b, + dnnl_AcdB24a2b, + dnnl_AcdeB24a2b, + dnnl_aBdC24b2c, + dnnl_aBdeC24b2c, + dnnl_aBdefC24b2c, + dnnl_AcB8a2b, + dnnl_AcdB8a2b, + dnnl_AcdeB8a2b, + dnnl_aBdC8b2c, + dnnl_aBdeC8b2c, + dnnl_aBdefC8b2c, + dnnl_AB8b32a, + dnnl_ABc8b32a, + dnnl_ABcd8b32a, + dnnl_ABcde8b32a, + dnnl_AB8b24a, + dnnl_ABc8b24a, + dnnl_ABcd8b24a, + dnnl_ABcde8b24a, + dnnl_AB8b16a, + dnnl_ABc8b16a, + dnnl_ABcd8b16a, + dnnl_ABcde8b16a, + dnnl_AB8b8a, + dnnl_AB4b8a4b, + dnnl_AB4b24a4b, + dnnl_ABc4b8a4b, + dnnl_ABc4b24a4b, + dnnl_ABcd4b8a4b, + dnnl_ABcd4b24a4b, + dnnl_ABcde4b8a4b, + dnnl_ABcde4b24a4b, + dnnl_AB8b24a2b, + dnnl_ABc8b24a2b, + dnnl_ABcd8b24a2b, + dnnl_ABcde8b24a2b, + dnnl_AB8b8a2b, + dnnl_ABc8b8a2b, + dnnl_ABcd8b8a2b, + dnnl_ABcde8b8a2b, + dnnl_AcB24a4b, + dnnl_AcdB24a4b, + dnnl_AcdeB24a4b, + dnnl_aBdC24b4c, + dnnl_aBdeC24b4c, + dnnl_aBdefC24b4c, + dnnl_AcB8a4b, + dnnl_AcdB8a4b, + dnnl_AcdeB8a4b, + dnnl_aBdC8b4c, + 
dnnl_aBdeC8b4c, + dnnl_aBdefC8b4c, + dnnl_Bca8b, + dnnl_BcA8b2a, + dnnl_Bcda8b, + dnnl_BcdA8b2a, + dnnl_Bcdea8b, + dnnl_BcdeA8b2a, + dnnl_aCdb8c, + dnnl_aCdB8c2b, + dnnl_aCdeb8c, + dnnl_aCdeB8c2b, + dnnl_aCdefb8c, + dnnl_aCdefB8c2b, + dnnl_Bca24b, + dnnl_BcA24b2a, + dnnl_Bcda24b, + dnnl_BcdA24b2a, + dnnl_Bcdea24b, + dnnl_BcdeA24b2a, + dnnl_aCdb24c, + dnnl_aCdB24c2b, + dnnl_aCdeb24c, + dnnl_aCdeB24c2b, + dnnl_aCdefb24c, + dnnl_aCdefB24c2b, + dnnl_BcA8b4a, + dnnl_BcdA8b4a, + dnnl_BcdeA8b4a, + dnnl_aCdB8c4b, + dnnl_aCdeB8c4b, + dnnl_aCdefB8c4b, + dnnl_BcA24b4a, + dnnl_BcdA24b4a, + dnnl_BcdeA24b4a, + dnnl_aCdB24c4b, + dnnl_aCdeB24c4b, + dnnl_aCdefB24c4b, + dnnl_AB16b48a, + dnnl_ABc16b48a, + dnnl_ABcd16b48a, + dnnl_ABcde16b48a, + dnnl_ABc16a4b, + dnnl_ABcd16a4b, + dnnl_ABcde16a4b, + dnnl_defcbA16a, + dnnl_defcbA8a, + dnnl_AcB16b64a, + dnnl_AcdB16b64a, + dnnl_AcdeB16b64a, + dnnl_AcB16b48a, + dnnl_AcdB16b48a, + dnnl_AcdeB16b48a, + dnnl_AcB16b32a, + dnnl_AcdB16b32a, + dnnl_AcdeB16b32a, + dnnl_AcB16b16a, + dnnl_AcdB16b16a, + dnnl_AcdeB16b16a, + dnnl_AcB8b32a, + dnnl_AcdB8b32a, + dnnl_AcdeB8b32a, + dnnl_AcB8b24a, + dnnl_AcdB8b24a, + dnnl_AcdeB8b24a, + dnnl_AcB8b16a, + dnnl_AcdB8b16a, + dnnl_AcdeB8b16a, + dnnl_AcB8b8a, + dnnl_AcdB8b8a, + dnnl_AcdeB8b8a, + dnnl_AcB8b64a2b, + dnnl_AcdB8b64a2b, + dnnl_AcdeB8b64a2b, + dnnl_AcB8b32a2b, + dnnl_AcdB8b32a2b, + dnnl_AcdeB8b32a2b, + dnnl_AcB8b24a2b, + dnnl_AcdB8b24a2b, + dnnl_AcdeB8b24a2b, + dnnl_AcB8b16a2b, + dnnl_AcdB8b16a2b, + dnnl_AcdeB8b16a2b, + dnnl_AcB8b8a2b, + dnnl_AcdB8b8a2b, + dnnl_AcdeB8b8a2b, + dnnl_AcB4b64a4b, + dnnl_AcdB4b64a4b, + dnnl_AcdeB4b64a4b, + dnnl_AcB4b32a4b, + dnnl_AcdB4b32a4b, + dnnl_AcdeB4b32a4b, + dnnl_AcB4b24a4b, + dnnl_AcdB4b24a4b, + dnnl_AcdeB4b24a4b, + dnnl_AcB4b16a4b, + dnnl_AcdB4b16a4b, + dnnl_AcdeB4b16a4b, + dnnl_AcB4b8a4b, + dnnl_AcdB4b8a4b, + dnnl_AcdeB4b8a4b, + dnnl_Ab4a, + dnnl_Ab8a, + dnnl_BA4b4a, + dnnl_BA8b4a, + dnnl_BA2a24b, + dnnl_aCB2b24c, + dnnl_BA2a8b, + dnnl_aCB2b8c, + dnnl_BA8a24b, + 
dnnl_aCB8b24c, + dnnl_BA8a16b, + dnnl_aCB8b16c, + dnnl_BA8a8b, + dnnl_aCB8b8c, + dnnl_bcad, + dnnl_cabd, + dnnl_dabc, + dnnl_Ab32a, + dnnl_aCBd8b8c, + dnnl_aCBde8b8c, + dnnl_BAc8a8b, + dnnl_BAcd8a8b, + dnnl_BAcde8a8b, + dnnl_aCBdef8b8c, + dnnl_abdEC16e4c, + dnnl_abDC16d4c, + + /// Just a sentinel, not real memory format tag. Must be changed after new + /// format tag is added. + dnnl_format_tag_last, + + // Aliases + + /// 1D tensor, an alias to #dnnl_a + dnnl_x = dnnl_a, + /// 2D CNN activations tensor, an alias to #dnnl_ab + dnnl_nc = dnnl_ab, + /// 2D CNN activations tensor, an alias to #dnnl_ba + dnnl_cn = dnnl_ba, + /// 2D RNN statistics tensor, an alias to #dnnl_ab + dnnl_tn = dnnl_ab, + /// 2D RNN statistics tensor, an alias to #dnnl_ba + dnnl_nt = dnnl_ba, + /// 3D CNN activations tensor, an alias to #dnnl_abc + dnnl_ncw = dnnl_abc, + /// 3D CNN activations tensor, an alias to #dnnl_acb + dnnl_nwc = dnnl_acb, + /// 4D CNN activations tensor, an alias to #dnnl_abcd + dnnl_nchw = dnnl_abcd, + /// 4D CNN activations tensor, an alias to #dnnl_acdb + dnnl_nhwc = dnnl_acdb, + /// 4D CNN activations tensor, an alias to #dnnl_bcda + dnnl_chwn = dnnl_bcda, + /// 5D CNN activations tensor, an alias to #dnnl_abcde + dnnl_ncdhw = dnnl_abcde, + /// 5D CNN activations tensor, an alias to #dnnl_acdeb + dnnl_ndhwc = dnnl_acdeb, + + /// 2D CNN weights tensor, an alias to #dnnl_ab + dnnl_oi = dnnl_ab, + /// 2D CNN weights tensor, an alias to #dnnl_ba + dnnl_io = dnnl_ba, + /// 3D CNN weights tensor, an alias to #dnnl_abc + dnnl_oiw = dnnl_abc, + /// 3D CNN weights tensor, an alias to #dnnl_acb + dnnl_owi = dnnl_acb, + /// 3D CNN weights tensor, an alias to #dnnl_cba + dnnl_wio = dnnl_cba, + /// 3D CNN weights tensor, an alias to #dnnl_cab + dnnl_woi = dnnl_cab, + /// 3D CNN weights tensor, an alias to #dnnl_bca + dnnl_iwo = dnnl_bca, + /// 4D CNN weights tensor, an alias to #dnnl_abcd + dnnl_oihw = dnnl_abcd, + /// 4D CNN weights tensor, an alias to #dnnl_cdba + dnnl_hwio = 
dnnl_cdba, + /// 4D CNN weights tensor, an alias to #dnnl_cdab + dnnl_hwoi = dnnl_cdab, + /// 4D CNN weights tensor, an alias to #dnnl_acdb + dnnl_ohwi = dnnl_acdb, + /// 4D CNN weights tensor, an alias to #dnnl_bcda + dnnl_ihwo = dnnl_bcda, + /// 4D CNN weights tensor, an alias to #dnnl_bacd + dnnl_iohw = dnnl_bacd, + /// 5D CNN weights tensor, an alias to #dnnl_abcde + dnnl_oidhw = dnnl_abcde, + /// 5D CNN weights tensor, an alias to #dnnl_bacde + dnnl_iodhw = dnnl_bacde, + /// 5D CNN weights tensor, an alias to #dnnl_cdeba + dnnl_dhwio = dnnl_cdeba, + /// 5D CNN weights tensor, an alias to #dnnl_cdeab + dnnl_dhwoi = dnnl_cdeab, + /// 5D CNN weights tensor, an alias to #dnnl_acdeb + dnnl_odhwi = dnnl_acdeb, + /// 5D CNN weights tensor, an alias to #dnnl_bcdea + dnnl_idhwo = dnnl_bcdea, + + /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_abcd + dnnl_goiw = dnnl_abcd, + /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_abdc + dnnl_gowi = dnnl_abdc, + /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_dcab + dnnl_wigo = dnnl_dcab, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_abcde + dnnl_goihw = dnnl_abcde, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_abdec + dnnl_gohwi = dnnl_abdec, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_decab + dnnl_hwigo = dnnl_decab, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_acbde + dnnl_giohw = dnnl_acbde, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_abcdef + dnnl_goidhw = dnnl_abcdef, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_abdefc + dnnl_godhwi = dnnl_abdefc, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_acbdef + dnnl_giodhw = dnnl_acbdef, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_defcab + dnnl_dhwigo = dnnl_defcab, + + /// 3D RNN data tensor in the format (seq_length, batch, input channels), + /// an alias to #dnnl_abc. 
+ dnnl_tnc = dnnl_abc, + /// 3D RNN data tensor in the format (batch, seq_length, input channels), + /// an alias to #dnnl_bac. + dnnl_ntc = dnnl_bac, + /// 4D RNN states tensor in the format (num_layers, num_directions, + /// batch, state channels), an alias to #dnnl_abcd. + dnnl_ldnc = dnnl_abcd, + /// 5D RNN weights tensor in the format (num_layers, num_directions, + /// input_channels, num_gates, output_channels), an alias to #dnnl_abcde. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. + dnnl_ldigo = dnnl_abcde, + /// 5D RNN weights tensor in the format (num_layers, num_directions, + /// num_gates, output_channels, input_channels), an alias to #dnnl_abdec. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. + dnnl_ldgoi = dnnl_abdec, + /// 4D LSTM projection tensor in the format (num_layers, num_directions, + /// num_channels_in_hidden_state, num_channels_in_recurrent_projection), + /// an alias to #dnnl_abcd. + dnnl_ldio = dnnl_abcd, + /// 4D LSTM projection tensor in the format (num_layers, num_directions, + /// num_channels_in_recurrent_projection, num_channels_in_hidden_state), + /// an alias to #dnnl_abdc. + dnnl_ldoi = dnnl_abdc, + /// 4D RNN bias tensor in the format (num_layers, num_directions, + /// num_gates, output_channels), an alias to #dnnl_abcd. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. 
+ dnnl_ldgo = dnnl_abcd, + /// 5D LSTM projection tensor + dnnl_ldOi16o = dnnl_abDc16d, + dnnl_ldOi32o = dnnl_abDc32d, + dnnl_ldOI16o4i = dnnl_abDC16d4c, + dnnl_ldOI32o4i = dnnl_abDC32d4c, + dnnl_ldIo32i = dnnl_abCd32c, + /// 6D RNN weights tensor + dnnl_ldgOi16o = dnnl_abdEc16e, + dnnl_ldgOI16o4i = dnnl_abdEC16e4c, + dnnl_ldgOi32o = dnnl_abdEc32e, + dnnl_ldgOI32o2i = dnnl_abdEC32e2c, + dnnl_ldgOI32o4i = dnnl_abdEC32e4c, + dnnl_ldgOI64o2i = dnnl_abdEC64e2c, + dnnl_ldgOI64o4i = dnnl_abdEC64e4c, + dnnl_ldgIo16i = dnnl_abdCe16c, + dnnl_ldgIo32i = dnnl_abdCe32c, + dnnl_ldgIO32i2o = dnnl_abdCE32c2e, + + // Opaque data types, are not to be used explicitly + + // data + /// 5D CNN activations tensor blocked by channels with block size 32, + /// an alias to #dnnl_aBcde32b + dnnl_nCdhw32c = dnnl_aBcde32b, + /// 5D CNN activations tensor blocked by channels with block size 16, + /// an alias to #dnnl_aBcde16b + dnnl_nCdhw16c = dnnl_aBcde16b, + /// 5D CNN activations tensor blocked by channels with block size 4, + /// an alias to #dnnl_aBcde4b + dnnl_nCdhw4c = dnnl_aBcde4b, + /// 5D CNN activations tensor blocked by channels with block size 8, + /// an alias to #dnnl_aBcde8b + dnnl_nCdhw8c = dnnl_aBcde8b, + /// 4D CNN activations tensor blocked by channels with block size 32, + /// an alias to #dnnl_aBcd32b + dnnl_nChw32c = dnnl_aBcd32b, + /// 4D CNN activations tensor blocked by channels with block size 16, + /// an alias to #dnnl_aBcd16b + dnnl_nChw16c = dnnl_aBcd16b, + /// 4D CNN activations tensor blocked by channels with block size 4, + /// an alias to #dnnl_aBcd4b + dnnl_nChw4c = dnnl_aBcd4b, + /// 4D CNN activations tensor blocked by channels with block size 8, + /// an alias to #dnnl_aBcd8b + dnnl_nChw8c = dnnl_aBcd8b, + /// 3D CNN activations tensor blocked by channels with block size 32, + /// an alias to #dnnl_aBc32b + dnnl_nCw32c = dnnl_aBc32b, + /// 3D CNN activations tensor blocked by channels with block size 16, + /// an alias to #dnnl_aBc16b + dnnl_nCw16c = 
dnnl_aBc16b, + /// 3D CNN activations tensor blocked by channels with block size 4, + /// an alias to #dnnl_aBc4b + dnnl_nCw4c = dnnl_aBc4b, + /// 3D CNN activations tensor blocked by channels with block size 8, + /// an alias to #dnnl_aBc8b + dnnl_nCw8c = dnnl_aBc8b, + dnnl_NCw16n16c = dnnl_ABc16a16b, + dnnl_NCdhw16n16c = dnnl_ABcde16a16b, + dnnl_NChw16n16c = dnnl_ABcd16a16b, + dnnl_NCw32n16c = dnnl_ABc32a16b, + dnnl_NChw32n16c = dnnl_ABcd32a16b, + dnnl_NChw16n32c = dnnl_ABcd16a32b, + dnnl_NCdhw32n16c = dnnl_ABcde32a16b, + dnnl_NCw32n32c = dnnl_ABc32a32b, + dnnl_NChw32n32c = dnnl_ABcd32a32b, + dnnl_NCdhw32n32c = dnnl_ABcde32a32b, + + // weights, 2D + dnnl_OI16i16o = dnnl_AB16b16a, + dnnl_OI16i32o = dnnl_AB16b32a, + dnnl_OI16i48o = dnnl_AB16b48a, + dnnl_OI16i64o = dnnl_AB16b64a, + dnnl_OI8i8o2i = dnnl_AB8b8a2b, + dnnl_OI8i16o2i = dnnl_AB8b16a2b, + dnnl_OI8i24o2i = dnnl_AB8b24a2b, + dnnl_OI8i32o2i = dnnl_AB8b32a2b, + dnnl_OI8i64o2i = dnnl_AB8b64a2b, + dnnl_OI4i8o4i = dnnl_AB4b8a4b, + dnnl_OI4i16o4i = dnnl_AB4b16a4b, + dnnl_OI4i24o4i = dnnl_AB4b24a4b, + dnnl_OI4i32o4i = dnnl_AB4b32a4b, + dnnl_OI4i64o4i = dnnl_AB4b64a4b, + dnnl_OI16i16o4i = dnnl_AB16b16a4b, + dnnl_OI8i32o = dnnl_AB8b32a, + dnnl_OI8i24o = dnnl_AB8b24a, + dnnl_OI8i16o = dnnl_AB8b16a, + dnnl_OI8i8o = dnnl_AB8b8a, + + // weights, 3D + dnnl_IOw8o8i = dnnl_BAc8a8b, + dnnl_IOw16o16i = dnnl_BAc16a16b, + dnnl_IOw16i16o = dnnl_BAc16b16a, + dnnl_OIw16i16o = dnnl_ABc16b16a, + dnnl_OwI16i16o = dnnl_AcB16b16a, + dnnl_OIw16i32o = dnnl_ABc16b32a, + dnnl_OwI16i32o = dnnl_AcB16b32a, + dnnl_OIw16i48o = dnnl_ABc16b48a, + dnnl_OwI16i48o = dnnl_AcB16b48a, + dnnl_OIw16i64o = dnnl_ABc16b64a, + dnnl_OwI16i64o = dnnl_AcB16b64a, + dnnl_OIw16o16i = dnnl_ABc16a16b, + dnnl_Oiw16o = dnnl_Abc16a, + dnnl_OIw4i8o4i = dnnl_ABc4b8a4b, + dnnl_OwI4i8o4i = dnnl_AcB4b8a4b, + dnnl_OIw4i16o4i = dnnl_ABc4b16a4b, + dnnl_OwI4i16o4i = dnnl_AcB4b16a4b, + dnnl_OIw4i24o4i = dnnl_ABc4b24a4b, + dnnl_OwI4i24o4i = dnnl_AcB4b24a4b, + dnnl_OIw4i32o4i = 
dnnl_ABc4b32a4b, + dnnl_OwI4i32o4i = dnnl_AcB4b32a4b, + dnnl_OIw4i64o4i = dnnl_ABc4b64a4b, + dnnl_OwI4i64o4i = dnnl_AcB4b64a4b, + dnnl_OIw2i8o4i = dnnl_ABc2b8a4b, + dnnl_OIw16i16o4i = dnnl_ABc16b16a4b, + dnnl_OIw16i16o2i = dnnl_ABc16b16a2b, + dnnl_OIw16o16i2o = dnnl_ABc16a16b2a, + dnnl_OIw4i4o = dnnl_ABc4b4a, + dnnl_OIw4o4i = dnnl_ABc4a4b, + dnnl_Oiw4o = dnnl_Abc4a, + dnnl_OIw8i8o2i = dnnl_ABc8b8a2b, + dnnl_OwI8i8o2i = dnnl_AcB8b8a2b, + dnnl_OIw8i16o2i = dnnl_ABc8b16a2b, + dnnl_OwI8i16o2i = dnnl_AcB8b16a2b, + dnnl_OIw8i24o2i = dnnl_ABc8b24a2b, + dnnl_OwI8i24o2i = dnnl_AcB8b24a2b, + dnnl_OIw8i32o2i = dnnl_ABc8b32a2b, + dnnl_OwI8i32o2i = dnnl_AcB8b32a2b, + dnnl_OIw8i64o2i = dnnl_ABc8b64a2b, + dnnl_OwI8i64o2i = dnnl_AcB8b64a2b, + dnnl_OIw8i8o = dnnl_ABc8b8a, + dnnl_OwI8i8o = dnnl_AcB8b8a, + dnnl_OIw8o16i2o = dnnl_ABc8a16b2a, + dnnl_IOw8o16i2o = dnnl_BAc8a16b2a, + dnnl_OIw8o8i = dnnl_ABc8a8b, + dnnl_OIw8o4i = dnnl_ABc8a4b, + dnnl_Owi16o = dnnl_Acb16a, + dnnl_OwI16o2i = dnnl_AcB16a2b, + dnnl_OwI16o4i = dnnl_AcB16a4b, + dnnl_Iwo8i = dnnl_Bca8b, + dnnl_IwO8i2o = dnnl_BcA8b2a, + dnnl_IwO8i4o = dnnl_BcA8b4a, + dnnl_Iwo16i = dnnl_Bca16b, + dnnl_IwO16i2o = dnnl_BcA16b2a, + dnnl_IwO16i4o = dnnl_BcA16b4a, + dnnl_Iwo24i = dnnl_Bca24b, + dnnl_IwO24i2o = dnnl_BcA24b2a, + dnnl_IwO24i4o = dnnl_BcA24b4a, + dnnl_Owi4o = dnnl_Acb4a, + dnnl_Owi8o = dnnl_Acb8a, + dnnl_OwI8o2i = dnnl_AcB8a2b, + dnnl_OIw8i32o = dnnl_ABc8b32a, + dnnl_OwI8i32o = dnnl_AcB8b32a, + dnnl_OIw8i24o = dnnl_ABc8b24a, + dnnl_OwI8i24o = dnnl_AcB8b24a, + dnnl_OIw8i16o = dnnl_ABc8b16a, + dnnl_OwI8i16o = dnnl_AcB8b16a, + dnnl_OwI8o4i = dnnl_AcB8a4b, + + // weights, 4D + dnnl_IOhw16i16o = dnnl_BAcd16b16a, + dnnl_IOhw8o8i = dnnl_BAcd8a8b, + dnnl_IOhw16o16i = dnnl_BAcd16a16b, + dnnl_Ohwi16o = dnnl_Acdb16a, + dnnl_OhwI16o2i = dnnl_AcdB16a2b, + dnnl_OhwI16o4i = dnnl_AcdB16a4b, + dnnl_Ihwo8i = dnnl_Bcda8b, + dnnl_IhwO8i2o = dnnl_BcdA8b2a, + dnnl_IhwO8i4o = dnnl_BcdA8b4a, + dnnl_Ihwo16i = dnnl_Bcda16b, + dnnl_IhwO16i2o = 
dnnl_BcdA16b2a, + dnnl_IhwO16i4o = dnnl_BcdA16b4a, + dnnl_Ihwo24i = dnnl_Bcda24b, + dnnl_IhwO24i2o = dnnl_BcdA24b2a, + dnnl_IhwO24i4o = dnnl_BcdA24b4a, + dnnl_Ohwi24o = dnnl_Acdb24a, + dnnl_Ohwi32o = dnnl_Acdb32a, + dnnl_Ohwi4o = dnnl_Acdb4a, + dnnl_Ohwi8o = dnnl_Acdb8a, + dnnl_OhwI8o2i = dnnl_AcdB8a2b, + dnnl_OhwI8o4i = dnnl_AcdB8a4b, + dnnl_OIhw16i16o = dnnl_ABcd16b16a, + dnnl_OhwI16i16o = dnnl_AcdB16b16a, + dnnl_OIhw16i32o = dnnl_ABcd16b32a, + dnnl_OhwI16i32o = dnnl_AcdB16b32a, + dnnl_OIhw16i48o = dnnl_ABcd16b48a, + dnnl_OhwI16i48o = dnnl_AcdB16b48a, + dnnl_OIhw16i64o = dnnl_ABcd16b64a, + dnnl_OhwI16i64o = dnnl_AcdB16b64a, + dnnl_OIhw16o16i = dnnl_ABcd16a16b, + dnnl_Oihw16o = dnnl_Abcd16a, + dnnl_OIhw4i8o4i = dnnl_ABcd4b8a4b, + dnnl_OhwI4i8o4i = dnnl_AcdB4b8a4b, + dnnl_OIhw4i16o4i = dnnl_ABcd4b16a4b, + dnnl_OhwI4i16o4i = dnnl_AcdB4b16a4b, + dnnl_OIhw4i24o4i = dnnl_ABcd4b24a4b, + dnnl_OhwI4i24o4i = dnnl_AcdB4b24a4b, + dnnl_OIhw4i32o4i = dnnl_ABcd4b32a4b, + dnnl_OhwI4i32o4i = dnnl_AcdB4b32a4b, + dnnl_OIhw4i64o4i = dnnl_ABcd4b64a4b, + dnnl_OhwI4i64o4i = dnnl_AcdB4b64a4b, + dnnl_OIhw16i16o4i = dnnl_ABcd16b16a4b, + dnnl_OIhw16i16o2i = dnnl_ABcd16b16a2b, + dnnl_OIhw16o16i2o = dnnl_ABcd16a16b2a, + dnnl_OIhw4i4o = dnnl_ABcd4b4a, + dnnl_OIhw4o4i = dnnl_ABcd4a4b, + dnnl_Oihw4o = dnnl_Abcd4a, + dnnl_OIhw8i8o2i = dnnl_ABcd8b8a2b, + dnnl_OhwI8i8o2i = dnnl_AcdB8b8a2b, + dnnl_OIhw8i16o2i = dnnl_ABcd8b16a2b, + dnnl_OhwI8i16o2i = dnnl_AcdB8b16a2b, + dnnl_OIhw8i32o2i = dnnl_ABcd8b32a2b, + dnnl_OhwI8i32o2i = dnnl_AcdB8b32a2b, + dnnl_OIhw8i24o2i = dnnl_ABcd8b24a2b, + dnnl_OhwI8i24o2i = dnnl_AcdB8b24a2b, + dnnl_OIhw8i64o2i = dnnl_ABcd8b64a2b, + dnnl_OhwI8i64o2i = dnnl_AcdB8b64a2b, + dnnl_OIhw8i8o = dnnl_ABcd8b8a, + dnnl_OhwI8i8o = dnnl_AcdB8b8a, + dnnl_OIhw8o16i2o = dnnl_ABcd8a16b2a, + dnnl_OIhw2i8o4i = dnnl_ABcd2b8a4b, + dnnl_IOhw8o16i2o = dnnl_BAcd8a16b2a, + dnnl_OIhw8o8i = dnnl_ABcd8a8b, + dnnl_OIhw8o4i = dnnl_ABcd8a4b, + dnnl_Owhi16o = dnnl_Adcb16a, + dnnl_OIhw8i32o = 
dnnl_ABcd8b32a, + dnnl_OhwI8i32o = dnnl_AcdB8b32a, + dnnl_OIhw8i24o = dnnl_ABcd8b24a, + dnnl_OhwI8i24o = dnnl_AcdB8b24a, + dnnl_OIhw8i16o = dnnl_ABcd8b16a, + dnnl_OhwI8i16o = dnnl_AcdB8b16a, + + // weights, 5D + dnnl_Odhwi16o = dnnl_Acdeb16a, + dnnl_OdhwI16o2i = dnnl_AcdeB16a2b, + dnnl_OdhwI16o4i = dnnl_AcdeB16a4b, + dnnl_Idhwo8i = dnnl_Bcdea8b, + dnnl_IdhwO8i2o = dnnl_BcdeA8b2a, + dnnl_IdhwO8i4o = dnnl_BcdeA8b4a, + dnnl_Idhwo16i = dnnl_Bcdea16b, + dnnl_IdhwO16i2o = dnnl_BcdeA16b2a, + dnnl_IdhwO16i4o = dnnl_BcdeA16b4a, + dnnl_Idhwo24i = dnnl_Bcdea24b, + dnnl_IdhwO24i2o = dnnl_BcdeA24b2a, + dnnl_IdhwO24i4o = dnnl_BcdeA24b4a, + dnnl_Odhwi4o = dnnl_Acdeb4a, + dnnl_Odhwi8o = dnnl_Acdeb8a, + dnnl_OdhwI8o2i = dnnl_AcdeB8a2b, + dnnl_OdhwI8o4i = dnnl_AcdeB8a4b, + dnnl_Odwhi16o = dnnl_Acedb16a, + dnnl_OIdhw16i16o = dnnl_ABcde16b16a, + dnnl_OdhwI16i16o = dnnl_AcdeB16b16a, + dnnl_OIdhw16i32o = dnnl_ABcde16b32a, + dnnl_OdhwI16i32o = dnnl_AcdeB16b32a, + dnnl_OIdhw16i48o = dnnl_ABcde16b48a, + dnnl_OdhwI16i48o = dnnl_AcdeB16b48a, + dnnl_OIdhw16i64o = dnnl_ABcde16b64a, + dnnl_OdhwI16i64o = dnnl_AcdeB16b64a, + dnnl_OIdhw16o16i = dnnl_ABcde16a16b, + dnnl_Oidhw16o = dnnl_Abcde16a, + dnnl_OIdhw4i4o = dnnl_ABcde4b4a, + dnnl_OIdhw4o4i = dnnl_ABcde4a4b, + dnnl_Oidhw4o = dnnl_Abcde4a, + dnnl_OIdhw8i8o2i = dnnl_ABcde8b8a2b, + dnnl_OdhwI8i8o2i = dnnl_AcdeB8b8a2b, + dnnl_OIdhw8i16o2i = dnnl_ABcde8b16a2b, + dnnl_OdhwI8i16o2i = dnnl_AcdeB8b16a2b, + dnnl_OIdhw8i32o2i = dnnl_ABcde8b32a2b, + dnnl_OdhwI8i32o2i = dnnl_AcdeB8b32a2b, + dnnl_OIdhw8i24o2i = dnnl_ABcde8b24a2b, + dnnl_OdhwI8i24o2i = dnnl_AcdeB8b24a2b, + dnnl_OIdhw8i64o2i = dnnl_ABcde8b64a2b, + dnnl_OdhwI8i64o2i = dnnl_AcdeB8b64a2b, + dnnl_OIdhw8i8o = dnnl_ABcde8b8a, + dnnl_OdhwI8i8o = dnnl_AcdeB8b8a, + dnnl_OIdhw8o16i2o = dnnl_ABcde8a16b2a, + dnnl_IOdhw8o16i2o = dnnl_BAcde8a16b2a, + dnnl_OIdhw4i8o4i = dnnl_ABcde4b8a4b, + dnnl_OdhwI4i8o4i = dnnl_AcdeB4b8a4b, + dnnl_OIdhw4i16o4i = dnnl_ABcde4b16a4b, + dnnl_OdhwI4i16o4i = dnnl_AcdeB4b16a4b, 
+ dnnl_OIdhw4i24o4i = dnnl_ABcde4b24a4b, + dnnl_OdhwI4i24o4i = dnnl_AcdeB4b24a4b, + dnnl_OIdhw4i32o4i = dnnl_ABcde4b32a4b, + dnnl_OdhwI4i32o4i = dnnl_AcdeB4b32a4b, + dnnl_OIdhw4i64o4i = dnnl_ABcde4b64a4b, + dnnl_OdhwI4i64o4i = dnnl_AcdeB4b64a4b, + dnnl_OIdhw16i16o4i = dnnl_ABcde16b16a4b, + dnnl_OIdhw16i16o2i = dnnl_ABcde16b16a2b, + dnnl_OIdhw2i8o4i = dnnl_ABcde2b8a4b, + dnnl_OIdhw8o8i = dnnl_ABcde8a8b, + dnnl_OIdhw8o4i = dnnl_ABcde8a4b, + dnnl_IOdhw16i16o = dnnl_BAcde16b16a, + dnnl_OIdhw4o8i8o4i = dnnl_ABcde4a8b8a4b, + dnnl_IOdhw8o8i = dnnl_BAcde8a8b, + dnnl_IOdhw16o16i = dnnl_BAcde16a16b, + dnnl_OIdhw16o16i2o = dnnl_ABcde16a16b2a, + dnnl_OIdhw8i32o = dnnl_ABcde8b32a, + dnnl_OdhwI8i32o = dnnl_AcdeB8b32a, + dnnl_OIdhw8i24o = dnnl_ABcde8b24a, + dnnl_OdhwI8i24o = dnnl_AcdeB8b24a, + dnnl_OIdhw8i16o = dnnl_ABcde8b16a, + dnnl_OdhwI8i16o = dnnl_AcdeB8b16a, + + // weights w/ groups, 3D + dnnl_Goiw16g = dnnl_Abcd16a, + dnnl_Goiw8g = dnnl_Abcd8a, + dnnl_Goiw4g = dnnl_Abcd4a, + dnnl_gIOw8o8i = dnnl_aCBd8b8c, + dnnl_gIOw16o16i = dnnl_aCBd16b16c, + dnnl_gIOw16i16o = dnnl_aCBd16c16b, + dnnl_gOIw16i16o = dnnl_aBCd16c16b, + dnnl_gOIw16o16i = dnnl_aBCd16b16c, + dnnl_gOiw16o = dnnl_aBcd16b, + dnnl_gOIw4i16o4i = dnnl_aBCd4c16b4c, + dnnl_gOIw2i8o4i = dnnl_aBCd2c8b4c, + dnnl_gOIw16i16o4i = dnnl_aBCd16c16b4c, + dnnl_gOIw16i16o2i = dnnl_aBCd16c16b2c, + dnnl_gOIw16o16i2o = dnnl_aBCd16b16c2b, + dnnl_gOIw4i4o = dnnl_aBCd4c4b, + dnnl_gOIw4o4i = dnnl_aBCd4b4c, + dnnl_gOiw4o = dnnl_aBcd4b, + dnnl_gOIw8i16o2i = dnnl_aBCd8c16b2c, + dnnl_gOIw8i8o = dnnl_aBCd8c8b, + dnnl_gOIw8o16i2o = dnnl_aBCd8b16c2b, + dnnl_gIOw8o16i2o = dnnl_aCBd8b16c2b, + dnnl_gOIw8o8i = dnnl_aBCd8b8c, + dnnl_gOIw8o4i = dnnl_aBCd8b4c, + dnnl_gOwi16o = dnnl_aBdc16b, + dnnl_gOwI16o2i = dnnl_aBdC16b2c, + dnnl_gOwI16o4i = dnnl_aBdC16b4c, + dnnl_gIwo8i = dnnl_aCdb8c, + dnnl_gIwO8i2o = dnnl_aCdB8c2b, + dnnl_gIwO8i4o = dnnl_aCdB8c4b, + dnnl_gIwo16i = dnnl_aCdb16c, + dnnl_gIwO16i2o = dnnl_aCdB16c2b, + dnnl_gIwO16i4o = dnnl_aCdB16c4b, 
+ dnnl_gIwo24i = dnnl_aCdb24c, + dnnl_gIwO24i2o = dnnl_aCdB24c2b, + dnnl_gIwO24i4o = dnnl_aCdB24c4b, + dnnl_gOwi4o = dnnl_aBdc4b, + dnnl_gOwi8o = dnnl_aBdc8b, + dnnl_gOwI8o2i = dnnl_aBdC8b2c, + dnnl_gOwI8o4i = dnnl_aBdC8b4c, + dnnl_Goiw32g = dnnl_Abcd32a, + dnnl_gOIw2i4o2i = dnnl_aBCd2c4b2c, + dnnl_gOIw2o4i2o = dnnl_aBCd2b4c2b, + dnnl_gOIw4i8o2i = dnnl_aBCd4c8b2c, + dnnl_gOIw4o8i2o = dnnl_aBCd4b8c2b, + dnnl_goIw4i = dnnl_abCd4c, + dnnl_goIw32i = dnnl_abCd32c, + + // weights w/ groups, 4D + dnnl_gIOhw16i16o = dnnl_aCBde16c16b, + dnnl_gIOhw8o8i = dnnl_aCBde8b8c, + dnnl_gIOhw16o16i = dnnl_aCBde16b16c, + dnnl_gOhwi16o = dnnl_aBdec16b, + dnnl_gOhwI16o2i = dnnl_aBdeC16b2c, + dnnl_gOhwI16o4i = dnnl_aBdeC16b4c, + dnnl_gIhwo8i = dnnl_aCdeb8c, + dnnl_gIhwO8i2o = dnnl_aCdeB8c2b, + dnnl_gIhwO8i4o = dnnl_aCdeB8c4b, + dnnl_gIhwo16i = dnnl_aCdeb16c, + dnnl_gIhwO16i2o = dnnl_aCdeB16c2b, + dnnl_gIhwO16i4o = dnnl_aCdeB16c4b, + dnnl_gIhwo24i = dnnl_aCdeb24c, + dnnl_gIhwO24i2o = dnnl_aCdeB24c2b, + dnnl_gIhwO24i4o = dnnl_aCdeB24c4b, + dnnl_gOhwi32o = dnnl_aBdec32b, + dnnl_gOhwi24o = dnnl_aBdec24b, + dnnl_gOhwI24o2i = dnnl_aBdeC24b2c, + dnnl_gOhwI24o4i = dnnl_aBdeC24b4c, + dnnl_gOhwi4o = dnnl_aBdec4b, + dnnl_gOhwi8o = dnnl_aBdec8b, + dnnl_gOhwI8o2i = dnnl_aBdeC8b2c, + dnnl_gOhwI8o4i = dnnl_aBdeC8b4c, + dnnl_Goihw16g = dnnl_Abcde16a, + dnnl_gOIhw16i16o = dnnl_aBCde16c16b, + dnnl_gOIhw16o16i = dnnl_aBCde16b16c, + dnnl_gOihw16o = dnnl_aBcde16b, + dnnl_gOIhw2i8o4i = dnnl_aBCde2c8b4c, + dnnl_gOIhw4i16o4i = dnnl_aBCde4c16b4c, + dnnl_gOIhw16i16o4i = dnnl_aBCde16c16b4c, + dnnl_gOIhw16i16o2i = dnnl_aBCde16c16b2c, + dnnl_gOIhw16o16i2o = dnnl_aBCde16b16c2b, + dnnl_gOIhw4i4o = dnnl_aBCde4c4b, + dnnl_gOIhw4o4i = dnnl_aBCde4b4c, + dnnl_gOihw4o = dnnl_aBcde4b, + dnnl_Goihw8g = dnnl_Abcde8a, + dnnl_Goihw4g = dnnl_Abcde4a, + dnnl_gOIhw8i16o2i = dnnl_aBCde8c16b2c, + dnnl_gOIhw8i8o = dnnl_aBCde8c8b, + dnnl_gOIhw8o16i2o = dnnl_aBCde8b16c2b, + dnnl_gIOhw8o16i2o = dnnl_aCBde8b16c2b, + dnnl_gOIhw8o8i = 
dnnl_aBCde8b8c, + dnnl_gOIhw8o4i = dnnl_aBCde8b4c, + dnnl_Goihw32g = dnnl_Abcde32a, + dnnl_gOwhi16o = dnnl_aBedc16b, + dnnl_goIhw4i = dnnl_abCde4c, + dnnl_goIhw32i = dnnl_abCde32c, + + dnnl_OIw4o8i8o4i = dnnl_ABc4a8b8a4b, + dnnl_OIhw4o8i8o4i = dnnl_ABcd4a8b8a4b, + dnnl_IOw4i8o8i4o = dnnl_BAc4b8a8b4a, + dnnl_IOhw4i8o8i4o = dnnl_BAcd4b8a8b4a, + dnnl_IOdhw4i8o8i4o = dnnl_BAcde4b8a8b4a, + + dnnl_OIhw2o8i8o2i = dnnl_ABcd2a8b8a2b, + dnnl_gOIw4o8i8o4i = dnnl_aBCd4b8c8b4c, + dnnl_gOIhw4o8i8o4i = dnnl_aBCde4b8c8b4c, + dnnl_gOIdhw4o8i8o4i = dnnl_aBCdef4b8c8b4c, + dnnl_gIOw4i8o8i4o = dnnl_aCBd4c8b8c4b, + dnnl_gIOhw4i8o8i4o = dnnl_aCBde4c8b8c4b, + dnnl_gIOdhw4i8o8i4o = dnnl_aCBdef4c8b8c4b, + dnnl_gOIhw2o8i8o2i = dnnl_aBCde2b8c8b2c, + dnnl_gOIhw2i4o2i = dnnl_aBCde2c4b2c, + dnnl_gOIhw2o4i2o = dnnl_aBCde2b4c2b, + dnnl_gOIhw4i8o2i = dnnl_aBCde4c8b2c, + dnnl_gOIhw4o8i2o = dnnl_aBCde4b8c2b, + + // weights w/ groups, 6D + dnnl_gIOdhw16i16o = dnnl_aCBdef16c16b, + dnnl_gIOdhw8o8i = dnnl_aCBdef8b8c, + dnnl_gIOdhw16o16i = dnnl_aCBdef16b16c, + dnnl_gOdhwi16o = dnnl_aBdefc16b, + dnnl_gOdhwI16o2i = dnnl_aBdefC16b2c, + dnnl_gOdhwI16o4i = dnnl_aBdefC16b4c, + dnnl_gIdhwo8i = dnnl_aCdefb8c, + dnnl_gIdhwO8i2o = dnnl_aCdefB8c2b, + dnnl_gIdhwO8i4o = dnnl_aCdefB8c4b, + dnnl_gIdhwo16i = dnnl_aCdefb16c, + dnnl_gIdhwO16i2o = dnnl_aCdefB16c2b, + dnnl_gIdhwO16i4o = dnnl_aCdefB16c4b, + dnnl_gIdhwo24i = dnnl_aCdefb24c, + dnnl_gIdhwO24i2o = dnnl_aCdefB24c2b, + dnnl_gIdhwO24i4o = dnnl_aCdefB24c4b, + dnnl_gOdhwi4o = dnnl_aBdefc4b, + dnnl_gOdhwi8o = dnnl_aBdefc8b, + dnnl_gOdhwI8o2i = dnnl_aBdefC8b2c, + dnnl_gOdhwI8o4i = dnnl_aBdefC8b4c, + dnnl_gOdwhi16o = dnnl_aBdfec16b, + dnnl_gOIdhw16i16o = dnnl_aBCdef16c16b, + dnnl_gOIdhw4i16o4i = dnnl_aBCdef4c16b4c, + dnnl_gOIdhw16i16o4i = dnnl_aBCdef16c16b4c, + dnnl_gOIdhw2i8o4i = dnnl_aBCdef2c8b4c, + dnnl_gOIdhw16i16o2i = dnnl_aBCdef16c16b2c, + dnnl_gOIdhw16o16i = dnnl_aBCdef16b16c, + dnnl_gOIdhw16o16i2o = dnnl_aBCdef16b16c2b, + dnnl_gOidhw16o = dnnl_aBcdef16b, + 
dnnl_gOIdhw4i4o = dnnl_aBCdef4c4b, + dnnl_gOIdhw4o4i = dnnl_aBCdef4b4c, + dnnl_gOidhw4o = dnnl_aBcdef4b, + dnnl_gOIdhw8i16o2i = dnnl_aBCdef8c16b2c, + dnnl_gOIdhw8i8o = dnnl_aBCdef8c8b, + dnnl_gOIdhw8o16i2o = dnnl_aBCdef8b16c2b, + dnnl_gIOdhw8o16i2o = dnnl_aCBdef8b16c2b, + dnnl_gOIdhw8o8i = dnnl_aBCdef8b8c, + dnnl_gOIdhw8o4i = dnnl_aBCdef8b4c, + dnnl_Goidhw16g = dnnl_Abcdef16a, + dnnl_Goidhw32g = dnnl_Abcdef32a, + dnnl_gOIdhw2i4o2i = dnnl_aBCdef2c4b2c, + dnnl_gOIdhw4i8o2i = dnnl_aBCdef4c8b2c, + dnnl_gOIdhw2o4i2o = dnnl_aBCdef2b4c2b, + dnnl_gOIdhw4o8i2o = dnnl_aBCdef4b8c2b, + dnnl_goIdhw4i = dnnl_abCdef4c, + dnnl_goIdhw32i = dnnl_abCdef32c, + + // weights, 3D + dnnl_Owi24o = dnnl_Acb24a, + dnnl_OwI24o2i = dnnl_AcB24a2b, + dnnl_OwI24o4i = dnnl_AcB24a4b, + dnnl_Owi32o = dnnl_Acb32a, + dnnl_OwI32o2i = dnnl_AcB32a2b, + dnnl_OwI32o4i = dnnl_AcB32a4b, + dnnl_Owi48o = dnnl_Acb48a, + dnnl_OwI48o2i = dnnl_AcB48a2b, + dnnl_OwI48o4i = dnnl_AcB48a4b, + dnnl_Owi64o = dnnl_Acb64a, + dnnl_OwI64o2i = dnnl_AcB64a2b, + dnnl_OwI64o4i = dnnl_AcB64a4b, + dnnl_Iwo32i = dnnl_Bca32b, + dnnl_IwO32i2o = dnnl_BcA32b2a, + dnnl_IwO32i4o = dnnl_BcA32b4a, + dnnl_Iwo48i = dnnl_Bca48b, + dnnl_IwO48i2o = dnnl_BcA48b2a, + dnnl_IwO48i4o = dnnl_BcA48b4a, + dnnl_Iwo64i = dnnl_Bca64b, + dnnl_IwO64i2o = dnnl_BcA64b2a, + dnnl_IwO64i4o = dnnl_BcA64b4a, + dnnl_wIo2i = dnnl_cBa2b, + dnnl_wIo4i = dnnl_cBa4b, + dnnl_gOwi24o = dnnl_aBdc24b, + dnnl_gOwI24o2i = dnnl_aBdC24b2c, + dnnl_gOwI24o4i = dnnl_aBdC24b4c, + dnnl_gOwi32o = dnnl_aBdc32b, + dnnl_gOwI32o2i = dnnl_aBdC32b2c, + dnnl_gOwI32o4i = dnnl_aBdC32b4c, + dnnl_gOwi48o = dnnl_aBdc48b, + dnnl_gOwI48o2i = dnnl_aBdC48b2c, + dnnl_gOwI48o4i = dnnl_aBdC48b4c, + dnnl_gOwi64o = dnnl_aBdc64b, + dnnl_gOwI64o2i = dnnl_aBdC64b2c, + dnnl_gOwI64o4i = dnnl_aBdC64b4c, + dnnl_gIwo32i = dnnl_aCdb32c, + dnnl_gIwO32i2o = dnnl_aCdB32c2b, + dnnl_gIwO32i4o = dnnl_aCdB32c4b, + dnnl_gIwo48i = dnnl_aCdb48c, + dnnl_gIwO48i2o = dnnl_aCdB48c2b, + dnnl_gIwO48i4o = dnnl_aCdB48c4b, + 
dnnl_gIwo64i = dnnl_aCdb64c, + dnnl_gIwO64i2o = dnnl_aCdB64c2b, + dnnl_gIwO64i4o = dnnl_aCdB64c4b, + dnnl_gwio = dnnl_adcb, + dnnl_gwIo2i = dnnl_adCb2c, + dnnl_gwIo4i = dnnl_adCb4c, + // weights, 4D + dnnl_OhwI24o = dnnl_Acdb24a, + dnnl_OhwI24o2i = dnnl_AcdB24a2b, + dnnl_OhwI24o4i = dnnl_AcdB24a4b, + dnnl_OhwI32o = dnnl_Acdb32a, + dnnl_OhwI32o2i = dnnl_AcdB32a2b, + dnnl_OhwI32o4i = dnnl_AcdB32a4b, + dnnl_Ohwi48o = dnnl_Acdb48a, + dnnl_OhwI48o2i = dnnl_AcdB48a2b, + dnnl_OhwI48o4i = dnnl_AcdB48a4b, + dnnl_Ohwi64o = dnnl_Acdb64a, + dnnl_OhwI64o2i = dnnl_AcdB64a2b, + dnnl_OhwI64o4i = dnnl_AcdB64a4b, + dnnl_Ihwo32i = dnnl_Bcda32b, + dnnl_IhwO32i2o = dnnl_BcdA32b2a, + dnnl_IhwO32i4o = dnnl_BcdA32b4a, + dnnl_Ihwo48i = dnnl_Bcda48b, + dnnl_IhwO48i2o = dnnl_BcdA48b2a, + dnnl_IhwO48i4o = dnnl_BcdA48b4a, + dnnl_Ihwo64i = dnnl_Bcda64b, + dnnl_IhwO64i2o = dnnl_BcdA64b2a, + dnnl_IhwO64i4o = dnnl_BcdA64b4a, + dnnl_hwIo2i = dnnl_cdBa2b, + dnnl_hwIo4i = dnnl_cdBa4b, + dnnl_gOhwI24o = dnnl_aBdec24b, + dnnl_gOhwI32o = dnnl_aBdec32b, + dnnl_gOhwI32o2i = dnnl_aBdeC32b2c, + dnnl_gOhwI32o4i = dnnl_aBdeC32b4c, + dnnl_gOhwi48o = dnnl_aBdec48b, + dnnl_gOhwI48o2i = dnnl_aBdeC48b2c, + dnnl_gOhwI48o4i = dnnl_aBdeC48b4c, + dnnl_gOhwi64o = dnnl_aBdec64b, + dnnl_gOhwI64o2i = dnnl_aBdeC64b2c, + dnnl_gOhwI64o4i = dnnl_aBdeC64b4c, + dnnl_gIhwo32i = dnnl_aCdeb32c, + dnnl_gIhwO32i2o = dnnl_aCdeB32c2b, + dnnl_gIhwO32i4o = dnnl_aCdeB32c4b, + dnnl_gIhwo48i = dnnl_aCdeb48c, + dnnl_gIhwO48i2o = dnnl_aCdeB48c2b, + dnnl_gIhwO48i4o = dnnl_aCdeB48c4b, + dnnl_gIhwo64i = dnnl_aCdeb64c, + dnnl_gIhwO64i2o = dnnl_aCdeB64c2b, + dnnl_gIhwO64i4o = dnnl_aCdeB64c4b, + dnnl_ghwio = dnnl_adecb, + dnnl_ghwIo2i = dnnl_adeCb2c, + dnnl_ghwIo4i = dnnl_adeCb4c, + // weights, 5D + dnnl_Odhwi24o = dnnl_Acdeb24a, + dnnl_OdhwI24o2i = dnnl_AcdeB24a2b, + dnnl_OdhwI24o4i = dnnl_AcdeB24a4b, + dnnl_Odhwi32o = dnnl_Acdeb32a, + dnnl_OdhwI32o2i = dnnl_AcdeB32a2b, + dnnl_OdhwI32o4i = dnnl_AcdeB32a4b, + dnnl_Odhwi48o = dnnl_Acdeb48a, + 
dnnl_OdhwI48o2i = dnnl_AcdeB48a2b, + dnnl_OdhwI48o4i = dnnl_AcdeB48a4b, + dnnl_Odhwi64o = dnnl_Acdeb64a, + dnnl_OdhwI64o2i = dnnl_AcdeB64a2b, + dnnl_OdhwI64o4i = dnnl_AcdeB64a4b, + dnnl_Idhwo32i = dnnl_Bcdea32b, + dnnl_IdhwO32i2o = dnnl_BcdeA32b2a, + dnnl_IdhwO32i4o = dnnl_BcdeA32b4a, + dnnl_Idhwo48i = dnnl_Bcdea48b, + dnnl_IdhwO48i2o = dnnl_BcdeA48b2a, + dnnl_IdhwO48i4o = dnnl_BcdeA48b4a, + dnnl_Idhwo64i = dnnl_Bcdea64b, + dnnl_IdhwO64i2o = dnnl_BcdeA64b2a, + dnnl_IdhwO64i4o = dnnl_BcdeA64b4a, + dnnl_dhwIo2i = dnnl_cdeBa2b, + dnnl_dhwIo4i = dnnl_cdeBa4b, + dnnl_gOdhwi24o = dnnl_aBdefc24b, + dnnl_gOdhwI24o2i = dnnl_aBdefC24b2c, + dnnl_gOdhwI24o4i = dnnl_aBdefC24b4c, + dnnl_gOdhwi32o = dnnl_aBdefc32b, + dnnl_gOdhwI32o2i = dnnl_aBdefC32b2c, + dnnl_gOdhwI32o4i = dnnl_aBdefC32b4c, + dnnl_gOdhwi48o = dnnl_aBdefc48b, + dnnl_gOdhwI48o2i = dnnl_aBdefC48b2c, + dnnl_gOdhwI48o4i = dnnl_aBdefC48b4c, + dnnl_gOdhwi64o = dnnl_aBdefc64b, + dnnl_gOdhwI64o2i = dnnl_aBdefC64b2c, + dnnl_gOdhwI64o4i = dnnl_aBdefC64b4c, + dnnl_gIdhwo32i = dnnl_aCdefb32c, + dnnl_gIdhwO32i2o = dnnl_aCdefB32c2b, + dnnl_gIdhwO32i4o = dnnl_aCdefB32c4b, + dnnl_gIdhwo48i = dnnl_aCdefb48c, + dnnl_gIdhwO48i2o = dnnl_aCdefB48c2b, + dnnl_gIdhwO48i4o = dnnl_aCdefB48c4b, + dnnl_gIdhwo64i = dnnl_aCdefb64c, + dnnl_gIdhwO64i2o = dnnl_aCdefB64c2b, + dnnl_gIdhwO64i4o = dnnl_aCdefB64c4b, + dnnl_gdhwio = dnnl_adefcb, + dnnl_gdhwIo2i = dnnl_adefCb2c, + dnnl_gdhwIo4i = dnnl_adefCb4c, + dnnl_OI16i32o4i = dnnl_AB16b32a4b, + dnnl_OI16i48o4i = dnnl_AB16b48a4b, + dnnl_OI16i64o4i = dnnl_AB16b64a4b, + dnnl_OI16i16o2i = dnnl_AB16b16a2b, + dnnl_OI16i32o2i = dnnl_AB16b32a2b, + dnnl_OI16i48o2i = dnnl_AB16b48a2b, + dnnl_OI16i64o2i = dnnl_AB16b64a2b, + dnnl_OIw16i32o4i = dnnl_ABc16b32a4b, + dnnl_OIw16i48o4i = dnnl_ABc16b48a4b, + dnnl_OIw16i64o4i = dnnl_ABc16b64a4b, + dnnl_OIw16i32o2i = dnnl_ABc16b32a2b, + dnnl_OIw16i48o2i = dnnl_ABc16b48a2b, + dnnl_OIw16i64o2i = dnnl_ABc16b64a2b, + dnnl_OIhw16i32o4i = dnnl_ABcd16b32a4b, + 
dnnl_OIhw16i48o4i = dnnl_ABcd16b48a4b, + dnnl_OIhw16i64o4i = dnnl_ABcd16b64a4b, + dnnl_OIhw16i32o2i = dnnl_ABcd16b32a2b, + dnnl_OIhw16i48o2i = dnnl_ABcd16b48a2b, + dnnl_OIhw16i64o2i = dnnl_ABcd16b64a2b, + dnnl_OIdhw16i32o4i = dnnl_ABcde16b32a4b, + dnnl_OIdhw16i48o4i = dnnl_ABcde16b48a4b, + dnnl_OIdhw16i64o4i = dnnl_ABcde16b64a4b, + dnnl_OIdhw16i32o2i = dnnl_ABcde16b32a2b, + dnnl_OIdhw16i48o2i = dnnl_ABcde16b48a2b, + dnnl_OIdhw16i64o2i = dnnl_ABcde16b64a2b, + dnnl_OwI16i16o2i = dnnl_AcB16b16a2b, + dnnl_OwI16i16o4i = dnnl_AcB16b16a4b, + dnnl_OhwI16i16o2i = dnnl_AcdB16b16a2b, + dnnl_OhwI16i16o4i = dnnl_AcdB16b16a4b, + dnnl_OdhwI16i16o2i = dnnl_AcdeB16b16a2b, + dnnl_OdhwI16i16o4i = dnnl_AcdeB16b16a4b, + dnnl_IwO16o16i2o = dnnl_BcA16a16b2a, + dnnl_IwO16o16i4o = dnnl_BcA16a16b4a, + dnnl_IhwO16o16i2o = dnnl_BcdA16a16b2a, + dnnl_IhwO16o16i4o = dnnl_BcdA16a16b4a, + dnnl_IdhwO16o16i2o = dnnl_BcdeA16a16b2a, + dnnl_IdhwO16o16i4o = dnnl_BcdeA16a16b4a, + dnnl_gOwI16i16o2i = dnnl_aBdC16c16b2c, + dnnl_gOwI16i16o4i = dnnl_aBdC16c16b4c, + dnnl_gOhwI16i16o2i = dnnl_aBdeC16c16b2c, + dnnl_gOhwI16i16o4i = dnnl_aBdeC16c16b4c, + dnnl_gOdhwI16i16o2i = dnnl_aBdefC16c16b2c, + dnnl_gOdhwI16i16o4i = dnnl_aBdefC16c16b4c, + dnnl_gIwO16o16i2o = dnnl_aCdB16b16c2b, + dnnl_gIwO16o16i4o = dnnl_aCdB16b16c4b, + dnnl_gIhwO16o16i2o = dnnl_aCdeB16b16c2b, + dnnl_gIhwO16o16i4o = dnnl_aCdeB16b16c4b, + dnnl_gIdhwO16o16i2o = dnnl_aCdefB16b16c2b, + dnnl_gIdhwO16o16i4o = dnnl_aCdefB16b16c4b, + dnnl_OwI16i32o2i = dnnl_AcB16b32a2b, + dnnl_OwI16i32o4i = dnnl_AcB16b32a4b, + dnnl_OwI16i48o2i = dnnl_AcB16b48a2b, + dnnl_OwI16i48o4i = dnnl_AcB16b48a4b, + dnnl_OwI16i64o2i = dnnl_AcB16b64a2b, + dnnl_OwI16i64o4i = dnnl_AcB16b64a4b, + dnnl_IwO16o32i2o = dnnl_BcA16a32b2a, + dnnl_IwO16o32i4o = dnnl_BcA16a32b4a, + dnnl_IwO16o48i2o = dnnl_BcA16a48b2a, + dnnl_IwO16o48i4o = dnnl_BcA16a48b4a, + dnnl_IwO16o64i2o = dnnl_BcA16a64b2a, + dnnl_IwO16o64i4o = dnnl_BcA16a64b4a, + dnnl_gOwI16i32o2i = dnnl_aBdC16c32b2c, + dnnl_gOwI16i32o4i = 
dnnl_aBdC16c32b4c, + dnnl_gOwI16i48o2i = dnnl_aBdC16c48b2c, + dnnl_gOwI16i48o4i = dnnl_aBdC16c48b4c, + dnnl_gOwI16i64o2i = dnnl_aBdC16c64b2c, + dnnl_gOwI16i64o4i = dnnl_aBdC16c64b4c, + dnnl_gIwO16o32i2o = dnnl_aCdB16b32c2b, + dnnl_gIwO16o32i4o = dnnl_aCdB16b32c4b, + dnnl_gIwO16o48i2o = dnnl_aCdB16b48c2b, + dnnl_gIwO16o48i4o = dnnl_aCdB16b48c4b, + dnnl_gIwO16o64i2o = dnnl_aCdB16b64c2b, + dnnl_gIwO16o64i4o = dnnl_aCdB16b64c4b, + dnnl_OhwI16i32o2i = dnnl_AcdB16b32a2b, + dnnl_OhwI16i32o4i = dnnl_AcdB16b32a4b, + dnnl_OhwI16i48o2i = dnnl_AcdB16b48a2b, + dnnl_OhwI16i48o4i = dnnl_AcdB16b48a4b, + dnnl_OhwI16i64o2i = dnnl_AcdB16b64a2b, + dnnl_OhwI16i64o4i = dnnl_AcdB16b64a4b, + dnnl_IhwO16o32i2o = dnnl_BcdA16a32b2a, + dnnl_IhwO16o32i4o = dnnl_BcdA16a32b4a, + dnnl_IhwO16o48i2o = dnnl_BcdA16a48b2a, + dnnl_IhwO16o48i4o = dnnl_BcdA16a48b4a, + dnnl_IhwO16o64i2o = dnnl_BcdA16a64b2a, + dnnl_IhwO16o64i4o = dnnl_BcdA16a64b4a, + dnnl_gOhwI16i32o2i = dnnl_aBdeC16c32b2c, + dnnl_gOhwI16i32o4i = dnnl_aBdeC16c32b4c, + dnnl_gOhwI16i48o2i = dnnl_aBdeC16c48b2c, + dnnl_gOhwI16i48o4i = dnnl_aBdeC16c48b4c, + dnnl_gOhwI16i64o2i = dnnl_aBdeC16c64b2c, + dnnl_gOhwI16i64o4i = dnnl_aBdeC16c64b4c, + dnnl_gIhwO16o32i2o = dnnl_aCdeB16b32c2b, + dnnl_gIhwO16o32i4o = dnnl_aCdeB16b32c4b, + dnnl_gIhwO16o48i2o = dnnl_aCdeB16b48c2b, + dnnl_gIhwO16o48i4o = dnnl_aCdeB16b48c4b, + dnnl_gIhwO16o64i2o = dnnl_aCdeB16b64c2b, + dnnl_gIhwO16o64i4o = dnnl_aCdeB16b64c4b, + dnnl_OdhwI16i32o2i = dnnl_AcdeB16b32a2b, + dnnl_OdhwI16i32o4i = dnnl_AcdeB16b32a4b, + dnnl_OdhwI16i48o2i = dnnl_AcdeB16b48a2b, + dnnl_OdhwI16i48o4i = dnnl_AcdeB16b48a4b, + dnnl_OdhwI16i64o2i = dnnl_AcdeB16b64a2b, + dnnl_OdhwI16i64o4i = dnnl_AcdeB16b64a4b, + dnnl_IdhwO16o32i2o = dnnl_BcdeA16a32b2a, + dnnl_IdhwO16o32i4o = dnnl_BcdeA16a32b4a, + dnnl_IdhwO16o48i2o = dnnl_BcdeA16a48b2a, + dnnl_IdhwO16o48i4o = dnnl_BcdeA16a48b4a, + dnnl_IdhwO16o64i2o = dnnl_BcdeA16a64b2a, + dnnl_IdhwO16o64i4o = dnnl_BcdeA16a64b4a, + dnnl_gOdhwI16i32o2i = dnnl_aBdefC16c32b2c, + 
dnnl_gOdhwI16i32o4i = dnnl_aBdefC16c32b4c, + dnnl_gOdhwI16i48o2i = dnnl_aBdefC16c48b2c, + dnnl_gOdhwI16i48o4i = dnnl_aBdefC16c48b4c, + dnnl_gOdhwI16i64o2i = dnnl_aBdefC16c64b2c, + dnnl_gOdhwI16i64o4i = dnnl_aBdefC16c64b4c, + dnnl_gIdhwO16o32i2o = dnnl_aCdefB16b32c2b, + dnnl_gIdhwO16o32i4o = dnnl_aCdefB16b32c4b, + dnnl_gIdhwO16o48i2o = dnnl_aCdefB16b48c2b, + dnnl_gIdhwO16o48i4o = dnnl_aCdefB16b48c4b, + dnnl_gIdhwO16o64i2o = dnnl_aCdefB16b64c2b, + dnnl_gIdhwO16o64i4o = dnnl_aCdefB16b64c4b, + dnnl_hwioG16g = dnnl_decbA16a, + dnnl_hwioG8g = dnnl_decbA8a, + dnnl_dhwioG16g = dnnl_defcbA16a, + dnnl_dhwioG8g = dnnl_defcbA8a, + dnnl_NCdhw40n16c = dnnl_ABcde40a16b, + dnnl_NCw40n16c = dnnl_ABc40a16b, + dnnl_NChw40n16c = dnnl_ABcd40a16b, + dnnl_NCw40n32c = dnnl_ABc40a32b, + dnnl_NChw40n32c = dnnl_ABcd40a32b, + dnnl_NCdhw40n32c = dnnl_ABcde40a32b, + dnnl_OIdhw4o8i8o2i = dnnl_ABcde4a8b8a2b, + dnnl_OIhw4o8i8o2i = dnnl_ABcd4a8b8a2b, + dnnl_OIw4o8i8o2i = dnnl_ABc4a8b8a2b, + dnnl_gOIdhw4o8i8o2i = dnnl_aBCdef4b8c8b2c, + dnnl_gOIhw4o8i8o2i = dnnl_aBCde4b8c8b2c, + dnnl_gOIw4o8i8o2i = dnnl_aBCd4b8c8b2c, + dnnl_IOdhw4i8o8i2o = dnnl_BAcde4b8a8b2a, + dnnl_IOhw4i8o8i2o = dnnl_BAcd4b8a8b2a, + dnnl_IOw4i8o8i2o = dnnl_BAc4b8a8b2a, + dnnl_gIOdhw4i8o8i2o = dnnl_aCBdef4c8b8c2b, + dnnl_gIOhw4i8o8i2o = dnnl_aCBde4c8b8c2b, + dnnl_gIOw4i8o8i2o = dnnl_aCBd4c8b8c2b, + dnnl_NCw2c32n8c = dnnl_ABc2b32a8b, + dnnl_NChw2c32n8c = dnnl_ABcd2b32a8b, + dnnl_NCdhw2c32n8c = dnnl_ABcde2b32a8b, + dnnl_OIw2i8o16i4o = dnnl_ABc2b8a16b4a, + dnnl_OIhw2i8o16i4o = dnnl_ABcd2b8a16b4a, + dnnl_OIdhw2i8o16i4o = dnnl_ABcde2b8a16b4a, + dnnl_OIw2o8i16o4i = dnnl_ABc2a8b16a4b, + dnnl_OIw2o8i16o2i = dnnl_ABc2a8b16a2b, + dnnl_IOw2i8o16i4o = dnnl_BAc2b8a16b4a, + dnnl_IOw2i8o16i2o = dnnl_BAc2b8a16b2a, + dnnl_OIhw2o8i16o4i = dnnl_ABcd2a8b16a4b, + dnnl_OIhw2o8i16o2i = dnnl_ABcd2a8b16a2b, + dnnl_IOhw2i8o16i4o = dnnl_BAcd2b8a16b4a, + dnnl_IOhw2i8o16i2o = dnnl_BAcd2b8a16b2a, + dnnl_OIdhw2o8i16o4i = dnnl_ABcde2a8b16a4b, + dnnl_OIdhw2o8i16o2i 
= dnnl_ABcde2a8b16a2b, + dnnl_IOdhw2i8o16i4o = dnnl_BAcde2b8a16b4a, + dnnl_IOdhw2i8o16i2o = dnnl_BAcde2b8a16b2a, + dnnl_gOIw2o8i16o2i = dnnl_aBCd2b8c16b2c, + dnnl_gIOw2i8o16i2o = dnnl_aCBd2c8b16c2b, + dnnl_gIOhw2i8o16i2o = dnnl_aBCde2c8b16c2b, + dnnl_gIOdhw2i8o16i2o = dnnl_aBCdef2c8b16c2b, + dnnl_gOIhw2o8i16o2i = dnnl_aBCde2b8c16b2c, + dnnl_gOIdhw2o8i16o2i = dnnl_aBCdef2b8c16b2c, + dnnl_gOIw2o8i16o4i = dnnl_aBCd2b8c16b4c, + dnnl_gOIhw2o8i16o4i = dnnl_aBCde2b8c16b4c, +} dnnl_format_tag_t; + +/// @} dnnl_api_memory + +/// @addtogroup dnnl_api_primitives +/// @{ +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Kinds of propagation. +typedef enum { + // TODO: suggest renames + /// Undefined propagation type. + dnnl_prop_kind_undef = 0, + /// Forward data propagation (training mode). In this mode primitives + /// perform computations necessary for subsequent backward propagation. + dnnl_forward_training = 64, + /// Forward data propagation (inference mode). In this mode primitives + /// perform only computations that are necessary for inference and omit + /// computations that are necessary only for backward propagation. + dnnl_forward_inference = 96, + /// Forward data propagation (alias for @c dnnl_forward_training). + dnnl_forward = dnnl_forward_training, + /// Backward propagation (with respect to all parameters). + dnnl_backward = 128, + /// Backward data propagation. + dnnl_backward_data = 160, + /// Backward weights propagation. + dnnl_backward_weights = 192, + /// Backward bias propagation. + dnnl_backward_bias = 193, +} dnnl_prop_kind_t; + +/// Kinds of primitives. Used to implement a way to extend the library with new +/// primitives without changing the ABI. +typedef enum { + /// Undefined primitive. + dnnl_undefined_primitive, + /// A reorder primitive. + dnnl_reorder, + /// A shuffle primitive. + dnnl_shuffle, + /// An (out-of-place) concat primitive. + dnnl_concat, + /// A sum primitive. + dnnl_sum, + /// A convolution primitive. 
+ dnnl_convolution, + /// A deconvolution primitive. + dnnl_deconvolution, + /// An element-wise primitive. + dnnl_eltwise, + /// An LRN primitive. + dnnl_lrn, + /// A batch normalization primitive. + dnnl_batch_normalization, + /// An inner product primitive. + dnnl_inner_product, + /// An RNN primitive. + dnnl_rnn, + /// A matrix multiplication primitive (internal). + dnnl_gemm, + /// A binary primitive. + dnnl_binary, + /// A matrix multiplication primitive. + dnnl_matmul, + /// A resampling primitive. + dnnl_resampling, + /// A pooling primitive. + dnnl_pooling, + /// A reduction primitive. + dnnl_reduction, + /// A PReLU primitive. + dnnl_prelu, + /// A softmax primitive. + dnnl_softmax, + /// A layer normalization primitive. + dnnl_layer_normalization, + /// A group normalization primitive. + dnnl_group_normalization, + + /// Parameter to allow internal-only primitives without undefined behavior. + /// This parameter is chosen to be valid for so long as sizeof(int) >= 2. + dnnl_primitive_kind_max = 0x7fff, +} dnnl_primitive_kind_t; + +/// Kinds of algorithms. 
+typedef enum { + dnnl_alg_kind_undef, + /// Direct convolution + dnnl_convolution_direct = 0x1, + /// Winograd convolution + dnnl_convolution_winograd = 0x2, + /// Convolution algorithm(either direct or Winograd) is chosen just in time + dnnl_convolution_auto = 0x3, + /// Direct deconvolution + dnnl_deconvolution_direct = 0xa, + /// Winograd deconvolution + dnnl_deconvolution_winograd = 0xb, + /// Eltwise: ReLU + dnnl_eltwise_relu = 0x20, + /// Eltwise: hyperbolic tangent non-linearity (tanh) + dnnl_eltwise_tanh, + /// Eltwise: exponential linear unit (elu) + dnnl_eltwise_elu, + /// Eltwise: square + dnnl_eltwise_square, + /// Eltwise: abs + dnnl_eltwise_abs, + /// Eltwise: square root + dnnl_eltwise_sqrt, + /// Eltwise: linear + dnnl_eltwise_linear, + /// Eltwise: soft_relu + dnnl_eltwise_soft_relu, + /// Eltwise: hardsigmoid + dnnl_eltwise_hardsigmoid, + /// Eltwise: logistic + dnnl_eltwise_logistic, + /// Eltwise: exponent + dnnl_eltwise_exp, + /// Eltwise: gelu + /// + /// @note Tanh approximation formula is used to approximate + /// the cumulative distribution function of a Gaussian here + dnnl_eltwise_gelu_tanh, + /// Eltwise: swish + dnnl_eltwise_swish, + /// Eltwise: natural logarithm + dnnl_eltwise_log, + /// Eltwise: clip + dnnl_eltwise_clip, + /// Eltwise: clip version 2 + dnnl_eltwise_clip_v2, + /// Eltwise: pow + dnnl_eltwise_pow, + /// Eltwise: erf-based gelu + dnnl_eltwise_gelu_erf, + /// Eltwise: round + dnnl_eltwise_round, + /// Eltwise: mish + dnnl_eltwise_mish, + /// Eltwise: hardswish + dnnl_eltwise_hardswish, + /// Eltwise: ReLU (dst for backward) + dnnl_eltwise_relu_use_dst_for_bwd = 0x100, + /// Eltwise: hyperbolic tangent non-linearity (tanh) (dst for backward) + dnnl_eltwise_tanh_use_dst_for_bwd, + /// Eltwise: exponential linear unit (elu) (dst for backward) + dnnl_eltwise_elu_use_dst_for_bwd, + /// Eltwise: square root (dst for backward) + dnnl_eltwise_sqrt_use_dst_for_bwd, + /// Eltwise: logistic (dst for backward) + 
dnnl_eltwise_logistic_use_dst_for_bwd, + /// Eltwise: exp (dst for backward) + dnnl_eltwise_exp_use_dst_for_bwd, + /// Eltwise: clip version 2 (dst for backward) + dnnl_eltwise_clip_v2_use_dst_for_bwd, + /// Max pooling + dnnl_pooling_max = 0x1ff, + /// Average pooling include padding + dnnl_pooling_avg_include_padding = 0x2ff, + /// Average pooling exclude padding + dnnl_pooling_avg_exclude_padding = 0x3ff, + /// Local response normalization (LRN) across multiple channels + dnnl_lrn_across_channels = 0xaff, + /// LRN within a single channel + dnnl_lrn_within_channel = 0xbff, + /// RNN cell + dnnl_vanilla_rnn = 0x1fff, + /// LSTM cell + dnnl_vanilla_lstm = 0x2fff, + /// GRU cell + dnnl_vanilla_gru = 0x3fff, + /// GRU cell with linear before reset + /// + /// Modification of original GRU cell. Differs from #dnnl_vanilla_gru + /// in how the new memory gate is calculated: + /// \f[ c_t = tanh(W_c*x_t + b_{c_x} + r_t*(U_c*h_{t-1}+b_{c_h})) \f] + /// Primitive expects 4 biases on input: + /// \f$[b_{u}, b_{r}, b_{c_x}, b_{c_h}]\f$ + dnnl_lbr_gru = 0x4fff, + /// AUGRU cell + dnnl_vanilla_augru = 0x5fff, + /// AUGRU cell with linear before reset + dnnl_lbr_augru = 0x6fff, + /// Binary add + dnnl_binary_add = 0x1fff0, + /// Binary mul + dnnl_binary_mul = 0x1fff1, + /// Binary max + dnnl_binary_max = 0x1fff2, + /// Binary min + dnnl_binary_min = 0x1fff3, + /// Binary div + dnnl_binary_div = 0x1fff4, + /// Binary sub + dnnl_binary_sub = 0x1fff5, + /// Binary greater or equal + dnnl_binary_ge = 0x1fff6, + /// Binary greater than + dnnl_binary_gt = 0x1fff7, + /// Binary less or equal + dnnl_binary_le = 0x1fff8, + /// Binary less than + dnnl_binary_lt = 0x1fff9, + /// Binary equal + dnnl_binary_eq = 0x1fffa, + /// Binary not equal + dnnl_binary_ne = 0x1fffb, + /// Binary select + dnnl_binary_select = 0x1fffc, + /// Nearest Neighbor Resampling Method + dnnl_resampling_nearest = 0x2fff0, + /// Linear Resampling Method + dnnl_resampling_linear = 0x2fff1, + /// Reduction using max 
+ dnnl_reduction_max, + /// Reduction using min + dnnl_reduction_min, + /// Reduction using sum + dnnl_reduction_sum, + /// Reduction using mul + dnnl_reduction_mul, + /// Reduction using mean + dnnl_reduction_mean, + /// Reduction using lp norm + dnnl_reduction_norm_lp_max, + /// Reduction using lp norm + dnnl_reduction_norm_lp_sum, + /// Reduction using lp norm without final pth-root + dnnl_reduction_norm_lp_power_p_max, + /// Reduction using lp norm without final pth-root + dnnl_reduction_norm_lp_power_p_sum, + /// Softmax + dnnl_softmax_accurate = 0x30000, + /// Logsoftmax + dnnl_softmax_log, +} dnnl_alg_kind_t; + +/// Flags for normalization primitives. +typedef enum { + /// Use no normalization flags + /// + /// If specified + /// - on forward training propagation mean and variance are computed and + /// stored as output + /// - on backward propagation compute full derivative wrt data + /// - on backward propagation prop_kind == #dnnl_backward_data has the same + /// behavior as prop_kind == #dnnl_backward + dnnl_normalization_flags_none = 0x0U, + + /// Use global statistics + /// + /// If specified + /// - on forward propagation use mean and variance provided by user (input) + /// - on backward propagation reduces the amount of computations, since + /// mean and variance are considered as constants + /// + /// If not specified: + /// - on forward propagation mean and variance are computed and stored as + /// output + /// - on backward propagation compute full derivative wrt data + dnnl_use_global_stats = 0x1U, + + /// Use scale parameter + /// + /// If specified: + /// - on forward propagation use scale for the normalization results + /// - on backward propagation (for prop_kind == #dnnl_backward) compute + /// diff wrt scale (hence one extra output used) + dnnl_use_scale = 0x2U, + + /// Use shift parameter + /// + /// If specified: + /// - on forward propagation use shift (aka bias) for the normalization + /// results + /// - on backward propagation (for 
prop_kind == #dnnl_backward) compute + /// diff wrt shift (hence one extra output used) + dnnl_use_shift = 0x4U, + + /// Fuse with ReLU + /// + /// The flag implies negative slope being 0. On training this is the only + /// configuration supported. For inference, to use non-zero negative slope + /// consider using @ref dev_guide_attributes_post_ops. + /// + /// If specified: + /// - on inference this option behaves the same as if the primitive were + /// fused with ReLU using post ops API with zero negative slope. + /// - on training primitive requires workspace (required to be able to + /// perform backward pass) + dnnl_fuse_norm_relu = 0x8U, + + /// Fuse with Add and then fuse with ReLU + /// + /// If specified: + /// + /// - on forward propagation apply element-wise binary Add operation + /// to the normalization results with an additional input tensor and then + /// apply ReLU with negative slope being 0. + /// - on training primitive requires workspace (required to be able to + /// perform backward pass). + /// - on backward propagation save the result of backward ReLU operation + /// with input tensor and workspace from forward pass to extra output + /// tensor and then perform backward normalization. + dnnl_fuse_norm_add_relu = 0x10U, + +} dnnl_normalization_flags_t; + +/// @} dnnl_api_primitives_common +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_memory +/// @{ + +/// A wildcard value for dimensions that are unknown at a primitive creation +/// time. +#define DNNL_RUNTIME_DIM_VAL INT64_MIN + +/// A `size_t` counterpart of the DNNL_RUNTIME_DIM_VAL. +/// For instance, this value is returned by dnnl_memory_desc_get_size() if +/// any of the dimensions or strides is equal to #DNNL_RUNTIME_DIM_VAL.
+#define DNNL_RUNTIME_SIZE_VAL ((size_t)DNNL_RUNTIME_DIM_VAL) + +/// @cond DO_NOT_DOCUMENT_THIS +/// Hex representation for a **special** quiet NAN (!= NAN from math.h) +static const union { + unsigned u; + float f; +} DNNL_RUNTIME_F32_VAL_REP = {0x7fc000d0}; +/// @endcond + +/// A wildcard value for floating point values that are unknown at a primitive +/// creation time. +#define DNNL_RUNTIME_F32_VAL (DNNL_RUNTIME_F32_VAL_REP.f) + +/// @cond DO_NOT_DOCUMENT_THIS +static const int DNNL_RUNTIME_S32_VAL_REP = INT32_MIN; +/// @endcond + +/// A wildcard value for int32_t values that are unknown at a primitive creation +/// time. +#define DNNL_RUNTIME_S32_VAL DNNL_RUNTIME_S32_VAL_REP + +/// @struct dnnl_memory_desc +/// An opaque structure to describe a memory descriptor. +struct dnnl_memory_desc; + +/// A memory descriptor handle. +typedef struct dnnl_memory_desc *dnnl_memory_desc_t; + +/// A memory descriptor handle. +typedef const struct dnnl_memory_desc *const_dnnl_memory_desc_t; + +/// @struct dnnl_memory +/// An opaque structure to describe a memory. +struct dnnl_memory; + +/// A memory handle. +typedef struct dnnl_memory *dnnl_memory_t; + +/// A constant memory handle. +typedef const struct dnnl_memory *const_dnnl_memory_t; + +/// @} dnnl_api_memory + +/// @addtogroup dnnl_api_primitives +/// @{ + +/// @addtogroup dnnl_api_rnn +/// @{ + +/// Flags for RNN cell. +typedef enum { + /// Undefined RNN flags + dnnl_rnn_flags_undef = 0x0, + /// Do not add weights gradient to existing diff_weights memory + dnnl_rnn_flags_diff_weights_overwrite = 0x1, +} dnnl_rnn_flags_t; + +/// A direction of RNN primitive execution. +typedef enum { + /// Undefined RNN direction. + dnnl_rnn_direction_undef = 0, + /// Unidirectional execution of RNN primitive from left to right. + dnnl_unidirectional_left2right, + /// Unidirectional execution of RNN primitive from right to left. 
+ dnnl_unidirectional_right2left, + /// Bidirectional execution of RNN primitive with concatenation of the + /// results. + dnnl_bidirectional_concat, + /// Bidirectional execution of RNN primitive with summation of the + /// results. + dnnl_bidirectional_sum, +} dnnl_rnn_direction_t; + +/// @} dnnl_api_rnn + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_primitives +/// @{ +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// @struct dnnl_primitive_desc +/// @brief An opaque structure to describe a primitive descriptor. +struct dnnl_primitive_desc; + +/// @brief A primitive descriptor handle. +typedef struct dnnl_primitive_desc *dnnl_primitive_desc_t; + +/// @brief A constant primitive descriptor handle. +typedef const struct dnnl_primitive_desc *const_dnnl_primitive_desc_t; + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_attributes +/// @{ + +/// Scratchpad mode +typedef enum { + /// The library manages the scratchpad allocation according to the policy + /// specified by the `DNNL_ENABLE_CONCURRENT_EXEC` + /// [build option](@ref dev_guide_build_options) (default). + /// + /// When `DNNL_ENABLE_CONCURRENT_EXEC=OFF` (default), the library + /// scratchpad is common to all primitives to reduce the memory footprint. + /// This configuration comes with limited thread-safety properties, namely + /// primitives can be created and executed in parallel but cannot migrate + /// between threads (in other words, each primitive should be executed in + /// the same thread it was created in). + /// + /// When `DNNL_ENABLE_CONCURRENT_EXEC=ON`, the library scratchpad is + /// private to each primitive. The memory footprint is larger than when + /// using `DNNL_ENABLE_CONCURRENT_EXEC=OFF` but different primitives can be + /// created and run concurrently (the same primitive cannot be run + /// concurrently from two different threads though). 
+ dnnl_scratchpad_mode_library, + /// The user manages the scratchpad allocation by querying and providing + /// the scratchpad memory to primitives. This mode is thread-safe as long + /// as the scratchpad buffers are not used concurrently by two primitive + /// executions. + dnnl_scratchpad_mode_user, +} dnnl_scratchpad_mode_t; + +/// Rounding mode +typedef enum { + /// rounding mode dictated by the floating-point environment + dnnl_rounding_mode_environment, + /// stochastic rounding mode where a random bias is added to the + /// trailing mantissa bits before conversion. + dnnl_rounding_mode_stochastic, +} dnnl_rounding_mode_t; + +/// @struct dnnl_primitive_attr +/// @brief An opaque structure for primitive descriptor attributes. +/// +/// Attributes may contain: +/// - output scales (to scale the result prior to storing it to the memory) +struct dnnl_primitive_attr; + +/// @brief A primitive descriptor attributes handle that controls primitive +/// behavior. +typedef struct dnnl_primitive_attr *dnnl_primitive_attr_t; + +/// @brief A constant primitive descriptor attributes handle. +typedef const struct dnnl_primitive_attr *const_dnnl_primitive_attr_t; + +/// @struct dnnl_post_ops +/// @brief An opaque structure for a chain of post operations. +/// +/// dnnl_post_ops can be used to perform some (trivial) operations like +/// accumulation or eltwise after certain primitives like convolution. +/// +/// Post operations might be combined together, making a chain of post +/// operations. For instance one can configure convolution followed by +/// accumulation followed by eltwise. This might be especially beneficial +/// for residual learning blocks. +/// +/// @warning +/// Of course not all combinations are supported, so the user should handle +/// errors accordingly. +/// +/// Supported post operations: +/// - accumulation (base primitive: convolution) +/// - eltwise (base primitive: convolution) +struct dnnl_post_ops; + +/// @brief A post operation chain handle. 
+typedef struct dnnl_post_ops *dnnl_post_ops_t; + +/// @brief A constant post operation chain handle. +typedef const struct dnnl_post_ops *const_dnnl_post_ops_t; + +/// @} dnnl_api_attributes + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// @struct dnnl_primitive +/// An opaque structure to describe a primitive. +struct dnnl_primitive; +/// A primitive handle. +typedef struct dnnl_primitive *dnnl_primitive_t; +/// A constant primitive handle. +typedef const struct dnnl_primitive *const_dnnl_primitive_t; + +/// Undefined argument. +#define DNNL_ARG_UNDEF 0 +/// Source argument #0. +#define DNNL_ARG_SRC_0 1 +/// A special mnemonic for source argument for primitives that have a +/// single source. An alias for #DNNL_ARG_SRC_0. +#define DNNL_ARG_SRC DNNL_ARG_SRC_0 +/// A special mnemonic for RNN input vector. An alias for +/// #DNNL_ARG_SRC_0. +#define DNNL_ARG_SRC_LAYER DNNL_ARG_SRC_0 +/// A special mnemonic for reorder source argument. An alias for +/// #DNNL_ARG_SRC_0. +#define DNNL_ARG_FROM DNNL_ARG_SRC_0 + +/// Source argument #1. +#define DNNL_ARG_SRC_1 2 +/// A special mnemonic for RNN input recurrent hidden state vector. An alias +/// for #DNNL_ARG_SRC_1. +#define DNNL_ARG_SRC_ITER DNNL_ARG_SRC_1 + +/// Source argument #2. +#define DNNL_ARG_SRC_2 3 +/// A special mnemonic for RNN input recurrent cell state vector. An alias for +/// #DNNL_ARG_SRC_2. +#define DNNL_ARG_SRC_ITER_C DNNL_ARG_SRC_2 + +/// Source argument #3. +#define DNNL_ARG_SRC_3 4 +/// A special mnemonic for RNN input recurrent cell attention vector. An alias for +/// #DNNL_ARG_SRC_3. +#define DNNL_ARG_AUGRU_ATTENTION DNNL_ARG_SRC_3 + +/// Destination argument #0. +#define DNNL_ARG_DST_0 17 +/// A special mnemonic for destination argument for primitives that have a +/// single destination. An alias for #DNNL_ARG_DST_0. +#define DNNL_ARG_DST DNNL_ARG_DST_0 +/// A special mnemonic for reorder destination argument. An alias for +/// #DNNL_ARG_DST_0. 
+#define DNNL_ARG_TO DNNL_ARG_DST_0 +/// A special mnemonic for RNN output vector. An alias for #DNNL_ARG_DST_0. +#define DNNL_ARG_DST_LAYER DNNL_ARG_DST_0 + +/// Destination argument #1. +#define DNNL_ARG_DST_1 18 +/// A special mnemonic for RNN output recurrent hidden state vector. An +/// alias for #DNNL_ARG_DST_1. +#define DNNL_ARG_DST_ITER DNNL_ARG_DST_1 + +/// Destination argument #2. +#define DNNL_ARG_DST_2 19 +/// A special mnemonic for LSTM output recurrent cell state vector. An +/// alias for #DNNL_ARG_DST_2. +#define DNNL_ARG_DST_ITER_C DNNL_ARG_DST_2 + +/// Weights argument #0. +#define DNNL_ARG_WEIGHTS_0 33 +/// A special mnemonic for primitives that have a single weights +/// argument. Alias for #DNNL_ARG_WEIGHTS_0. +#define DNNL_ARG_WEIGHTS DNNL_ARG_WEIGHTS_0 +/// A special mnemonic for RNN weights applied to the layer input. An +/// alias for #DNNL_ARG_WEIGHTS_0. +#define DNNL_ARG_WEIGHTS_LAYER DNNL_ARG_WEIGHTS_0 + +/// Weights argument #1. +#define DNNL_ARG_WEIGHTS_1 34 +/// A special mnemonic for RNN weights applied to the recurrent input. +/// An alias for #DNNL_ARG_WEIGHTS_1. +#define DNNL_ARG_WEIGHTS_ITER DNNL_ARG_WEIGHTS_1 + +/// Weights argument #2. +#define DNNL_ARG_WEIGHTS_2 35 +/// A special mnemonic for the RNN peephole weights. +/// An alias for #DNNL_ARG_WEIGHTS_2. +#define DNNL_ARG_WEIGHTS_PEEPHOLE DNNL_ARG_WEIGHTS_2 + +/// Weights argument #3. +#define DNNL_ARG_WEIGHTS_3 36 +/// A special mnemonic for the RNN projection weights. +/// An alias for #DNNL_ARG_WEIGHTS_3. +#define DNNL_ARG_WEIGHTS_PROJECTION DNNL_ARG_WEIGHTS_3 + +/// Bias tensor argument. +#define DNNL_ARG_BIAS 41 + +/// Mean values tensor argument. +#define DNNL_ARG_MEAN 49 +/// Variance values tensor argument. +#define DNNL_ARG_VARIANCE 50 + +/// A special mnemonic for scale argument of normalization primitives. +#define DNNL_ARG_SCALE 51 +/// A special mnemonic for shift argument of normalization primitives.
+#define DNNL_ARG_SHIFT 52 + +/// Workspace tensor argument. Workspace is used to pass information +/// from forward propagation to backward propagation computations. +#define DNNL_ARG_WORKSPACE 64 +/// Scratchpad (temporary storage) tensor argument. +#define DNNL_ARG_SCRATCHPAD 80 + +/// Gradient (diff) of the source argument #0. +#define DNNL_ARG_DIFF_SRC_0 129 +/// A special mnemonic for primitives that have a single diff source argument. +/// An alias for #DNNL_ARG_DIFF_SRC_0. +#define DNNL_ARG_DIFF_SRC DNNL_ARG_DIFF_SRC_0 +/// A special mnemonic for gradient (diff) of RNN input vector. An alias for +/// #DNNL_ARG_DIFF_SRC_0. +#define DNNL_ARG_DIFF_SRC_LAYER DNNL_ARG_DIFF_SRC_0 + +/// Gradient (diff) of the source argument #1. +#define DNNL_ARG_DIFF_SRC_1 130 +/// A special mnemonic for gradient (diff) of RNN input recurrent hidden state +/// vector. An alias for #DNNL_ARG_DIFF_SRC_1. +#define DNNL_ARG_DIFF_SRC_ITER DNNL_ARG_DIFF_SRC_1 + +/// Gradient (diff) of the source argument #2. +#define DNNL_ARG_DIFF_SRC_2 131 +/// A special mnemonic for gradient (diff) of RNN input recurrent cell state +/// vector. An alias for #DNNL_ARG_DIFF_SRC_2. +#define DNNL_ARG_DIFF_SRC_ITER_C DNNL_ARG_DIFF_SRC_2 + +/// Gradient (diff) of the source argument #3. +#define DNNL_ARG_DIFF_SRC_3 132 +/// A special mnemonic for gradient (diff) of RNN input recurrent cell attention +/// vector. An alias for #DNNL_ARG_DIFF_SRC_3. +#define DNNL_ARG_DIFF_AUGRU_ATTENTION DNNL_ARG_DIFF_SRC_3 + +/// Gradient (diff) of the destination argument #0. +#define DNNL_ARG_DIFF_DST_0 145 +/// A special mnemonic for primitives that have a single diff destination +/// argument. An alias for #DNNL_ARG_DIFF_DST_0. +#define DNNL_ARG_DIFF_DST DNNL_ARG_DIFF_DST_0 +/// A special mnemonic for gradient (diff) of RNN output vector. An alias for +/// #DNNL_ARG_DIFF_DST_0. +#define DNNL_ARG_DIFF_DST_LAYER DNNL_ARG_DIFF_DST_0 + +/// Gradient (diff) of the destination argument #1.
+#define DNNL_ARG_DIFF_DST_1 146 +/// A special mnemonic for gradient (diff) of RNN output recurrent hidden state +/// vector. An alias for #DNNL_ARG_DIFF_DST_1. +#define DNNL_ARG_DIFF_DST_ITER DNNL_ARG_DIFF_DST_1 + +/// Gradient (diff) of the destination argument #2. +#define DNNL_ARG_DIFF_DST_2 147 +/// A special mnemonic for gradient (diff) of RNN output recurrent cell state +/// vector. An alias for #DNNL_ARG_DIFF_DST_2. +#define DNNL_ARG_DIFF_DST_ITER_C DNNL_ARG_DIFF_DST_2 + +/// Gradient (diff) of the weights argument #0. +#define DNNL_ARG_DIFF_WEIGHTS_0 161 +/// A special mnemonic for primitives that have a single diff weights +/// argument. Alias for #DNNL_ARG_DIFF_WEIGHTS_0. +#define DNNL_ARG_DIFF_WEIGHTS DNNL_ARG_DIFF_WEIGHTS_0 +/// A special mnemonic for diff of RNN weights applied to the layer input. An +/// alias for #DNNL_ARG_DIFF_WEIGHTS_0. +#define DNNL_ARG_DIFF_WEIGHTS_LAYER DNNL_ARG_DIFF_WEIGHTS_0 + +/// Gradient (diff) of the weights argument #1. +#define DNNL_ARG_DIFF_WEIGHTS_1 162 +/// A special mnemonic for diff of RNN weights applied to the recurrent input. +/// An alias for #DNNL_ARG_DIFF_WEIGHTS_1. +#define DNNL_ARG_DIFF_WEIGHTS_ITER DNNL_ARG_DIFF_WEIGHTS_1 + +/// Gradient (diff) of the weights argument #2. +#define DNNL_ARG_DIFF_WEIGHTS_2 163 +/// A special mnemonic for diff of the RNN peephole weights. +/// An alias for #DNNL_ARG_DIFF_WEIGHTS_2. +#define DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE DNNL_ARG_DIFF_WEIGHTS_2 + +/// Gradient (diff) of the weights argument #3. +#define DNNL_ARG_DIFF_WEIGHTS_3 164 +/// A special mnemonic for diff of the RNN projection weights. An alias +/// for #DNNL_ARG_DIFF_WEIGHTS_3. +#define DNNL_ARG_DIFF_WEIGHTS_PROJECTION DNNL_ARG_DIFF_WEIGHTS_3 + +/// Gradient (diff) of the bias tensor argument. +#define DNNL_ARG_DIFF_BIAS 169 + +/// A special mnemonic for gradient (diff) of the scale argument of normalization primitives.
+#define DNNL_ARG_DIFF_SCALE 255 +/// A special mnemonic for gradient (diff) of the shift argument of normalization primitives. +#define DNNL_ARG_DIFF_SHIFT 256 + +/// Rounding mode seed for stochastic rounding +/// Single seed needed independently of how many arguments need stochastic rounding +#define DNNL_ARG_ATTR_ROUNDING_SEED 508 + +/// Dropout mask output buffer. +#define DNNL_ARG_ATTR_DROPOUT_MASK 509 + +/// Dropout probability value passed via a buffer. +#define DNNL_ARG_ATTR_DROPOUT_PROBABILITY 510 + +/// Dropout RNG seed value passed via a buffer. +#define DNNL_ARG_ATTR_DROPOUT_SEED 511 + +/// Output scaling factors provided at execution time. +#define DNNL_ARG_ATTR_OUTPUT_SCALES 513 + +/// Starting index for source arguments for primitives that take a variable +/// number of source arguments. +#define DNNL_ARG_MULTIPLE_SRC 1024 +/// Starting index for destination arguments for primitives that produce a +/// variable number of destination arguments. +#define DNNL_ARG_MULTIPLE_DST 2048 + +/// Scaling factors provided at execution time. +#define DNNL_ARG_ATTR_SCALES 4096 + +/// Zero points provided at execution time. +#define DNNL_ARG_ATTR_ZERO_POINTS 8192 + +/// Arguments for fused depthwise convolution. +/// See @ref dev_guide_attributes_post_ops_depthwise_fusion +#define DNNL_ARG_ATTR_POST_OP_DW 16384 + +/// Starting point for a binary post operation. +#define DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE 32768 + +/// Arguments for a binary post operation. Up to 32 arguments are supported. +/// See @ref dev_guide_attributes_post_ops_binary_fusion +#define DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) \ + (DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE * ((idx) + 1)) + +/// A structure that contains an index and a memory object, and is used to pass +/// arguments to dnnl_primitive_execute(). +typedef struct { + int arg; ///< An argument index, e.g.
DNNL_ARG_SRC + dnnl_memory_t memory; ///< Input/output memory +} dnnl_exec_arg_t; + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Primitive descriptor query specification +/// +/// For generic function dnnl_primitive_desc_query(), the type of result must +/// agree with the queried argument. The correspondence table: +/// +/// Query kind | Type of query result +/// --------------------------------|----------------------------- +/// dnnl_query_*_engine | #dnnl_engine_t * +/// #dnnl_query_primitive_kind | #dnnl_primitive_kind_t * +/// dnnl_query_*_s32 | int * +/// dnnl_query_*_s64 | #dnnl_dim_t * (same as int64_t *) +/// dnnl_query_*_f32 | float * +/// dnnl_query_*_f64 | double * +/// dnnl_query_*_str | const char ** +/// dnnl_query_*_md | #const_dnnl_memory_desc_t * +/// dnnl_query_*_pd | #const_dnnl_primitive_desc_t * +/// dnnl_query_cache_blob_id | const uint8_t ** +/// dnnl_query_strides | const #dnnl_dims_t ** +/// dnnl_query_dilations | const #dnnl_dims_t ** +/// dnnl_query_padding_l | const #dnnl_dims_t ** +/// dnnl_query_padding_r | const #dnnl_dims_t ** +/// dnnl_query_flags | unsigned * +/// dnnl_query_alg_kind | #dnnl_alg_kind_t * +/// dnnl_query_factors | const float ** +/// dnnl_query_cell_kind | #dnnl_alg_kind_t * +/// dnnl_query_direction | #dnnl_rnn_direction_t * +/// dnnl_query_activation_kind | #dnnl_alg_kind_t * +/// dnnl_query_kernel | const #dnnl_dims_t ** +/// dnnl_query_dims | const #dnnl_dims_t ** +/// dnnl_query_data_type | #dnnl_data_type_t * +/// dnnl_query_padded_dims | const #dnnl_dims_t ** +/// dnnl_query_padded_offsets | const #dnnl_dims_t ** +/// dnnl_query_format_kind | #dnnl_format_kind_t * +/// dnnl_query_inner_blks | const #dnnl_dims_t ** +/// dnnl_query_inner_idxs | const #dnnl_dims_t ** +/// dnnl_query_sparse_encoding | #dnnl_sparse_encoding_t * +/// +/// @note +/// Rule of thumb: all opaque types and structures are returned by +/// reference. All numbers are returned by value. 
+/// +/// @warning +/// All returned references point to constant objects and are valid only +/// during the lifetime of the queried primitive descriptor. Returned objects +/// must not be destroyed by the user. If you need to keep the object longer +/// than the lifetime of the queried primitive descriptor, use +/// dnnl_primitive_desc_clone() to make a copy. +typedef enum { + dnnl_query_undef = 0, ///< no query + + dnnl_query_engine, ///< execution engine + dnnl_query_primitive_kind, ///< primitive kind + + dnnl_query_num_of_inputs_s32, ///< number of inputs expected + dnnl_query_num_of_outputs_s32, ///< number of outputs expected + + dnnl_query_time_estimate_f64, ///< runtime estimation (seconds) + dnnl_query_memory_consumption_s64, ///< memory consumption -- extra + /// (scratch) memory, additional to + /// all inputs and outputs memory + /// (bytes) + + dnnl_query_scratchpad_engine, ///< scratchpad engine -- engine to be used + /// for creating scratchpad memory + + dnnl_query_impl_info_str, ///< implementation name + + dnnl_query_reorder_src_engine, ///< source engine + dnnl_query_reorder_dst_engine, ///< destination engine + + dnnl_query_prop_kind, ///< propagation kind + + dnnl_query_cache_blob_id_size_s64, ///< size of cache blob ID in bytes + dnnl_query_cache_blob_id, ///< cache blob ID (pointer to array) + + dnnl_query_strides, ///< strides + dnnl_query_dilations, ///< dilations + dnnl_query_padding_l, ///< left padding + dnnl_query_padding_r, ///< right padding + dnnl_query_epsilon_f32, ///< epsilon + dnnl_query_flags, ///< flags + dnnl_query_alg_kind, ///< algorithm kind + dnnl_query_alpha_f32, ///< alpha + dnnl_query_beta_f32, ///< beta + dnnl_query_axis_s32, ///< axis + dnnl_query_local_size_s64, ///< LRN parameter local size + dnnl_query_k_f32, ///< LRN parameter K + dnnl_query_p_f32, ///< Reduction parameter P + dnnl_query_factors, ///< Resampling parameter factors + dnnl_query_cell_kind, ///< RNN parameter cell kind + dnnl_query_direction, ///< 
RNN parameter direction + dnnl_query_activation_kind, ///< RNN parameter activation kind + dnnl_query_kernel, ///< Pooling parameter kernel + dnnl_query_group_size_s64, ///< Shuffle parameter group size + + // memory descriptor section + dnnl_query_some_md = 128, ///< stub + dnnl_query_src_md, ///< source memory desc + dnnl_query_diff_src_md, ///< source gradient memory desc + dnnl_query_weights_md, ///< weights memory descriptor desc + dnnl_query_diff_weights_md, ///< weights grad. memory desc + dnnl_query_dst_md, ///< destination memory desc + dnnl_query_diff_dst_md, ///< destination grad. memory desc + dnnl_query_workspace_md, ///< workspace memory desc + dnnl_query_scratchpad_md, ///< scratchpad memory desc + dnnl_query_exec_arg_md = 255, ///< memory desc of an execute argument + + dnnl_query_ndims_s32, ///< number of dimensions + dnnl_query_dims, ///< vector of dimensions + dnnl_query_data_type, ///< data type + dnnl_query_submemory_offset_s64, ///< submemory offset + dnnl_query_padded_dims, ///< vector of padded dimensions + dnnl_query_padded_offsets, ///< vector of padded offsets + dnnl_query_format_kind, ///< format kind + dnnl_query_inner_nblks_s32, ///< number of innermost blocks + dnnl_query_inner_blks, ///< vector of sizes of the innermost blocks + dnnl_query_inner_idxs, ///< vector of logical indices of the blocks +#ifdef DNNL_EXPERIMENTAL_SPARSE + dnnl_query_sparse_encoding, ///< Sparse encoding + dnnl_query_nnz_s64, ///< Number of non-zero entries + dnnl_query_num_handles_s32, ///< Number of buffers required for a memory +/// descriptor +#endif + // Max value to prevent UB for internal use only dnnl_query_t + dnnl_query_max = 0x7fff, +} dnnl_query_t; + +/// @} dnnl_api_primitives_common + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_service +/// @{ + +/// Disable profiling completely +#define DNNL_JIT_PROFILE_NONE 0u + +/// Enable VTune Profiler integration +#define DNNL_JIT_PROFILE_VTUNE 1u + +/// Enable Linux perf integration via perfmap 
files +#define DNNL_JIT_PROFILE_LINUX_PERFMAP 2u + +/// Enable Linux perf integration via jitdump files +#define DNNL_JIT_PROFILE_LINUX_JITDUMP 4u + +/// Instruct Linux perf integration via jitdump files to use TSC. @ref +/// DNNL_JIT_PROFILE_LINUX_JITDUMP must be set too for this to take effect. +#define DNNL_JIT_PROFILE_LINUX_JITDUMP_USE_TSC 8u + +/// Enable Linux perf integration (both jitdump and perfmap) +#define DNNL_JIT_PROFILE_LINUX_PERF \ + (DNNL_JIT_PROFILE_LINUX_JITDUMP | DNNL_JIT_PROFILE_LINUX_PERFMAP) + +/// CPU instruction set flags +typedef enum { + /// Library choice of ISA (excepting those listed as initial support) + dnnl_cpu_isa_default = 0x0, + + /// Intel Streaming SIMD Extensions 4.1 (Intel SSE4.1) + dnnl_cpu_isa_sse41 = 0x1, + + /// Intel Advanced Vector Extensions (Intel AVX) + dnnl_cpu_isa_avx = 0x3, + + /// Intel Advanced Vector Extensions 2 (Intel AVX2) + dnnl_cpu_isa_avx2 = 0x7, + + /// Intel AVX2 and Intel Deep Learning Boost (Intel DL Boost) support + dnnl_cpu_isa_avx2_vnni = 0xf, + + /// Intel AVX2 and Intel Deep Learning Boost (Intel DL Boost) + /// with 8-bit integer, float16 and bfloat16 support + dnnl_cpu_isa_avx2_vnni_2 = 0x1f, + + /// Intel AVX-512 subset for Intel Xeon Scalable processor family + /// and Intel Core processor family. + dnnl_cpu_isa_avx512_core = 0x27, + + /// Intel AVX-512 and Intel Deep Learning Boost (Intel DL Boost) support + /// for Intel Xeon Scalable processor family + /// and Intel Core processor family. + dnnl_cpu_isa_avx512_core_vnni = 0x67, + + /// Intel AVX-512, Intel DL Boost and bfloat16 support + /// for Intel Xeon Scalable processor family + /// and Intel Core processor family. + dnnl_cpu_isa_avx512_core_bf16 = 0xe7, + + /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support + /// for Intel Xeon Scalable processor family + /// and Intel Core processor family. + // TODO: Align avx10_1 values to internal representation. 
+ dnnl_cpu_isa_avx10_1_512 = 0x1ef, + /// @copydoc dnnl_cpu_isa_avx10_1_512 + dnnl_cpu_isa_avx512_core_fp16 = dnnl_cpu_isa_avx10_1_512, + + /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and + /// Intel AMX with 8-bit integer and bfloat16 support + // TODO: Align avx10_1 values to internal representation. + dnnl_cpu_isa_avx10_1_512_amx = 0xfef, + /// @copydoc dnnl_cpu_isa_avx10_1_512_amx + dnnl_cpu_isa_avx512_core_amx = dnnl_cpu_isa_avx10_1_512_amx, + + /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and + /// Intel AMX with 8-bit integer, bfloat16 and float16 support + // TODO: Align avx10_1 values to internal representation. + dnnl_cpu_isa_avx10_1_512_amx_fp16 = 0x1fef, + /// @copydoc dnnl_cpu_isa_avx10_1_512_amx_fp16 + dnnl_cpu_isa_avx512_core_amx_fp16 = dnnl_cpu_isa_avx10_1_512_amx_fp16, +} dnnl_cpu_isa_t; + +/// CPU ISA hints flags +typedef enum { + /// No hints (use default features) + dnnl_cpu_isa_no_hints = 0x0, + + /// Prefer to exclusively use Ymm registers for computations + dnnl_cpu_isa_prefer_ymm = 0x1, +} dnnl_cpu_isa_hints_t; + +/// @} dnnl_api_service + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_TYPES_H */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h new file mode 100644 index 0000000000000000000000000000000000000000..43b4713a603e8cba86ed803e90f1b6375482d3a0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h @@ -0,0 +1,342 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// ukernel C API + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_H +#define ONEAPI_DNNL_DNNL_UKERNEL_H + +#include "oneapi/dnnl/dnnl.h" +#include "oneapi/dnnl/dnnl_ukernel_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_ukernel +/// @{ + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// Creates a ukernel attributes memory storage. +/// +/// @param attr_params Output ukernel attributes memory storage. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_create( + dnnl_ukernel_attr_params_t *attr_params); + +/// Sets post-operations arguments to a storage. +/// +/// @param attr_params Memory pointers storage object. +/// @param post_ops_args A pointer to pointers of post_ops storages. Expected to +/// be packed together. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_post_ops_args( + dnnl_ukernel_attr_params_t attr_params, const void **post_ops_args); + +/// Sets tensor A scales argument to a storage. +/// +/// @param attr_params Memory pointers storage object. +/// @param a_scales Pointer to the scales storage. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_A_scales( + dnnl_ukernel_attr_params_t attr_params, const void *a_scales); + +/// Sets tensor B scales argument to a storage. +/// +/// If `dnnl_brgemm_set_B_scales` used mask of 2, then at least N values of +/// selected data type are expected. +/// +/// @param attr_params Memory pointers storage object. +/// @param b_scales Pointer to the scales storage. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_B_scales( + dnnl_ukernel_attr_params_t attr_params, const void *b_scales); + +/// Sets tensor D scales argument to a storage. +/// +/// @param attr_params Memory pointers storage object. +/// @param d_scales Pointer to the scales storage. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_D_scales( + dnnl_ukernel_attr_params_t attr_params, const void *d_scales); + +/// Destroys a ukernel attributes memory storage. +/// +/// @param attr_params Memory pointers storage object to destroy. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_destroy( + dnnl_ukernel_attr_params_t attr_params); + +/// @addtogroup dnnl_api_ukernel_brgemm +/// @{ + +/// Creates a BRGeMM ukernel object. Operates by the following formula: +/// `C = [A x B]`. +/// +/// @param brgemm Output BRGeMM ukernel object. +/// @param M Dimension M of tensor A. +/// @param N Dimension N of tensor B. +/// @param K Dimension K of tensors A and B. +/// @param batch_size Number of batches to process. +/// @param lda Leading dimension of tensor A. +/// @param ldb Leading dimension of tensor B. +/// @param ldc Leading dimension of tensor C. +/// @param a_dt Data type of tensor A. +/// @param b_dt Data type of tensor B. +/// @param c_dt Data type of tensor C. Must be dnnl_f32. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_create(dnnl_brgemm_t *brgemm, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t batch_size, dnnl_dim_t lda, + dnnl_dim_t ldb, dnnl_dim_t ldc, dnnl_data_type_t a_dt, + dnnl_data_type_t b_dt, dnnl_data_type_t c_dt); + +/// Sets adding an intermediate result to the output tensor C instead of +/// writing: `C += [A x B]`. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param add_C Value to indicate addition. Can be `0` to skip addition, and +/// `1` to apply addition. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_set_add_C(dnnl_brgemm_t brgemm, int add_C); + +/// Sets post-operations to a BRGeMM ukernel object: `D = post-operations(C)`. +/// +/// Post-operations applies if one of the following holds: +/// * Non-empty attributes are specified. +/// * Output data type `d_dt` is different from accumulation data type `c_dt`. 
+/// +/// If any of conditions happens, the final call of the accumulation chain +/// must be `dnnl_brgemm_execute_postops`, and `dnnl_brgemm_execute`, otherwise. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param ldd Leading dimension of tensor D. +/// @param d_dt Data type of tensor D. +/// @param post_ops Primitive post operations attribute to extend the kernel +/// operations. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_set_post_ops(dnnl_brgemm_t brgemm, + dnnl_dim_t ldd, dnnl_data_type_t d_dt, const_dnnl_post_ops_t post_ops); + +/// Sets tensor A scales mask to a BRGeMM ukernel object. +/// +/// For quantization flavor tensor A scales apply to accumulation buffer once C +/// is ready. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param a_scale_mask Tensor A scale mask. Can be `0` only. +dnnl_status_t DNNL_API dnnl_brgemm_set_A_scales( + dnnl_brgemm_t brgemm, int a_scale_mask); + +/// Sets tensor B scales mask to a BRGeMM ukernel object. +/// +/// For quantization flavor tensor B scales apply to accumulation buffer once C +/// is ready. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param b_scale_mask Tensor B scale mask. Can be `0` and `2` only. +dnnl_status_t DNNL_API dnnl_brgemm_set_B_scales( + dnnl_brgemm_t brgemm, int b_scale_mask); + +/// Sets tensor D scales mask to a BRGeMM ukernel object. +/// +/// For quantization flavor tensor D scales apply after all post-ops are +/// applied. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param d_scale_mask Tensor D scale mask. Can be `0` only. +dnnl_status_t DNNL_API dnnl_brgemm_set_D_scales( + dnnl_brgemm_t brgemm, int d_scale_mask); + +/// Finalizes initialization of a BRGeMM ukernel object. +/// +/// This step is mandatory to query information from the object. +/// +/// @param brgemm Output BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_brgemm_finalize(dnnl_brgemm_t brgemm); + +/// Returns the packing type expected by a tensor B of a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param pack_type Output packing type. Can be `dnnl_brgemm_no_pack` if +/// packing is not expected, and `dnnl_brgemm_pack_32`, otherwise. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_get_B_pack_type( + const_dnnl_brgemm_t brgemm, dnnl_pack_type_t *pack_type); + +/// Returns the size of a scratchpad memory needed for the BRGeMM ukernel +/// object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param size Output size of a buffer required for the BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_get_scratchpad_size( + const_dnnl_brgemm_t brgemm, size_t *size); + +/// Returns the flag indicating when the call to `dnnl_brgemm_execute_postops` +/// is valid. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param valid The flag indicating if `dnnl_brgemm_execute_postops` is valid +/// for a given ukernel object. `1` is for valid and `0`, otherwise. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_is_execute_postops_valid( + const_dnnl_brgemm_t brgemm, int *valid); + +/// Initializes the hardware-specific context. If no initialization required, +/// returns the success status. +/// +/// @param brgemm BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_set_hw_context(const_dnnl_brgemm_t brgemm); + +/// Releases the hardware-specific context. Must be used after all the execution +/// calls to BRGeMM ukernel objects. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_brgemm_release_hw_context(); + +/// Generates an executable part of BRGeMM ukernel object. +/// @param brgemm BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_generate(dnnl_brgemm_t brgemm); + +/// Executes a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param A_ptr Base pointer to a tensor A. +/// @param B_ptr Base pointer to a tensor B. +/// @param A_B_offsets Pointer to the set of tensor A and tensor B offsets for +/// each batch; the set must be contiguous in memory. Single batch should +/// supply offsets for both tensors A and B simultaneously. The number of +/// batches must coincide with the `batch_size` value passed at the creation +/// stage. +/// @param C_ptr Pointer to a tensor C (accumulation buffer). +/// @param scratchpad_ptr Pointer to a scratchpad buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_execute(const_dnnl_brgemm_t brgemm, + const void *A_ptr, const void *B_ptr, const dnnl_dim_t *A_B_offsets, + void *C_ptr, void *scratchpad_ptr); + +/// Executes a BRGeMM ukernel object with post operations. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param A Base pointer to a tensor A. +/// @param B Base pointer to a tensor B. +/// @param A_B_offsets Pointer to a set of tensor A and tensor B offsets for +/// each batch. A set must be contiguous in memory. A single batch should +/// supply offsets for both tensors A and B simultaneously. The number of +/// batches must coincide with the `batch_size` value passed at the creation +/// stage. +/// @param C_ptr Pointer to a tensor C (accumulation buffer). +/// @param D_ptr Pointer to a tensor D (output buffer). +/// @param scratchpad_ptr Pointer to a scratchpad buffer. +/// @param attr_params Ukernel attributes memory storage. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_execute_postops(const_dnnl_brgemm_t brgemm, + const void *A, const void *B, const dnnl_dim_t *A_B_offsets, + const void *C_ptr, void *D_ptr, void *scratchpad_ptr, + const_dnnl_ukernel_attr_params_t attr_params); + +/// Destroys a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_destroy(dnnl_brgemm_t brgemm); + +/// Creates a transform object. +/// +/// @param transform Output transform object. +/// @param K Dimension K. +/// @param N Dimension N. +/// @param in_pack_type Input packing type. Must be one of +/// `dnnl_pack_type_no_trans`, or `dnnl_pack_type_trans`. +/// @param in_ld Input leading dimension. +/// @param out_ld Output leading dimension. When packing data, it specifies a +/// block by N dimension. +/// @param in_dt Input data type. +/// @param out_dt Output data type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_transform_create(dnnl_transform_t *transform, + dnnl_dim_t K, dnnl_dim_t N, dnnl_pack_type_t in_pack_type, + dnnl_dim_t in_ld, dnnl_dim_t out_ld, dnnl_data_type_t in_dt, + dnnl_data_type_t out_dt); + +/// Generates an executable part of transform object. +/// @param transform Transform object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_transform_generate(dnnl_transform_t transform); + +/// Executes a transform object. +/// +/// @param transform Transform object. +/// @param in_ptr Pointer to an input buffer. +/// @param out_ptr Pointer to an output buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_transform_execute( + const_dnnl_transform_t transform, const void *in_ptr, void *out_ptr); + +/// Destroys a transform object. +/// +/// @param transform Transform object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_transform_destroy(dnnl_transform_t transform); + +/// @} dnnl_api_ukernel_brgemm + +#endif + +/// @} dnnl_api_ukernel + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_H */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5f2a8ccf57ccb3be88d7902a57908f17b304330b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp @@ -0,0 +1,470 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/// @file +/// ukernel C++ API + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_HPP +#define ONEAPI_DNNL_DNNL_UKERNEL_HPP + +#include "oneapi/dnnl/dnnl.hpp" +#include "oneapi/dnnl/dnnl_ukernel.h" + +/// @addtogroup dnnl_api oneDNN API +/// @{ + +/// oneDNN namespace +namespace dnnl { + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// @addtogroup dnnl_api_utils +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_brgemm_t p) { + return dnnl_brgemm_destroy(p); + } +}; + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_transform_t p) { + return dnnl_transform_destroy(p); + } +}; + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_ukernel_attr_params_t p) { + return dnnl_ukernel_attr_params_destroy(p); + } +}; + +/// @endcond + +/// @} dnnl_api_utils + +#endif + +/// @addtogroup dnnl_api_ukernel Ukernels +/// Collection of ukernels +/// @{ + +/// ukernel namespace +namespace ukernel { + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// @addtogroup dnnl_api_ukernel_utils ukernel utils +/// ukernel utility functions +/// @{ + +/// Packing specification +enum class pack_type { + /// Undefined pack type. A guard value. + undef = dnnl_pack_type_undef, + /// Plain, not transposed layout. Similar to format_tag::ab. + no_trans = dnnl_pack_type_no_trans, + /// Plain, transposed layout. Similar to format_tag::ba. + trans = dnnl_pack_type_trans, + /// Packed by 32 bits along K dimension layout. + pack32 = dnnl_pack_type_pack32, +}; + +/// Ukernel attributes memory storage +struct attr_params : public handle { + /// Constructs a ukernel attributes memory storage. 
+ attr_params() { + dnnl_ukernel_attr_params_t c_params = nullptr; + dnnl_status_t status = dnnl_ukernel_attr_params_create(&c_params); + error::wrap_c_api( + status, "could not create an attributes memory storage"); + reset(c_params); + } + + /// Sets post-operations arguments to a storage. + /// + /// @param post_ops_args Pointer to pointers of post_ops storages. + /// Expected to be packed together. + void set_post_ops_args(const void **post_ops_args) { + dnnl_status_t status = dnnl_ukernel_attr_params_set_post_ops_args( + get(), post_ops_args); + if (status != dnnl_success) + error::wrap_c_api( + status, "could not set post operations arguments"); + } + + /// Sets tensor A scales arguments to a storage. + /// + /// @param a_scales Pointer to scales storage. + void set_A_scales(const void *a_scales) { + dnnl_status_t status + = dnnl_ukernel_attr_params_set_A_scales(get(), a_scales); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set A scales argument"); + } + + /// Sets tensor B scales arguments to a storage. + /// + /// If @ref attr_params::set_B_scales used mask of 2, then at + /// least N values of selected data type are expected. + /// + /// @param b_scales Pointer to scales storage. + void set_B_scales(const void *b_scales) { + dnnl_status_t status + = dnnl_ukernel_attr_params_set_B_scales(get(), b_scales); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set B scales argument"); + } + + /// Sets tensor D scales arguments to a storage. + /// + /// @param d_scales Pointer to scales storage. 
+ void set_D_scales(const void *d_scales) { + dnnl_status_t status + = dnnl_ukernel_attr_params_set_D_scales(get(), d_scales); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set D scales argument"); + } +}; +/// @} dnnl_api_ukernel_utils + +/// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel +/// BRGeMM ukernel routines +/// @{ + +/// BRGeMM ukernel +struct brgemm : public handle { + /// Default constructor. Produces an empty object. + brgemm() = default; + + /// Constructs a BRGeMM ukernel object. Operates by the following formula: + /// `C = [A x B]`. + /// + /// @param M Dimension M of tensor A. + /// @param N Dimension N of tensor B. + /// @param K Dimension K of tensors A and B. + /// @param batch_size Number of batches to process. + /// @param lda Leading dimension of tensor A. + /// @param ldb Leading dimension of tensor B. + /// @param ldc Leading dimension of tensor C. + /// @param a_dt Data type of tensor A. + /// @param b_dt Data type of tensor B. + /// @param c_dt Data type of tensor C. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + brgemm(memory::dim M, memory::dim N, memory::dim K, memory::dim batch_size, + memory::dim lda, memory::dim ldb, memory::dim ldc, + memory::data_type a_dt, memory::data_type b_dt, + memory::data_type c_dt, bool allow_empty = false) { + + dnnl_brgemm_t brgemm = nullptr; + dnnl_status_t status = dnnl_brgemm_create(&brgemm, M, N, K, batch_size, + lda, ldb, ldc, memory::convert_to_c(a_dt), + memory::convert_to_c(b_dt), memory::convert_to_c(c_dt)); + + if (!allow_empty) + error::wrap_c_api( + status, "could not create a BRGeMM ukernel object"); + reset(brgemm); + } + + /// Sets adding an intermediate result to the output tensor C instead of + /// writing: `C += [A x B]`. + /// + /// @param add_C Value to indicate addition. 
`false` to skip addition, and + /// `true` to apply addition. + void set_add_C(bool add_C) { + dnnl_status_t status + = dnnl_brgemm_set_add_C(get(), static_cast(add_C)); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set add_C attribute"); + } + + /// Sets post-operations to a BRGeMM ukernel object: + /// `D = post-operations(C)`. + /// + /// Post-operations applies if one of the following holds: + /// * Non-empty post-operations are specified. + /// * Output data type `d_dt` is different from accumulation data type + /// `c_dt`. + /// + /// @param ldd Leading dimension of tensor D. + /// @param d_dt Data type of tensor D. + /// @param po Primitive post-operation attributes to extend the kernel + /// operations. + void set_post_ops(memory::dim ldd, memory::data_type d_dt, + const post_ops &po = default_post_ops()) { + dnnl_status_t status = dnnl_brgemm_set_post_ops( + get(), ldd, memory::convert_to_c(d_dt), po.get()); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set post operations"); + } + + /// Sets tensor A scales mask to a BRGeMM ukernel object. + /// + /// For quantization flavor tensor A scales apply to accumulation buffer + /// once C is ready. + /// + /// @param a_scale_mask Tensor A scale mask. Can be `0` only. + void set_A_scales(int a_scale_mask) { + dnnl_status_t status = dnnl_brgemm_set_A_scales(get(), a_scale_mask); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set A scales"); + } + + /// Sets tensor B scales mask to a BRGeMM ukernel object. + /// + /// For quantization flavor tensor B scales apply to accumulation buffer + /// once C is ready. + /// + /// @param b_scale_mask Tensor B scale mask. Can be `0` and `2` only. + void set_B_scales(int b_scale_mask) { + dnnl_status_t status = dnnl_brgemm_set_B_scales(get(), b_scale_mask); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set B scales"); + } + + /// Sets tensor D scales mask to a BRGeMM ukernel object. 
+ /// + /// For quantization flavor tensor D scales apply after all post-ops are + /// applied. + /// + /// @param d_scale_mask Tensor D scale mask. Can be `0` only. + void set_D_scales(int d_scale_mask) { + dnnl_status_t status = dnnl_brgemm_set_D_scales(get(), d_scale_mask); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set D scales"); + } + + /// Finalizes initialization of a BRGeMM ukernel object. + /// + /// This step must be performed prior to querying information from the + /// object. + void finalize() { + dnnl_status_t status = dnnl_brgemm_finalize(get()); + if (status != dnnl_success) + error::wrap_c_api(status, "could not finalize an object"); + } + + /// Returns the packing type expected by a tensor B of a BRGeMM ukernel + /// object. + pack_type get_B_pack_type() const { + dnnl_pack_type_t c_pack_type; + dnnl_status_t status = dnnl_brgemm_get_B_pack_type(get(), &c_pack_type); + if (status != dnnl_success) + error::wrap_c_api(status, "could not query B pack type"); + + return static_cast(c_pack_type); + } + + /// Returns the size of a scratchpad memory needed for the BRGeMM ukernel + /// object. + size_t get_scratchpad_size() const { + size_t size; + dnnl_status_t status = dnnl_brgemm_get_scratchpad_size(get(), &size); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not query a scratchpad size from a BRGeMM ukernel " + "object"); + return size; + } + + /// Returns the flag indicating when the call to execute with post + /// operations is valid. + /// + /// `True` is for a valid call, `false`, otherwise. + bool is_execute_postops_valid() const { + int valid; + dnnl_status_t status + = dnnl_brgemm_is_execute_postops_valid(get(), &valid); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not query a flag for execute postops from a BRGeMM " + "ukernel object"); + return static_cast(valid); + } + + /// Initializes the hardware-specific context. 
Affects the global state for + /// all BRGeMM ukernel objects. If no initialization required, returns. + void set_hw_context() const { + dnnl_status_t status = dnnl_brgemm_set_hw_context(get()); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set hardware context"); + } + + /// Releases the hardware-specific context. Affects the global state for + /// all BRGeMM ukernel objects. Must be used after all the execution calls + /// to BRGeMM ukernel objects. + static void release_hw_context() { + dnnl_status_t status = dnnl_brgemm_release_hw_context(); + if (status != dnnl_success) + error::wrap_c_api(status, "could not release hardware context"); + } + + /// Generates an executable part of BRGeMM ukernel object. + void generate() { + dnnl_status_t status = dnnl_brgemm_generate(get()); + if (status != dnnl_success) + error::wrap_c_api(status, "could not generate a kernel"); + } + + /// Executes a BRGeMM ukernel object. + /// + /// @param A Base pointer to a tensor A. + /// @param B Base pointer to a tensor B. + /// @param A_B_offsets Vector of pairs of tensors A and B offsets for + /// each batch. The number of batches must coincide with the + /// `batch_size` value passed at object construction stage. + /// @param C Pointer to a tensor C (accumulation buffer). + /// @param scratchpad Pointer to a scratchpad buffer. + void execute(const void *A, const void *B, + const std::vector> &A_B_offsets, + void *C, void *scratchpad) const { + // TODO: export batch_element to C API later for user to fill it and + // pass directly to the call. + dnnl_status_t status = dnnl_brgemm_execute(get(), A, B, + (const dnnl_dim_t *)A_B_offsets.data(), C, scratchpad); + if (status != dnnl_success) + error::wrap_c_api( + status, "could not execute a BRGeMM ukernel object"); + } + + /// Executes a BRGeMM ukernel object with post operations. + /// + /// @param A Base pointer to a tensor A. + /// @param B Base pointer to a tensor B. 
+ /// @param A_B_offsets Vector of pairs of tensors A and B offsets for + /// each batch. The number of batches must coincide with the + /// `batch_size` value passed at object construction stage. + /// @param C Pointer to a tensor C (accumulation buffer). + /// @param D Pointer to a tensor D (output buffer). + /// @param scratchpad Pointer to a scratchpad buffer. + /// @param params Post-op memory arguments. Must be passed If binary + /// post-op or scales were set. + void execute(const void *A, const void *B, + const std::vector> &A_B_offsets, + const void *C, void *D, void *scratchpad, + const attr_params ¶ms = default_attr_params()) const { + // TODO: export batch_element to C API later for user to fill it and + // pass directly to the call. + dnnl_status_t status = dnnl_brgemm_execute_postops(get(), A, B, + (const dnnl_dim_t *)A_B_offsets.data(), C, D, scratchpad, + params.get()); + if (status != dnnl_success) + error::wrap_c_api( + status, "could not execute a BRGeMM ukernel object"); + } + + /// Returns a constant reference to a static instance of default constructed + /// primitive post-operations attribute. + static const post_ops &default_post_ops() { + static const post_ops po; + return po; + } + + /// Returns a constant reference to a static instance of default constructed + /// ukernel attributes parameters. + static const attr_params &default_attr_params() { + static const attr_params ap; + return ap; + } +}; +/// @} dnnl_api_ukernel_brgemm + +/// @addtogroup dnnl_api_ukernel_transform Transform ukernel +/// Transform routines +/// @{ + +/// Transform ukernel +struct transform : public handle { + /// Default constructor. Produces an empty object. + transform() = default; + + /// Constructs a transform object. + /// + /// @param K Dimension K. + /// @param N Dimension N. + /// @param in_pack_type Input packing type. Must be one of + /// `pack_type::no_trans`, or `pack_type::trans`. + /// @param in_ld Input leading dimension. 
+ /// @param out_ld Output leading dimension. Specifies a block by N dimension + /// during data packing. + /// @param in_dt Input data type. + /// @param out_dt Output data type. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + transform(memory::dim K, memory::dim N, pack_type in_pack_type, + memory::dim in_ld, memory::dim out_ld, memory::data_type in_dt, + memory::data_type out_dt, bool allow_empty = false) { + + dnnl_transform_t transform = nullptr; + dnnl_status_t status = dnnl_transform_create(&transform, K, N, + static_cast(in_pack_type), in_ld, out_ld, + memory::convert_to_c(in_dt), memory::convert_to_c(out_dt)); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a BRGeMM ukernel packing B object"); + reset(transform); + } + + /// Generates an executable part of transform object. + void generate() { + dnnl_status_t status = dnnl_transform_generate(get()); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not generate a BRGeMM ukernel packing B object"); + } + + /// Executes a transform object. + /// + /// @param in Pointer to an input buffer. + /// @param out Pointer to an output buffer. + void execute(const void *in, void *out) const { + dnnl_status_t status = dnnl_transform_execute(get(), in, out); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not execute a BRGeMM ukernel packing B object"); + } +}; + +/// @} dnnl_api_ukernel_transform + +#endif + +} // namespace ukernel + +/// @} dnnl_api_ukernel + +} // namespace dnnl + +/// @} dnnl_api + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_HPP */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h new file mode 100644 index 0000000000000000000000000000000000000000..f1588d4dcf2347dd9752e9435b343f4a17c98923 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h @@ -0,0 +1,98 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// ukernel C API types definitions + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H +#define ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "oneapi/dnnl/dnnl_types.h" + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_ukernel +/// @{ + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// Packing specification +typedef enum { + /// Undefined pack type. A guard value. + dnnl_pack_type_undef = 0, + /// Plain, not transposed layout. Similar to format_tag::ab. + dnnl_pack_type_no_trans, + /// Plain, transposed layout. Similar to format_tag::ba. 
+ dnnl_pack_type_trans, + /// Packed by 32 bits along K dimension layout. + dnnl_pack_type_pack32, +} dnnl_pack_type_t; + +/// @struct dnnl_ukernel_attr_params +/// An opaque structure to describe ukernel attributes memory storage. +struct dnnl_ukernel_attr_params; + +/// A ukernel attributes memory storage handle. +typedef struct dnnl_ukernel_attr_params *dnnl_ukernel_attr_params_t; + +/// A constant ukernel attributes memory storage handle. +typedef const struct dnnl_ukernel_attr_params *const_dnnl_ukernel_attr_params_t; + +/// @addtogroup dnnl_api_ukernel_brgemm +/// @{ + +/// @struct dnnl_brgemm +/// An opaque structure to describe a brgemm ukernel. +struct dnnl_brgemm; + +/// A brgemm ukernel handle. +typedef struct dnnl_brgemm *dnnl_brgemm_t; + +/// A constant brgemm ukernel handle. +typedef const struct dnnl_brgemm *const_dnnl_brgemm_t; + +/// @struct dnnl_transform +/// An opaque structure to describe a transform routine. +struct dnnl_transform; + +/// A transform routine handle. +typedef struct dnnl_transform *dnnl_transform_t; + +/// A constant transform routine handle. +typedef const struct dnnl_transform *const_dnnl_transform_t; + +/// @} dnnl_api_ukernel_brgemm +#endif + +/// @} dnnl_api_ukernel + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H */ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..7b7c394258523af0fb6c11ae74deb36d3dfe138a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2019-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_VERSION_H +#define ONEAPI_DNNL_DNNL_VERSION_H + +// clang-format off + +/// Major version +#define DNNL_VERSION_MAJOR 3 + +/// Minor version +#define DNNL_VERSION_MINOR 7 + +/// Patch version +#define DNNL_VERSION_PATCH 1 + +// clang-format on + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h new file mode 100644 index 0000000000000000000000000000000000000000..5bfe42dbda1bf80004de23cc501e7874d87178ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_VERSION_HASH_H +#define ONEAPI_DNNL_DNNL_VERSION_HASH_H + +// clang-format off + +/// Note: this macro and header file were moved to a separate instance to avoid +/// incremental build issues as moving from commit to commit would trigger a +/// complete library rebuild. Including a generated header file in a single +/// translation unit makes this problem go away. 
+/// Git commit hash +#define DNNL_VERSION_HASH "8d263e693366ef8db40acc569cc7d8edf644556d" + +// clang-format on + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/cpp_conduit.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/cpp_conduit.h new file mode 100644 index 0000000000000000000000000000000000000000..78db14a6384f3d54e6c70420fda861b395bbe105 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/cpp_conduit.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) 2024 The pybind Community. + +#pragma once + +#include + +#include "common.h" +#include "internals.h" + +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +// Forward declaration needed here: Refactoring opportunity. 
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *); + +inline bool type_is_managed_by_our_internals(PyTypeObject *type_obj) { +#if defined(PYPY_VERSION) + auto &internals = get_internals(); + return bool(internals.registered_types_py.find(type_obj) + != internals.registered_types_py.end()); +#else + return bool(type_obj->tp_new == pybind11_object_new); +#endif +} + +inline bool is_instance_method_of_type(PyTypeObject *type_obj, PyObject *attr_name) { + PyObject *descr = _PyType_Lookup(type_obj, attr_name); + return bool((descr != nullptr) && PyInstanceMethod_Check(descr)); +} + +inline object try_get_cpp_conduit_method(PyObject *obj) { + if (PyType_Check(obj)) { + return object(); + } + PyTypeObject *type_obj = Py_TYPE(obj); + str attr_name("_pybind11_conduit_v1_"); + bool assumed_to_be_callable = false; + if (type_is_managed_by_our_internals(type_obj)) { + if (!is_instance_method_of_type(type_obj, attr_name.ptr())) { + return object(); + } + assumed_to_be_callable = true; + } + PyObject *method = PyObject_GetAttr(obj, attr_name.ptr()); + if (method == nullptr) { + PyErr_Clear(); + return object(); + } + if (!assumed_to_be_callable && PyCallable_Check(method) == 0) { + Py_DECREF(method); + return object(); + } + return reinterpret_steal(method); +} + +inline void *try_raw_pointer_ephemeral_from_cpp_conduit(handle src, + const std::type_info *cpp_type_info) { + object method = try_get_cpp_conduit_method(src.ptr()); + if (method) { + capsule cpp_type_info_capsule(const_cast(static_cast(cpp_type_info)), + typeid(std::type_info).name()); + object cpp_conduit = method(bytes(PYBIND11_PLATFORM_ABI_ID), + cpp_type_info_capsule, + bytes("raw_pointer_ephemeral")); + if (isinstance(cpp_conduit)) { + return reinterpret_borrow(cpp_conduit).get_pointer(); + } + } + return nullptr; +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) + +#else +#error "This file should not be included when either 
TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h new file mode 100644 index 0000000000000000000000000000000000000000..908aa703741a1f127efa8918c67a5d74064010e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h @@ -0,0 +1,44 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) 2021 The Pybind Development Team. +// All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +#pragma once + +#include "common.h" + +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +template +struct dynamic_raw_ptr_cast_is_possible : std::false_type {}; + +template +struct dynamic_raw_ptr_cast_is_possible< + To, + From, + detail::enable_if_t::value && std::is_polymorphic::value>> + : std::true_type {}; + +template ::value, int> = 0> +To *dynamic_raw_ptr_cast_if_possible(From * /*ptr*/) { + return nullptr; +} + +template ::value, int> = 0> +To *dynamic_raw_ptr_cast_if_possible(From *ptr) { + return dynamic_cast(ptr); +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/function_record_pyobject.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/function_record_pyobject.h new file mode 100644 index 0000000000000000000000000000000000000000..ee9149931607cf8d448ff75432b456090add8eb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/function_record_pyobject.h @@ -0,0 +1,196 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) 2024-2025 The Pybind Development Team. +// All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +// For background see the description of PR google/pybind11clif#30099. + +#pragma once + +#include +#include +#include + +#include "common.h" + +#include +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +struct function_record_PyObject { + PyObject_HEAD + function_record *cpp_func_rec; +}; + +PYBIND11_NAMESPACE_BEGIN(function_record_PyTypeObject_methods) + +PyObject *tp_new_impl(PyTypeObject *type, PyObject *args, PyObject *kwds); +PyObject *tp_alloc_impl(PyTypeObject *type, Py_ssize_t nitems); +int tp_init_impl(PyObject *self, PyObject *args, PyObject *kwds); +void tp_dealloc_impl(PyObject *self); +void tp_free_impl(void *self); + +static PyObject *reduce_ex_impl(PyObject *self, PyObject *, PyObject *); + +static PyMethodDef tp_methods_impl[] + = {{"__reduce_ex__", + // reduce_ex_impl is a PyCFunctionWithKeywords, but PyMethodDef + // requires a PyCFunction. The cast through void* is safe and + // idiomatic with METH_KEYWORDS, and it successfully sidesteps + // unhelpful compiler warnings. 
+ // NOLINTNEXTLINE(bugprone-casting-through-void) + reinterpret_cast(reinterpret_cast(reduce_ex_impl)), + METH_VARARGS | METH_KEYWORDS, + nullptr}, + {nullptr, nullptr, 0, nullptr}}; + +// Python 3.12+ emits a DeprecationWarning for heap types whose tp_name does +// not contain a dot ('.') and that lack a __module__ attribute. For pybind11's +// internal function_record type, we do not have an actual module object to +// attach, so we cannot use PyType_FromModuleAndSpec (introduced in Python 3.9) +// to set __module__ automatically. +// +// As a workaround, we define a "qualified" type name that includes a dummy +// module name (PYBIND11_DUMMY_MODULE_NAME). This is non‑idiomatic but avoids +// the deprecation warning, and results in reprs like +// +// +// +// even though no real pybind11_builtins module exists. If pybind11 gains an +// actual module object in the future, this code should switch to +// PyType_FromModuleAndSpec for Python 3.9+ and drop the dummy module +// workaround. +// +// Note that this name is versioned. +#define PYBIND11_DETAIL_FUNCTION_RECORD_TP_PLAINNAME \ + "pybind11_detail_function_record_" PYBIND11_DETAIL_FUNCTION_RECORD_ABI_ID \ + "_" PYBIND11_PLATFORM_ABI_ID +constexpr char tp_plainname_impl[] = PYBIND11_DETAIL_FUNCTION_RECORD_TP_PLAINNAME; +constexpr char tp_qualname_impl[] + = PYBIND11_DUMMY_MODULE_NAME "." 
PYBIND11_DETAIL_FUNCTION_RECORD_TP_PLAINNAME; + +PYBIND11_NAMESPACE_END(function_record_PyTypeObject_methods) + +static PyType_Slot function_record_PyType_Slots[] = { + {Py_tp_dealloc, + reinterpret_cast(function_record_PyTypeObject_methods::tp_dealloc_impl)}, + {Py_tp_methods, + reinterpret_cast(function_record_PyTypeObject_methods::tp_methods_impl)}, + {Py_tp_init, reinterpret_cast(function_record_PyTypeObject_methods::tp_init_impl)}, + {Py_tp_alloc, reinterpret_cast(function_record_PyTypeObject_methods::tp_alloc_impl)}, + {Py_tp_new, reinterpret_cast(function_record_PyTypeObject_methods::tp_new_impl)}, + {Py_tp_free, reinterpret_cast(function_record_PyTypeObject_methods::tp_free_impl)}, + {0, nullptr}}; + +static PyType_Spec function_record_PyType_Spec + = {function_record_PyTypeObject_methods::tp_qualname_impl, + sizeof(function_record_PyObject), + 0, + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE, + function_record_PyType_Slots}; + +inline PyTypeObject *get_function_record_PyTypeObject() { + PyTypeObject *&py_type_obj = detail::get_local_internals().function_record_py_type; + if (!py_type_obj) { + PyObject *py_obj = PyType_FromSpec(&function_record_PyType_Spec); + if (py_obj == nullptr) { + throw error_already_set(); + } + py_type_obj = reinterpret_cast(py_obj); + } + return py_type_obj; +} + +inline bool is_function_record_PyObject(PyObject *obj) { + if (PyType_Check(obj) != 0) { + return false; + } + PyTypeObject *obj_type = Py_TYPE(obj); + + PyTypeObject *frtype = get_function_record_PyTypeObject(); + + // Fast path (pointer comparison). + if (obj_type == frtype) { + return true; + } + // This works across extension modules. Note that tp_name is versioned. 
+ if (strcmp(obj_type->tp_name, function_record_PyTypeObject_methods::tp_qualname_impl) == 0 + || strcmp(obj_type->tp_name, function_record_PyTypeObject_methods::tp_plainname_impl) + == 0) { + return true; + } + return false; +} + +inline function_record *function_record_ptr_from_PyObject(PyObject *obj) { + if (is_function_record_PyObject(obj)) { + return ((detail::function_record_PyObject *) obj)->cpp_func_rec; + } + return nullptr; +} + +inline object function_record_PyObject_New() { + auto *py_func_rec = PyObject_New(function_record_PyObject, get_function_record_PyTypeObject()); + if (py_func_rec == nullptr) { + throw error_already_set(); + } + py_func_rec->cpp_func_rec = nullptr; // For clarity/purity. Redundant in practice. + return reinterpret_steal((PyObject *) py_func_rec); +} + +PYBIND11_NAMESPACE_BEGIN(function_record_PyTypeObject_methods) + +// Guard against accidents & oversights, in particular when porting to future Python versions. +inline PyObject *tp_new_impl(PyTypeObject *, PyObject *, PyObject *) { + pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_new_impl"); + // return nullptr; // Unreachable. +} + +inline PyObject *tp_alloc_impl(PyTypeObject *, Py_ssize_t) { + pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_alloc_impl"); + // return nullptr; // Unreachable. +} + +inline int tp_init_impl(PyObject *, PyObject *, PyObject *) { + pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_init_impl"); + // return -1; // Unreachable. +} + +inline void tp_free_impl(void *) { + pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_free_impl"); +} + +inline PyObject *reduce_ex_impl(PyObject *self, PyObject *, PyObject *) { + // Deliberately ignoring the arguments for simplicity (expected is `protocol: int`). 
+ const function_record *rec = function_record_ptr_from_PyObject(self); + if (rec == nullptr) { + pybind11_fail( + "FATAL: function_record_PyTypeObject reduce_ex_impl(): cannot obtain cpp_func_rec."); + } + if (rec->name != nullptr && rec->name[0] != '\0' && rec->scope + && PyModule_Check(rec->scope.ptr()) != 0) { + object scope_module = get_scope_module(rec->scope); + if (scope_module) { + auto builtins = reinterpret_borrow(PyEval_GetBuiltins()); + auto builtins_eval = builtins["eval"]; + auto reconstruct_args = make_tuple(str("__import__('importlib').import_module('") + + scope_module + str("')")); + return make_tuple(std::move(builtins_eval), std::move(reconstruct_args)) + .release() + .ptr(); + } + } + set_error(PyExc_RuntimeError, repr(self) + str(" is not pickleable.")); + return nullptr; +} + +PYBIND11_NAMESPACE_END(function_record_PyTypeObject_methods) + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/init.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/init.h new file mode 100644 index 0000000000000000000000000000000000000000..384902ccbc2c58505936d77ccbccb06da83e4d32 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/init.h @@ -0,0 +1,543 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + pybind11/detail/init.h: init factory function implementation and support code. + + Copyright (c) 2017 Jason Rhinelander + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "class.h" +#include "using_smart_holder.h" + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +PYBIND11_WARNING_DISABLE_MSVC(4127) + +PYBIND11_NAMESPACE_BEGIN(detail) + +template <> +class type_caster { +public: + bool load(handle h, bool) { + value = reinterpret_cast(h.ptr()); + return true; + } + + template + using cast_op_type = value_and_holder &; + explicit operator value_and_holder &() { return *value; } + static constexpr auto name = const_name(); + +private: + value_and_holder *value = nullptr; +}; + +PYBIND11_NAMESPACE_BEGIN(initimpl) + +inline void no_nullptr(const void *ptr) { + if (!ptr) { + throw type_error("pybind11::init(): factory function returned nullptr"); + } +} + +// Implementing functions for all forms of py::init<...> and py::init(...) +template +using Cpp = typename Class::type; +template +using Alias = typename Class::type_alias; +template +using Holder = typename Class::holder_type; + +template +using is_alias_constructible = std::is_constructible, Cpp &&>; + +// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance. +template = 0> +bool is_alias(Cpp *ptr) { + return dynamic_cast *>(ptr) != nullptr; +} +// Failing fallback version of the above for a no-alias class (always returns false) +template +constexpr bool is_alias(const void *) { + return false; +} + +// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall +// back to brace aggregate initialization so that for aggregate initialization can be used with +// py::init, e.g. `py::init` to initialize a `struct T { int a; int b; }`. For +// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually +// works, but will not do the expected thing when `T` has an `initializer_list` constructor). 
+template ::value, int> = 0> +inline Class *construct_or_initialize(Args &&...args) { + return new Class(std::forward(args)...); +} +template ::value, int> = 0> +inline Class *construct_or_initialize(Args &&...args) { + return new Class{std::forward(args)...}; +} + +// Attempts to constructs an alias using a `Alias(Cpp &&)` constructor. This allows types with +// an alias to provide only a single Cpp factory function as long as the Alias can be +// constructed from an rvalue reference of the base Cpp type. This means that Alias classes +// can, when appropriate, simply define a `Alias(Cpp &&)` constructor rather than needing to +// inherit all the base class constructors. +template +void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/, + value_and_holder &v_h, + Cpp &&base) { + v_h.value_ptr() = new Alias(std::move(base)); +} +template +[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/, + value_and_holder &, + Cpp &&) { + throw type_error("pybind11::init(): unable to convert returned instance to required " + "alias class: no `Alias(Class &&)` constructor available"); +} + +// Error-generating fallback for factories that don't match one of the below construction +// mechanisms. +template +void construct(...) { + static_assert(!std::is_same::value /* always false */, + "pybind11::init(): init function must return a compatible pointer, " + "holder, or value"); +} + +// Pointer return v1: the factory function returns a class pointer for a registered class. +// If we don't need an alias (because this class doesn't have one, or because the final type is +// inherited on the Python side) we can simply take over ownership. Otherwise we need to try to +// construct an Alias from the returned base instance. 
+template +void construct(value_and_holder &v_h, Cpp *ptr, bool need_alias) { + PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias); + no_nullptr(ptr); + if (Class::has_alias && need_alias && !is_alias(ptr)) { + // We're going to try to construct an alias by moving the cpp type. Whether or not + // that succeeds, we still need to destroy the original cpp pointer (either the + // moved away leftover, if the alias construction works, or the value itself if we + // throw an error), but we can't just call `delete ptr`: it might have a special + // deleter, or might be shared_from_this. So we construct a holder around it as if + // it was a normal instance, then steal the holder away into a local variable; thus + // the holder and destruction happens when we leave the C++ scope, and the holder + // class gets to handle the destruction however it likes. + v_h.value_ptr() = ptr; + v_h.set_instance_registered(true); // Trick to prevent init_instance from registering it + // DANGER ZONE BEGIN: exceptions will leave v_h in an invalid state. + v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder + Holder temp_holder(std::move(v_h.holder>())); // Steal the holder + v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null + v_h.set_instance_registered(false); + // DANGER ZONE END. + + construct_alias_from_cpp(is_alias_constructible{}, v_h, std::move(*ptr)); + } else { + // Otherwise the type isn't inherited, so we don't need an Alias + v_h.value_ptr() = ptr; + } +} + +// Pointer return v2: a factory that always returns an alias instance ptr. We simply take over +// ownership of the pointer. +template = 0> +void construct(value_and_holder &v_h, Alias *alias_ptr, bool) { + no_nullptr(alias_ptr); + v_h.value_ptr() = static_cast *>(alias_ptr); +} + +// Holder return: copy its pointer, and move or copy the returned holder into the new instance's +// holder. 
This also handles types like std::shared_ptr and std::unique_ptr where T is a +// derived type (through those holder's implicit conversion from derived class holder +// constructors). +template >::value, int> = 0> +void construct(value_and_holder &v_h, Holder holder, bool need_alias) { + PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias); + auto *ptr = holder_helper>::get(holder); + no_nullptr(ptr); + // If we need an alias, check that the held pointer is actually an alias instance + if (Class::has_alias && need_alias && !is_alias(ptr)) { + throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance " + "is not an alias instance"); + } + + // Cast away constness to store in void* storage. + // The value_and_holder storage is fundamentally untyped (void**), so we lose + // const-correctness here by design. The const qualifier will be restored + // when the pointer is later retrieved and cast back to the original type. + // This explicit const_cast makes the const-removal clearly visible. + v_h.value_ptr() = const_cast(static_cast(ptr)); + v_h.type->init_instance(v_h.inst, &holder); +} + +// return-by-value version 1: returning a cpp class by value. If the class has an alias and an +// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct +// the alias from the base when needed (i.e. because of Python-side inheritance). When we don't +// need it, we simply move-construct the cpp value into a new instance. 
+template +void construct(value_and_holder &v_h, Cpp &&result, bool need_alias) { + PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias); + static_assert(is_move_constructible>::value, + "pybind11::init() return-by-value factory function requires a movable class"); + if (Class::has_alias && need_alias) { + construct_alias_from_cpp(is_alias_constructible{}, v_h, std::move(result)); + } else { + v_h.value_ptr() = new Cpp(std::move(result)); + } +} + +// return-by-value version 2: returning a value of the alias type itself. We move-construct an +// Alias instance (even if no the python-side inheritance is involved). The is intended for +// cases where Alias initialization is always desired. +template +void construct(value_and_holder &v_h, Alias &&result, bool) { + static_assert( + is_move_constructible>::value, + "pybind11::init() return-by-alias-value factory function requires a movable alias class"); + v_h.value_ptr() = new Alias(std::move(result)); +} + +template +smart_holder init_smart_holder_from_unique_ptr(std::unique_ptr &&unq_ptr, + bool void_cast_raw_ptr) { + void *void_ptr = void_cast_raw_ptr ? static_cast(unq_ptr.get()) : nullptr; + return smart_holder::from_unique_ptr(std::move(unq_ptr), void_ptr); +} + +template >, + detail::enable_if_t>::value, int> = 0> +void construct(value_and_holder &v_h, std::unique_ptr, D> &&unq_ptr, bool need_alias) { + PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias); + auto *ptr = unq_ptr.get(); + no_nullptr(ptr); + if (Class::has_alias && need_alias && !is_alias(ptr)) { + throw type_error("pybind11::init(): construction failed: returned std::unique_ptr pointee " + "is not an alias instance"); + } + // Here and below: if the new object is a trampoline, the shared_from_this mechanism needs + // to be prevented from accessing the smart_holder vptr, because it does not keep the + // trampoline Python object alive. 
For types that don't inherit from enable_shared_from_this + // it does not matter if void_cast_raw_ptr is true or false, therefore it's not necessary + // to also inspect the type. + auto smhldr = init_smart_holder_from_unique_ptr( + std::move(unq_ptr), /*void_cast_raw_ptr*/ Class::has_alias && is_alias(ptr)); + v_h.value_ptr() = ptr; + v_h.type->init_instance(v_h.inst, &smhldr); +} + +template >, + detail::enable_if_t>::value, int> = 0> +void construct(value_and_holder &v_h, + std::unique_ptr, D> &&unq_ptr, + bool /*need_alias*/) { + auto *ptr = unq_ptr.get(); + no_nullptr(ptr); + auto smhldr + = init_smart_holder_from_unique_ptr(std::move(unq_ptr), /*void_cast_raw_ptr*/ true); + v_h.value_ptr() = ptr; + v_h.type->init_instance(v_h.inst, &smhldr); +} + +template +void construct_from_shared_ptr(value_and_holder &v_h, + std::shared_ptr &&shd_ptr, + bool need_alias) { + static_assert(std::is_same>::value + || std::is_same>::value, + "Expected (const) Cpp as shared_ptr pointee"); + auto *ptr = shd_ptr.get(); + no_nullptr(ptr); + if (Class::has_alias && need_alias && !is_alias(ptr)) { + throw type_error("pybind11::init(): construction failed: returned std::shared_ptr pointee " + "is not an alias instance"); + } + // Cast to non-const if needed, consistent with internal design + auto smhldr + = smart_holder::from_shared_ptr(std::const_pointer_cast>(std::move(shd_ptr))); + v_h.value_ptr() = const_cast *>(ptr); + v_h.type->init_instance(v_h.inst, &smhldr); +} + +template >::value, int> = 0> +void construct(value_and_holder &v_h, std::shared_ptr> &&shd_ptr, bool need_alias) { + construct_from_shared_ptr, Class>(v_h, std::move(shd_ptr), need_alias); +} + +template >::value, int> = 0> +void construct(value_and_holder &v_h, + std::shared_ptr> &&shd_ptr, + bool need_alias) { + construct_from_shared_ptr, Class>(v_h, std::move(shd_ptr), need_alias); +} + +template >::value, int> = 0> +void construct(value_and_holder &v_h, + std::shared_ptr> &&shd_ptr, + bool /*need_alias*/) { + 
auto *ptr = shd_ptr.get(); + no_nullptr(ptr); + auto smhldr = smart_holder::from_shared_ptr(shd_ptr); + v_h.value_ptr() = ptr; + v_h.type->init_instance(v_h.inst, &smhldr); +} + +// Implementing class for py::init<...>() +template +struct constructor { + template = 0> + static void execute(Class &cl, const Extra &...extra) { + cl.def( + "__init__", + [](value_and_holder &v_h, + Args... args) { // NOLINT(performance-unnecessary-value-param) + v_h.value_ptr() = construct_or_initialize>(std::forward(args)...); + }, + is_new_style_constructor(), + extra...); + } + + template < + typename Class, + typename... Extra, + enable_if_t, Args...>::value, int> + = 0> + static void execute(Class &cl, const Extra &...extra) { + cl.def( + "__init__", + [](value_and_holder &v_h, Args... args) { + if (Py_TYPE(v_h.inst) == v_h.type->type) { + v_h.value_ptr() + = construct_or_initialize>(std::forward(args)...); + } else { + v_h.value_ptr() + = construct_or_initialize>(std::forward(args)...); + } + }, + is_new_style_constructor(), + extra...); + } + + template < + typename Class, + typename... Extra, + enable_if_t, Args...>::value, int> + = 0> + static void execute(Class &cl, const Extra &...extra) { + cl.def( + "__init__", + [](value_and_holder &v_h, Args... args) { + v_h.value_ptr() + = construct_or_initialize>(std::forward(args)...); + }, + is_new_style_constructor(), + extra...); + } +}; + +// Implementing class for py::init_alias<...>() +template +struct alias_constructor { + template < + typename Class, + typename... Extra, + enable_if_t, Args...>::value, int> + = 0> + static void execute(Class &cl, const Extra &...extra) { + cl.def( + "__init__", + [](value_and_holder &v_h, Args... 
args) { + v_h.value_ptr() + = construct_or_initialize>(std::forward(args)...); + }, + is_new_style_constructor(), + extra...); + } +}; + +// Implementation class for py::init(Func) and py::init(Func, AliasFunc) +template , + typename = function_signature_t> +struct factory; + +// Specialization for py::init(Func) +template +struct factory { + remove_reference_t class_factory; + + // NOLINTNEXTLINE(google-explicit-constructor) + factory(Func &&f) : class_factory(std::forward(f)) {} + + // The given class either has no alias or has no separate alias factory; + // this always constructs the class itself. If the class is registered with an alias + // type and an alias instance is needed (i.e. because the final type is a Python class + // inheriting from the C++ type) the returned value needs to either already be an alias + // instance, or the alias needs to be constructible from a `Class &&` argument. + template + void execute(Class &cl, const Extra &...extra) && { +#if defined(PYBIND11_CPP14) + cl.def( + "__init__", + [func = std::move(class_factory)] +#else + auto &func = class_factory; + cl.def( + "__init__", + [func] +#endif + (value_and_holder &v_h, Args... 
args) { + construct( + v_h, func(std::forward(args)...), Py_TYPE(v_h.inst) != v_h.type->type); + }, + is_new_style_constructor(), + extra...); + } +}; + +// Specialization for py::init(Func, AliasFunc) +template +struct factory { + static_assert(sizeof...(CArgs) == sizeof...(AArgs), + "pybind11::init(class_factory, alias_factory): class and alias factories " + "must have identical argument signatures"); + static_assert(all_of...>::value, + "pybind11::init(class_factory, alias_factory): class and alias factories " + "must have identical argument signatures"); + + remove_reference_t class_factory; + remove_reference_t alias_factory; + + factory(CFunc &&c, AFunc &&a) + : class_factory(std::forward(c)), alias_factory(std::forward(a)) {} + + // The class factory is called when the `self` type passed to `__init__` is the direct + // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype. + template + void execute(Class &cl, const Extra &...extra) && { + static_assert(Class::has_alias, + "The two-argument version of `py::init()` can " + "only be used if the class has an alias"); +#if defined(PYBIND11_CPP14) + cl.def( + "__init__", + [class_func = std::move(class_factory), alias_func = std::move(alias_factory)] +#else + auto &class_func = class_factory; + auto &alias_func = alias_factory; + cl.def( + "__init__", + [class_func, alias_func] +#endif + (value_and_holder &v_h, CArgs... args) { + if (Py_TYPE(v_h.inst) == v_h.type->type) { + // If the instance type equals the registered type we don't have inheritance, + // so don't need the alias and can construct using the class function: + construct(v_h, class_func(std::forward(args)...), false); + } else { + construct(v_h, alias_func(std::forward(args)...), true); + } + }, + is_new_style_constructor(), + extra...); + } +}; + +/// Set just the C++ state. Same as `__init__`. 
+template +void setstate(value_and_holder &v_h, T &&result, bool need_alias) { + construct(v_h, std::forward(result), need_alias); +} + +/// Set both the C++ and Python states +template ::value, int> = 0> +void setstate(value_and_holder &v_h, std::pair &&result, bool need_alias) { + construct(v_h, std::move(result.first), need_alias); + auto d = handle(result.second); + if (PyDict_Check(d.ptr()) && PyDict_Size(d.ptr()) == 0) { + // Skipping setattr below, to not force use of py::dynamic_attr() for Class unnecessarily. + // See PR #2972 for details. + return; + } + // Our tests never run into an unset dict, but being careful here for now (see #5658) + auto dict = getattr((PyObject *) v_h.inst, "__dict__", none()); + if (dict.is_none()) { + setattr((PyObject *) v_h.inst, "__dict__", d); + } else { + // Keep the original object dict and just update it + if (PyDict_Update(dict.ptr(), d.ptr()) < 0) { + throw error_already_set(); + } + } +} + +/// Implementation for py::pickle(GetState, SetState) +template , + typename = function_signature_t> +struct pickle_factory; + +template +struct pickle_factory { + static_assert(std::is_same, intrinsic_t>::value, + "The type returned by `__getstate__` must be the same " + "as the argument accepted by `__setstate__`"); + + remove_reference_t get; + remove_reference_t set; + + pickle_factory(Get get, Set set) : get(std::forward(get)), set(std::forward(set)) {} + + template + void execute(Class &cl, const Extra &...extra) && { + cl.def("__getstate__", std::move(get), pos_only()); + +#if defined(PYBIND11_CPP14) + cl.def( + "__setstate__", + [func = std::move(set)] +#else + auto &func = set; + cl.def( + "__setstate__", + [func] +#endif + (value_and_holder &v_h, ArgState state) { + setstate( + v_h, func(std::forward(state)), Py_TYPE(v_h.inst) != v_h.type->type); + }, + is_new_style_constructor(), + extra...); + } +}; + +PYBIND11_NAMESPACE_END(initimpl) +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) + +#else 
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/native_enum_data.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/native_enum_data.h new file mode 100644 index 0000000000000000000000000000000000000000..f076e79e73ebc7be7bf009ac3c711765a934d9fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/native_enum_data.h @@ -0,0 +1,214 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) 2022-2025 The pybind Community. +// All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +#pragma once + +#include "../pytypes.h" +#include "common.h" +#include "internals.h" + +#include +#include +#include +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +// This is a separate function only to enable easy unit testing. 
+inline std::string +native_enum_missing_finalize_error_message(const std::string &enum_name_encoded) { + return "pybind11::native_enum<...>(\"" + enum_name_encoded + "\", ...): MISSING .finalize()"; +} + +class native_enum_data { +public: + native_enum_data(const object &parent_scope, + const char *enum_name, + const char *native_type_name, + const char *class_doc, + const std::type_index &enum_type_index) + : enum_name_encoded{enum_name}, native_type_name_encoded{native_type_name}, + enum_type_index{enum_type_index}, parent_scope(parent_scope), enum_name{enum_name}, + native_type_name{native_type_name}, class_doc(class_doc), export_values_flag{false}, + finalize_needed{false} {} + + void finalize(); + + native_enum_data(const native_enum_data &) = delete; + native_enum_data &operator=(const native_enum_data &) = delete; + +#if !defined(NDEBUG) + // This dtor cannot easily be unit tested because it terminates the process. + ~native_enum_data() { + if (finalize_needed) { + pybind11_fail(native_enum_missing_finalize_error_message(enum_name_encoded)); + } + } +#endif + +protected: + void disarm_finalize_check(const char *error_context) { + if (!finalize_needed) { + pybind11_fail("pybind11::native_enum<...>(\"" + enum_name_encoded + + "\"): " + error_context); + } + finalize_needed = false; + } + + void arm_finalize_check() { + assert(!finalize_needed); // Catch redundant calls. + finalize_needed = true; + } + + std::string enum_name_encoded; + std::string native_type_name_encoded; + std::type_index enum_type_index; + +private: + object parent_scope; + str enum_name; + str native_type_name; + std::string class_doc; + +protected: + list members; + list member_docs; + bool export_values_flag : 1; // Attention: It is best to keep the bools together. 
+ +private: + bool finalize_needed : 1; +}; + +inline void global_internals_native_enum_type_map_set_item(const std::type_index &enum_type_index, + PyObject *py_enum) { + with_internals( + [&](internals &internals) { internals.native_enum_type_map[enum_type_index] = py_enum; }); +} + +inline handle +global_internals_native_enum_type_map_get_item(const std::type_index &enum_type_index) { + return with_internals([&](internals &internals) { + auto found = internals.native_enum_type_map.find(enum_type_index); + if (found != internals.native_enum_type_map.end()) { + return handle(found->second); + } + return handle(); + }); +} + +inline bool +global_internals_native_enum_type_map_contains(const std::type_index &enum_type_index) { + return with_internals([&](internals &internals) { + return internals.native_enum_type_map.count(enum_type_index) != 0; + }); +} + +inline object import_or_getattr(const std::string &fully_qualified_name, + const std::string &append_to_exception_message) { + std::istringstream stream(fully_qualified_name); + std::string part; + + if (!std::getline(stream, part, '.') || part.empty()) { + std::string msg = "Invalid fully-qualified name `"; + msg += fully_qualified_name; + msg += "`"; + msg += append_to_exception_message; + throw value_error(msg); + } + + auto curr_scope = reinterpret_steal(PyImport_ImportModule(part.c_str())); + if (!curr_scope) { + std::string msg = "Failed to import top-level module `"; + msg += part; + msg += "`"; + msg += append_to_exception_message; + raise_from(PyExc_ImportError, msg.c_str()); + throw error_already_set(); + } + + // Now recursively getattr or import remaining parts + std::string curr_path = part; + while (std::getline(stream, part, '.')) { + if (part.empty()) { + std::string msg = "Invalid fully-qualified name `"; + msg += fully_qualified_name; + msg += "`"; + msg += append_to_exception_message; + throw value_error(msg); + } + std::string next_path = curr_path; + next_path += "."; + next_path += part; + 
auto next_scope + = reinterpret_steal(PyObject_GetAttrString(curr_scope.ptr(), part.c_str())); + if (!next_scope) { + error_fetch_and_normalize stored_getattr_error("getattr"); + // Try importing the next level + next_scope = reinterpret_steal(PyImport_ImportModule(next_path.c_str())); + if (!next_scope) { + error_fetch_and_normalize stored_import_error("import"); + std::string msg = "Failed to import or getattr `"; + msg += part; + msg += "` from `"; + msg += curr_path; + msg += "`"; + msg += append_to_exception_message; + msg += "\n-------- getattr exception --------\n"; + msg += stored_getattr_error.error_string(); + msg += "\n-------- import exception --------\n"; + msg += stored_import_error.error_string(); + throw import_error(msg.c_str()); + } + } + curr_scope = next_scope; + curr_path = next_path; + } + return curr_scope; +} + +inline void native_enum_data::finalize() { + disarm_finalize_check("DOUBLE finalize"); + if (hasattr(parent_scope, enum_name)) { + pybind11_fail("pybind11::native_enum<...>(\"" + enum_name_encoded + + "\"): an object with that name is already defined"); + } + auto py_enum_type = import_or_getattr(native_type_name, " (native_type_name)"); + auto py_enum = py_enum_type(enum_name, members); + object module_name = get_module_name_if_available(parent_scope); + if (module_name) { + py_enum.attr("__module__") = module_name; + } + if (hasattr(parent_scope, "__qualname__")) { + const auto parent_qualname = parent_scope.attr("__qualname__").cast(); + py_enum.attr("__qualname__") = str(parent_qualname + "." 
+ enum_name.cast()); + } + parent_scope.attr(enum_name) = py_enum; + if (export_values_flag) { + for (auto member : members) { + auto member_name = member[int_(0)]; + if (hasattr(parent_scope, member_name)) { + pybind11_fail("pybind11::native_enum<...>(\"" + enum_name_encoded + "\").value(\"" + + member_name.cast() + + "\"): an object with that name is already defined"); + } + parent_scope.attr(member_name) = py_enum[member_name]; + } + } + if (!class_doc.empty()) { + py_enum.attr("__doc__") = class_doc.c_str(); + } + for (auto doc : member_docs) { + py_enum[doc[int_(0)]].attr("__doc__") = doc[int_(1)]; + } + global_internals_native_enum_type_map_set_item(enum_type_index, py_enum.release().ptr()); +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/pybind11_namespace_macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/pybind11_namespace_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..1272e1fb2275abd469797e8bf81ee49650a31cca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/pybind11_namespace_macros.h @@ -0,0 +1,87 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) 2016-2025 The Pybind Development Team. +// All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +#pragma once + +// PLEASE DO NOT ADD ANY INCLUDES HERE + +// Define some generic pybind11 helper macros for warning management. 
+// +// Note that compiler-specific push/pop pairs are baked into the +// PYBIND11_NAMESPACE_BEGIN/PYBIND11_NAMESPACE_END pair of macros. Therefore manual +// PYBIND11_WARNING_PUSH/PYBIND11_WARNING_POP are usually only needed in `#include` sections. +// +// If you find you need to suppress a warning, please try to make the suppression as local as +// possible using these macros. Please also be sure to push/pop with the pybind11 macros. Please +// only use compiler specifics if you need to check specific versions, e.g. Apple Clang vs. vanilla +// Clang. +#if defined(_MSC_VER) +# define PYBIND11_COMPILER_MSVC +# define PYBIND11_PRAGMA(...) __pragma(__VA_ARGS__) +# define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(warning(push)) +# define PYBIND11_WARNING_POP PYBIND11_PRAGMA(warning(pop)) +#elif defined(__INTEL_COMPILER) +# define PYBIND11_COMPILER_INTEL +# define PYBIND11_PRAGMA(...) _Pragma(#__VA_ARGS__) +# define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(warning push) +# define PYBIND11_WARNING_POP PYBIND11_PRAGMA(warning pop) +#elif defined(__clang__) +# define PYBIND11_COMPILER_CLANG +# define PYBIND11_PRAGMA(...) _Pragma(#__VA_ARGS__) +# define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(clang diagnostic push) +# define PYBIND11_WARNING_POP PYBIND11_PRAGMA(clang diagnostic pop) +#elif defined(__GNUC__) +# define PYBIND11_COMPILER_GCC +# define PYBIND11_PRAGMA(...) 
_Pragma(#__VA_ARGS__) +# define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(GCC diagnostic push) +# define PYBIND11_WARNING_POP PYBIND11_PRAGMA(GCC diagnostic pop) +#endif + +#ifdef PYBIND11_COMPILER_MSVC +# define PYBIND11_WARNING_DISABLE_MSVC(name) PYBIND11_PRAGMA(warning(disable : name)) +#else +# define PYBIND11_WARNING_DISABLE_MSVC(name) +#endif + +#ifdef PYBIND11_COMPILER_CLANG +# define PYBIND11_WARNING_DISABLE_CLANG(name) PYBIND11_PRAGMA(clang diagnostic ignored name) +#else +# define PYBIND11_WARNING_DISABLE_CLANG(name) +#endif + +#ifdef PYBIND11_COMPILER_GCC +# define PYBIND11_WARNING_DISABLE_GCC(name) PYBIND11_PRAGMA(GCC diagnostic ignored name) +#else +# define PYBIND11_WARNING_DISABLE_GCC(name) +#endif + +#ifdef PYBIND11_COMPILER_INTEL +# define PYBIND11_WARNING_DISABLE_INTEL(name) PYBIND11_PRAGMA(warning disable name) +#else +# define PYBIND11_WARNING_DISABLE_INTEL(name) +#endif + +#define PYBIND11_NAMESPACE_BEGIN(name) \ + namespace name { \ + PYBIND11_WARNING_PUSH + +#define PYBIND11_NAMESPACE_END(name) \ + PYBIND11_WARNING_POP \ + } + +// Robust support for some features and loading modules compiled against different pybind versions +// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute +// on the main `pybind11` namespace. +#if !defined(PYBIND11_NAMESPACE) +# if defined(__GNUG__) && !defined(_WIN32) +# define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden"))) +# else +# define PYBIND11_NAMESPACE pybind11 +# endif +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/value_and_holder.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/value_and_holder.h new file mode 100644 index 0000000000000000000000000000000000000000..3e7bf659193938643e4d72c80408b4e707d9992a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/value_and_holder.h @@ -0,0 +1,95 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) 2016-2024 The Pybind Development Team. +// All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +#pragma once + +#include "common.h" + +#include +#include +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +struct value_and_holder { + instance *inst = nullptr; + size_t index = 0u; + const detail::type_info *type = nullptr; + void **vh = nullptr; + + // Main constructor for a found value/holder: + value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) + : inst{i}, index{index}, type{type}, + vh{inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[vpos]} {} + + // Default constructor (used to signal a value-and-holder not found by get_value_and_holder()) + value_and_holder() = default; + + // Used for past-the-end iterator + explicit value_and_holder(size_t index) : index{index} {} + + template + V *&value_ptr() const { + return reinterpret_cast(vh[0]); + } + // True if this `value_and_holder` has a non-null value pointer + explicit operator bool() const { return value_ptr() != nullptr; } + + template + H &holder() const { + return reinterpret_cast(vh[1]); + } + bool holder_constructed() const { + return inst->simple_layout + ? 
inst->simple_holder_constructed + : (inst->nonsimple.status[index] & instance::status_holder_constructed) != 0u; + } + // NOLINTNEXTLINE(readability-make-member-function-const) + void set_holder_constructed(bool v = true) { + if (inst->simple_layout) { + inst->simple_holder_constructed = v; + } else if (v) { + inst->nonsimple.status[index] |= instance::status_holder_constructed; + } else { + inst->nonsimple.status[index] &= (std::uint8_t) ~instance::status_holder_constructed; + } + } + bool instance_registered() const { + return inst->simple_layout + ? inst->simple_instance_registered + : ((inst->nonsimple.status[index] & instance::status_instance_registered) != 0); + } + // NOLINTNEXTLINE(readability-make-member-function-const) + void set_instance_registered(bool v = true) { + if (inst->simple_layout) { + inst->simple_instance_registered = v; + } else if (v) { + inst->nonsimple.status[index] |= instance::status_instance_registered; + } else { + inst->nonsimple.status[index] &= (std::uint8_t) ~instance::status_instance_registered; + } + } +}; + +// This is a semi-public API to check if the corresponding instance has been constructed with a +// holder. That is, if the instance has been constructed with a holder, the `__init__` method is +// called and the C++ object is valid. Otherwise, the C++ object might only be allocated, but not +// initialized. This will lead to **SEGMENTATION FAULTS** if the C++ object is used in any way. +// Example usage: https://pybind11.readthedocs.io/en/stable/advanced/classes.html#custom-type-setup +// for `tp_traverse` and `tp_clear` implementations. +// WARNING: The caller is responsible for ensuring that the `reinterpret_cast` is valid. 
+inline bool is_holder_constructed(PyObject *obj) { + auto *const instance = reinterpret_cast(obj); + return instance->get_value_and_holder().holder_constructed(); +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/basic/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/basic/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..27c149f5f8ffb98d6693df21db5c93f3f870e5d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/basic/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace basic { + +std::shared_ptr create(); + +} // namespace basic +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cma/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cma/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..868b7d823a3d788907f69127654cebacf1e22358 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cma/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace cma { + +std::shared_ptr create(); + +} // namespace cma +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/context.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/context.h new file mode 100644 index 0000000000000000000000000000000000000000..8affe24eae78be35640f3e5f4a9ea238e1dc4522 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/context.h @@ -0,0 +1,113 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace tensorpipe { +namespace channel { + +enum class Endpoint : bool { kConnect, kListen }; + +class Channel; + +// Abstract base class for channel context classes. +// +// Instances of these classes are expected to be registered with a +// context. All registered instances are assumed to be eligible +// channels for all pairs. +// +class Context { + public: + // Return whether the context is able to operate correctly. + // + // Some channel types may be unable to perform as intended under some + // circumstances (e.g., specialized hardware unavailable, lack of + // permissions). They can report it through this method in order for + // the core context to avoid registering them in the first place. + // + virtual bool isViable() const = 0; + + // Return the number of control connections needed to create an instance of + // this channel. + // + // Most channels require only one, but some require more (cuda_basic), and + // some might require none. + // + virtual size_t numConnectionsNeeded() const = 0; + + // Return a map from supported devices to strings describing the device from + // the channel's perspective. + // + // Two processes with a channel context of the same type can leverage this + // channel to make two devices communicate if one side's device descriptor is + // "accepted" by the other one, using the canCommunicateWithRemote method + // below. That method must be symmetric, and unless overridden defaults to + // string comparison. + // + virtual const std::unordered_map& deviceDescriptors() + const = 0; + + // Compare local and remote device descriptors for compatibility. + // + // Determine whether a channel can be opened between a local device and + // a remote one that has the given device descriptor. 
This function + // needs to be symmetric: if we called this method on the remote + // context with the local descriptor we should get the same answer. + // Unless overridden it defaults to string comparison. + // + virtual bool canCommunicateWithRemote( + const std::string& localDeviceDescriptor, + const std::string& remoteDeviceDescriptor) const = 0; + + // Return newly created channel using the specified connections. + // + // It is up to the channel to either use these connections for further + // initialization, or use them directly. Either way, the returned + // channel should be immediately usable. If the channel isn't fully + // initialized yet, take care to queue these operations to execute + // as soon as initialization has completed. + // + virtual std::shared_ptr createChannel( + std::vector>, + Endpoint) = 0; + + // Tell the context what its identifier is. + // + // This is only supposed to be called from the high-level context. It will + // only used for logging and debugging purposes. + virtual void setId(std::string id) = 0; + + // Put the channel context in a terminal state, in turn closing all of its + // channels, and release its resources. This may be done asynchronously, in + // background. + virtual void close() = 0; + + // Wait for all resources to be released and all background activity to stop. + virtual void join() = 0; + + virtual ~Context() = default; + + private: + std::string name_; +}; + +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_basic/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_basic/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..715d476a2618e54b6d828168edf6f4aa57ca41ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_basic/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace cuda_basic { + +std::shared_ptr create(std::shared_ptr cpuContext); + +} // namespace cuda_basic +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/error.h new file mode 100644 index 0000000000000000000000000000000000000000..8cb6e8b1ed9088d261ef8fd56d0172bb8a2d670f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/error.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace cuda_gdr { + +class IbvError final : public BaseError { + public: + explicit IbvError(std::string error) : error_(error) {} + + std::string what() const override { + return error_; + } + + private: + std::string error_; +}; + +} // namespace cuda_gdr +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..32695bedbd095ba759433ce409426904b9308f07 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/factory.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace tensorpipe { +namespace channel { +namespace cuda_gdr { + +std::shared_ptr create( + optional> gpuIdxToNicName = nullopt); + +} // namespace cuda_gdr +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_ipc/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_ipc/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..673ff4b9189164f7bc646ea6133a140dd7ee1d99 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_ipc/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace cuda_ipc { + +std::shared_ptr create(); + +} // namespace cuda_ipc +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_xth/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_xth/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..c9ea4a4f2682cafc0d6ce28d583e2f9bb3334a86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_xth/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace cuda_xth { + +std::shared_ptr create(); + +} // namespace cuda_xth +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/error.h new file mode 100644 index 0000000000000000000000000000000000000000..edacbdc2aa9007117ac4afb69f76eeeb47f7883b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/error.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { + +class ContextClosedError final : public BaseError { + public: + ContextClosedError() {} + + std::string what() const override; +}; + +class ChannelClosedError final : public BaseError { + public: + ChannelClosedError() {} + + std::string what() const override; +}; + +class ContextNotViableError final : public BaseError { + public: + ContextNotViableError() {} + + std::string what() const override; +}; + +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/mpt/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/mpt/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ad01173834d1b4810ac8fc7408e1a3542ef496 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/mpt/factory.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace tensorpipe { +namespace channel { +namespace mpt { + +std::shared_ptr create( + std::vector> contexts, + std::vector> listeners); + +} // namespace mpt +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/xth/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/xth/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..07a0f5f27d925c96fa4a151bcb3db3e9c0fdae01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/xth/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace channel { +namespace xth { + +std::shared_ptr create(); + +} // namespace xth +} // namespace channel +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..c250c27e810659ddf1635cff80f77f8d14fb2d2a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/buffer.h @@ -0,0 +1,140 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace tensorpipe { + +class Buffer { + class AbstractBufferWrapper { + public: + virtual Device device() const = 0; + virtual void copyConstructInto(void* ptr) const = 0; + virtual void moveConstructInto(void* ptr) = 0; + virtual ~AbstractBufferWrapper() = default; + }; + + template + class BufferWrapper : public AbstractBufferWrapper { + static_assert( + std::is_trivially_copyable::value, + "wrapping non-trivially copyable class"); + + public: + TBuffer buffer; + + explicit BufferWrapper(TBuffer buffer) : buffer(std::move(buffer)) {} + + Device device() const override { + return buffer.getDevice(); + } + + void copyConstructInto(void* ptr) const override { + new (ptr) BufferWrapper(*this); + } + + void moveConstructInto(void* ptr) override { + new (ptr) BufferWrapper(std::move(*this)); + } + }; + + public: + template + /* implicit */ Buffer(TBuffer b) { + static_assert( + sizeof(BufferWrapper) <= kStructSize, "kStructSize too small"); + static_assert( + alignof(BufferWrapper) <= kStructAlign, + "kStructAlign too small"); + new (&raw_) BufferWrapper(std::move(b)); + } + + Buffer() : Buffer(CpuBuffer{}) {} + + Buffer(const Buffer& other) { + other.ptr()->copyConstructInto(&raw_); + } + + Buffer& operator=(const Buffer& other) { + if (this != &other) { + ptr()->~AbstractBufferWrapper(); + other.ptr()->copyConstructInto(&raw_); + } + return *this; + } + + Buffer(Buffer&& other) noexcept { + other.ptr()->moveConstructInto(&raw_); + } + + Buffer& operator=(Buffer&& other) { + if (this != &other) { + ptr()->~AbstractBufferWrapper(); + other.ptr()->moveConstructInto(&raw_); + } + return *this; + } + + ~Buffer() { + ptr()->~AbstractBufferWrapper(); + } + + template + TBuffer& unwrap() { + BufferWrapper* wrapperPtr = + dynamic_cast*>(ptr()); + if (wrapperPtr == nullptr) { + throw std::runtime_error("Invalid unwrapping of tensorpipe::Buffer"); + } + return wrapperPtr->buffer; + } + + 
template + const TBuffer& unwrap() const { + const BufferWrapper* wrapperPtr = + dynamic_cast*>(ptr()); + if (wrapperPtr == nullptr) { + throw std::runtime_error("Invalid unwrapping of tensorpipe::Buffer"); + } + return wrapperPtr->buffer; + } + + Device device() const { + return ptr()->device(); + } + + private: + static constexpr int kStructSize = 32; + static constexpr int kStructAlign = 8; + std::aligned_storage::type raw_{}; + + const AbstractBufferWrapper* ptr() const { + // FIXME: Once we go C++17, use std::launder on the returned pointer. + return reinterpret_cast(&raw_); + } + + AbstractBufferWrapper* ptr() { + // FIXME: Once we go C++17, use std::launder on the returned pointer. + return reinterpret_cast(&raw_); + } +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cpu_buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cpu_buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..7f34eb829ca08673643b615147b44931ad062e2e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cpu_buffer.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace tensorpipe { + +struct CpuBuffer { + void* ptr{nullptr}; + + Device getDevice() const { + return Device{kCpuDeviceType, 0}; + } +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cuda_buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cuda_buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..f377784914c96f30071fe2cb7b720d6e8ee2d80b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cuda_buffer.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { + +struct CudaBuffer { + void* ptr{nullptr}; + cudaStream_t stream{cudaStreamDefault}; + + Device getDevice() const; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/device.h new file mode 100644 index 0000000000000000000000000000000000000000..b4a814563b585ddd437b4b1d86fe526e420abf81 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/device.h @@ -0,0 +1,69 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace tensorpipe { + +const std::string kCpuDeviceType{"cpu"}; +const std::string kCudaDeviceType{"cuda"}; + +struct Device { + std::string type; + int index; + + // This pointless constructor is needed to work around a bug in GCC 5.5 (and + // possibly other versions). It appears to be needed in the nop types that + // are used inside nop::Optional. 
+ Device() {} + + Device(std::string type, int index) : type(std::move(type)), index(index) {} + + std::string toString() const { + std::stringstream ss; + ss << type << ":" << index; + return ss.str(); + } + + bool operator==(const Device& other) const { + return type == other.type && index == other.index; + } +}; + +} // namespace tensorpipe + +namespace std { + +template <> +struct hash<::tensorpipe::Device> { + size_t operator()(const ::tensorpipe::Device& device) const noexcept { + return std::hash{}(device.toString()); + } +}; + +template <> +struct hash> { + size_t operator()(const std::pair<::tensorpipe::Device, ::tensorpipe::Device>& + p) const noexcept { + size_t h1 = std::hash<::tensorpipe::Device>{}(p.first); + size_t h2 = std::hash<::tensorpipe::Device>{}(p.second); + // Shifting one hash to avoid collisions between (a, b) and (b, a). + return h1 ^ (h2 << 1); + } +}; + +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/error.h new file mode 100644 index 0000000000000000000000000000000000000000..22d56522a1563d03ab471402a91ff24748a2a9af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/error.h @@ -0,0 +1,132 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace tensorpipe { + +// Base class for actual errors. 
+class BaseError { + public: + virtual ~BaseError() = default; + + // Returns an explanatory string. + // Like `std::exception` but returns a `std::string`. + virtual std::string what() const = 0; +}; + +// Wrapper class for errors. +// +// Background: we wish to not use exceptions yet need an error +// representation that can propagate across function and thread +// boundaries. This representation must be copyable (so we can store +// and return it at a later point in time) and retain downstream type +// information. This implies a heap allocation because it's the +// easiest way to deal with variable size objects (barring a union of +// all downstream error classes and a lot of custom code). Instead of +// passing a shared_ptr around directly, we use this wrapper class to +// keep implementation details hidden from calling code. +// +class Error final { + public: + // Constant instance that indicates success. + static const Error kSuccess; + + // Default constructor for error that is not an error. + Error() {} + + Error(std::shared_ptr error, std::string file, int line) + : error_(std::move(error)), file_(std::move(file)), line_(line) {} + + ~Error() = default; + + // Converting to boolean means checking if there is an error. This + // means we don't need to use an `std::optional` and allows for a + // snippet like the following: + // + // if (error) { + // // Deal with it. + // } + // + operator bool() const { + return static_cast(error_); + } + + template + std::shared_ptr castToType() const { + return std::dynamic_pointer_cast(error_); + } + + template + bool isOfType() const { + return castToType() != nullptr; + } + + // Like `std::exception` but returns a `std::string`. 
+ std::string what() const; + + private: + std::shared_ptr error_; + std::string file_; + int line_; +}; + +class SystemError final : public BaseError { + public: + explicit SystemError(const char* syscall, int error) + : syscall_(syscall), error_(error) {} + + std::string what() const override; + + int errorCode() const; + + private: + const char* syscall_; + const int error_; +}; + +class ShortReadError final : public BaseError { + public: + ShortReadError(ssize_t expected, ssize_t actual) + : expected_(expected), actual_(actual) {} + + std::string what() const override; + + private: + const ssize_t expected_; + const ssize_t actual_; +}; + +class ShortWriteError final : public BaseError { + public: + ShortWriteError(ssize_t expected, ssize_t actual) + : expected_(expected), actual_(actual) {} + + std::string what() const override; + + private: + const ssize_t expected_; + const ssize_t actual_; +}; + +class EOFError final : public BaseError { + public: + EOFError() {} + + std::string what() const override; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/optional.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/optional.h new file mode 100644 index 0000000000000000000000000000000000000000..14f2ef1a80cad8ea5b1599751895dc69daa6121b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/optional.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace tensorpipe { + +using std::optional; +using std::nullopt; + +} // namespace tensorpipe + + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/context.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/context.h new file mode 100644 index 0000000000000000000000000000000000000000..8564d6b6685d10ce6f7b55e8dca1c003def24ae1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/context.h @@ -0,0 +1,101 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace tensorpipe { + +class ContextImpl; +class Listener; +class Pipe; + +class ContextOptions { + public: + // The name should be a semantically meaningful description of this context. 
+ // It will only be used for logging and debugging purposes, to identify the + // endpoints of a pipe. + ContextOptions&& name(std::string name) && { + name_ = std::move(name); + return std::move(*this); + } + + private: + std::string name_; + + friend ContextImpl; +}; + +class PipeOptions { + public: + // The name should be a semantically meaningful description of the context + // that the pipe is connecting to. It will only be used for logging and + // debugging purposes, to identify the endpoints of a pipe. + PipeOptions&& remoteName(std::string remoteName) && { + remoteName_ = std::move(remoteName); + return std::move(*this); + } + + private: + std::string remoteName_; + + friend ContextImpl; +}; + +class Context final { + public: + explicit Context(ContextOptions opts = ContextOptions()); + + void registerTransport( + int64_t priority, + std::string transport, + std::shared_ptr context); + + void registerChannel( + int64_t priority, + std::string channel, + std::shared_ptr context); + + std::shared_ptr listen(const std::vector& urls); + + std::shared_ptr connect( + const std::string& url, + PipeOptions opts = PipeOptions()); + + // Put the context in a terminal state, in turn closing all of its pipes and + // listeners, and release its resources. This may be done asynchronously, in + // background. + void close(); + + // Wait for all resources to be released and all background activity to stop. + void join(); + + ~Context(); + + private: + // The implementation is managed by a shared_ptr because each child object + // will also hold a shared_ptr to it. However, its lifetime is tied to the one + // of this public object since when the latter is destroyed the implementation + // is closed and joined. + const std::shared_ptr impl_; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/error.h new file mode 100644 index 0000000000000000000000000000000000000000..b20c9cadbc58959e81ff7b97577000f146f067a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/error.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { + +class LogicError final : public BaseError { + public: + explicit LogicError(std::string reason) : reason_(std::move(reason)) {} + + std::string what() const override; + + private: + const std::string reason_; +}; + +class ContextClosedError final : public BaseError { + public: + explicit ContextClosedError() {} + + std::string what() const override; +}; + +class ListenerClosedError final : public BaseError { + public: + explicit ListenerClosedError() {} + + std::string what() const override; +}; + +class PipeClosedError final : public BaseError { + public: + explicit PipeClosedError() {} + + std::string what() const override; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/listener.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/listener.h new file mode 100644 index 0000000000000000000000000000000000000000..122de98c9d7b8e8ee12463a431dec4e3a82177a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/listener.h @@ -0,0 +1,101 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace tensorpipe { + +class ContextImpl; +class ListenerImpl; +class Pipe; + +// The listener. +// +// Listeners are used to produce pipes. Depending on the type of the +// context, listeners may use a variety of addresses to listen on. For +// example, for TCP/IP sockets they listen on an IPv4 or IPv6 address, +// for Unix domain sockets they listen on a path, etcetera. +// +// A pipe can only be accepted from this listener after it has been +// fully established. This means that both its connection and all its +// side channels have been established. +// +class Listener final { + // Use the passkey idiom to allow make_shared to call what should be a private + // constructor. See https://abseil.io/tips/134 for more information. 
+ struct ConstructorToken {}; + + public: + Listener( + ConstructorToken token, + std::shared_ptr context, + std::string id, + const std::vector& urls); + + // + // Entry points for user code + // + + using accept_callback_fn = + std::function)>; + + void accept(accept_callback_fn fn); + + // Returns map with the materialized address of listeners by transport. + // + // If you don't bind a transport listener to a specific port or address, it + // may generate its address automatically. Then, in order to connect to the + // listener, the user must use a separate mechanism to communicate the + // materialized address to whoever wants to connect. + // + const std::map& addresses() const; + + // Returns materialized address for specific transport. + // + // See `addresses()` for more information. + // + const std::string& address(const std::string& transport) const; + + // Returns URL with materialized address for specific transport. + // + // See `addresses()` for more information. + // + std::string url(const std::string& transport) const; + + // Put the listener in a terminal state, aborting its pending operations and + // rejecting future ones, and release its resrouces. This may be carried out + // asynchronously, in background. Since the pipes may occasionally use the + // listener to open new connections, closing a listener may trigger errors + // in the pipes. + void close(); + + ~Listener(); + + private: + // Using a shared_ptr allows us to detach the lifetime of the implementation + // from the public object's one and perform the destruction asynchronously. + const std::shared_ptr impl_; + + // Allow context to access constructor token. + friend ContextImpl; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/message.h new file mode 100644 index 0000000000000000000000000000000000000000..87106638ca97509d7ba4cbfb767ce8634046f527 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/message.h @@ -0,0 +1,109 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace tensorpipe { + +// Messages consist of a primary buffer and zero or more separate +// buffers. The primary buffer is always a host-side memory region that +// contains a serialized version of the message we're dealing with. This +// serialized message, in turn, may have references to the separate +// buffers that accompany the primary buffer. These separate buffers may +// point to any type of memory, host-side or device-side. +// +class Message final { + public: + std::string metadata; + + struct Payload { + void* data{nullptr}; + size_t length{0}; + + // Users may include arbitrary metadata in the following fields. + // This may contain allocation hints for the receiver, for example. + std::string metadata; + }; + + // Holds the payloads that are transferred over the primary connection. + std::vector payloads; + + struct Tensor { + tensorpipe::Buffer buffer; + size_t length{0}; + + // Users may optionally specify the target device, on which the receiver + // should allocate memory for this tensor. If left unset, the receiver will + // choose one at their convenience. 
+ optional targetDevice; + + // Users may include arbitrary metadata in the following field. + // This may contain allocation hints for the receiver, for example. + std::string metadata; + }; + + // Holds the tensors that are offered to the side channels. + std::vector tensors; +}; + +// Descriptors consist of metadata required by the receiver to allocate memory +// for an incoming message. +class Descriptor final { + public: + std::string metadata; + + struct Payload { + size_t length{0}; + std::string metadata; + }; + std::vector payloads; + + struct Tensor { + size_t length{0}; + + // This is the sender-side device from which this tensor is being sent. + Device sourceDevice; + + // The sender may optionally specify a target device, in which case the + // receiver must allocate memory for this tensor on the specified device. + optional targetDevice; + + std::string metadata; + }; + std::vector tensors; +}; + +// Allocations consist of actual memory allocations provided by the receiver for +// an incoming message. They must match the length and target devices specified +// in the corresponding Descriptor. +class Allocation final { + public: + struct Payload { + void* data{nullptr}; + }; + std::vector payloads; + + struct Tensor { + tensorpipe::Buffer buffer; + }; + std::vector tensors; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/pipe.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/pipe.h new file mode 100644 index 0000000000000000000000000000000000000000..a7192bf2155033c84a2577dae9c48951a06dbbf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/pipe.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace tensorpipe { + +class ContextImpl; +class ListenerImpl; +class PipeImpl; + +// The pipe. +// +// Pipes represent a set of connections between a pair of processes. +// Unlike POSIX pipes, they are message oriented instead of byte +// oriented. Messages that are sent through the pipe may use whatever +// channels are at their disposal to make it happen. If the pair of +// processes happen to be colocated on the same machine, they may +// leverage a region of shared memory to communicate the primary +// buffer of a message. Secondary buffers may use shared memory as +// well, if they're located in CPU memory, or use a CUDA device to +// device copy if they're located in NVIDIA GPU memory. If the pair is +// located across the world, they may simply use a set of TCP +// connections to communicate. +// +class Pipe final { + // Use the passkey idiom to allow make_shared to call what should be a private + // constructor. See https://abseil.io/tips/134 for more information. 
+ struct ConstructorToken {}; + + public: + // + // Initialization + // + + Pipe( + ConstructorToken token, + std::shared_ptr context, + std::string id, + std::string remoteName, + const std::string& url); + + Pipe(ConstructorToken token, std::shared_ptr impl); + + // + // Entry points for user code + // + + using read_descriptor_callback_fn = + std::function; + + void readDescriptor(read_descriptor_callback_fn fn); + + using read_callback_fn = std::function; + + void read(Allocation allocation, read_callback_fn fn); + + using write_callback_fn = std::function; + + void write(Message message, write_callback_fn fn); + + // Retrieve the user-defined name that was given to the constructor of the + // context on the remote side, if any (if not, this will be the empty string). + // This is intended to help in logging and debugging only. + const std::string& getRemoteName(); + + // Put the pipe in a terminal state, aborting its pending operations and + // rejecting future ones, and release its resrouces. This may be carried out + // asynchronously, in background. + void close(); + + ~Pipe(); + + private: + // Using a shared_ptr allows us to detach the lifetime of the implementation + // from the public object's one and perform the destruction asynchronously. + const std::shared_ptr impl_; + + // Allow context to access constructor token. + friend ContextImpl; + // Allow listener to access constructor token. + friend ListenerImpl; +}; + +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/context.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/context.h new file mode 100644 index 0000000000000000000000000000000000000000..0cbecc4880b08343ca91c43217412476aeeee806 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/context.h @@ -0,0 +1,83 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace tensorpipe { +namespace transport { + +class Connection; +class Listener; + +class Context { + public: + virtual std::shared_ptr connect(std::string addr) = 0; + + virtual std::shared_ptr listen(std::string addr) = 0; + + // Return whether the context is able to operate correctly. + // + // Some transport types may be unable to perform as intended under + // some circumstances (e.g., specialized hardware unavailable, lack + // of permissions). They can report it through this method in order + // for the core context to avoid registering them in the first place. + // + virtual bool isViable() const = 0; + + // Return string to describe the domain for this context. + // + // Two processes with a context of the same type can connect to each + // other if one side's domain descriptor is "accepted" by the other + // one, using the canCommunicateWithRemote method below. That method + // must be symmetric, and unless overridden defaults to string + // comparison. 
+ // + // For example, for a transport that leverages TCP/IP, this may be + // as simple as the address family (assuming we can route between + // any two processes). For a transport that leverages shared memory, + // this descriptor must uniquely identify the machine, such that + // only co-located processes generate the same domain descriptor. + // + virtual const std::string& domainDescriptor() const = 0; + + // Compare local and remote domain descriptor for compatibility. + // + // Determine whether a connection can be opened between this context + // and a remote one that has the given domain descriptor. This + // function needs to be symmetric: if we called this method on the + // remote context with the local descriptor we should get the same + // answer. Unless overridden it defaults to string comparison. + // + virtual bool canCommunicateWithRemote( + const std::string& remoteDomainDescriptor) const { + return domainDescriptor() == remoteDomainDescriptor; + } + + // Tell the context what its identifier is. + // + // This is only supposed to be called from the high-level context or from + // channel contexts. It will only used for logging and debugging purposes. + virtual void setId(std::string id) = 0; + + virtual void close() = 0; + + virtual void join() = 0; + + virtual ~Context() = default; +}; + +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/error.h new file mode 100644 index 0000000000000000000000000000000000000000..0f7c866850144dfe9c0f465d733ea472336c95de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/error.h @@ -0,0 +1,52 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace transport { + +class ContextClosedError final : public BaseError { + public: + ContextClosedError() {} + + std::string what() const override; +}; + +class ListenerClosedError final : public BaseError { + public: + ListenerClosedError() {} + + std::string what() const override; +}; + +class ConnectionClosedError final : public BaseError { + public: + ConnectionClosedError() {} + + std::string what() const override; +}; + +class ContextNotViableError final : public BaseError { + public: + ContextNotViableError() {} + + std::string what() const override; +}; + +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/error.h new file mode 100644 index 0000000000000000000000000000000000000000..de0bb2fc3516dde57ac71961136322b85a3c63fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/error.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace ibv { + +class IbvError final : public BaseError { + public: + explicit IbvError(std::string error) : error_(error) {} + + std::string what() const override; + + private: + std::string error_; +}; + +class GetaddrinfoError final : public BaseError { + public: + explicit GetaddrinfoError(int error) : error_(error) {} + + std::string what() const override; + + private: + int error_; +}; + +class NoAddrFoundError final : public BaseError { + public: + NoAddrFoundError() {} + + std::string what() const override; +}; + +} // namespace ibv +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..b708a78b38c40c77122b1d0fc6f3389adec61725 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace ibv { + +std::shared_ptr create(); + +} // namespace ibv +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/utility.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..bbaac69ca4162d2db739311b029be5868037a156 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/utility.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace ibv { + +std::tuple lookupAddrForIface(std::string iface); + +std::tuple lookupAddrForHostname(); + +} // namespace ibv +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/shm/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/shm/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..e1124fba3072ae66c1a77dcd881a7fbfbbed4da8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/shm/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace shm { + +std::shared_ptr create(); + +} // namespace shm +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/error.h new file mode 100644 index 0000000000000000000000000000000000000000..efb26e52877aea08aacf4f03b37fa3c5bae1c55a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/error.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace uv { + +class UVError final : public BaseError { + public: + explicit UVError(int error) : error_(error) {} + + std::string what() const override; + + private: + int error_; +}; + +class NoAddrFoundError final : public BaseError { + public: + NoAddrFoundError() {} + + std::string what() const override; +}; + +} // namespace uv +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/factory.h new file mode 100644 index 0000000000000000000000000000000000000000..6a4ec2cabc45da9a481e836c1d07cb18a38ae5e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/factory.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace uv { + +std::shared_ptr create(); + +} // namespace uv +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/utility.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..0c3b757b1e8c2b27327abb6274d5b3a0d60a2085 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/utility.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +#include +#include + +namespace tensorpipe { +namespace transport { +namespace uv { + +std::tuple lookupAddrForIface(std::string iface); + +std::tuple lookupAddrForHostname(); + +// Try to replicate the same logic used by NCCL to find a node's own address. +// Roughly, it returns the "first" usable address it can find, and prioritizes +// the interfaces with an `ib` prefix and de-prioritizes those with a `docker` +// or `lo` prefix. It can optionally only return only IPv4 or IPv4 addresses. +std::tuple lookupAddrLikeNccl( + optional familyFilter = nullopt); + +} // namespace uv +} // namespace transport +} // namespace tensorpipe + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53156bfaf0fa8488f6fbee5cdb9bb230b8b7fe57 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/_reduction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/_reduction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecf2fd803f71d966944d3bb18bb5ca5c6356a297 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/_reduction.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/common_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/common_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4319a31810762cca4ef656caf92dcf9278cdca53 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/common_types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/cpp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/cpp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60932c45cc966cc136a13e6e70a69f75cf2769a8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/cpp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/grad.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/grad.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e47580790177768defa324d74b29797c256014e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/grad.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/init.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/init.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1f75dec49dee7b1fc70e7821b1d05400f109742 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/init.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/parameter.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/parameter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64385e60d07b87b2859775340ef274861bcb3286 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/parameter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..438a8bc55caf0b73780288496a943848d7a71191 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/__init__.py @@ -0,0 +1,187 @@ +# mypy: allow-untyped-defs +"""This module contains functions and classes that alter the behavior of torch.nn.functional.scaled_dot_product_attention""" + +import contextlib +from collections.abc import Iterable +from typing import Union +from warnings import warn + +import torch.backends.cuda +from torch._C import _SDPBackend as SDPBackend +from torch.backends.cuda import ( + can_use_efficient_attention, + can_use_flash_attention, + SDPAParams, +) + + +__all__: list[str] = [ + "SDPBackend", + "sdpa_kernel", + "WARN_FOR_UNFUSED_KERNELS", + "register_flash_attention_impl", + "activate_flash_attention_impl", + "list_flash_attention_impls", + "current_flash_attention_impl", +] + + +# Note: [SDPA warnings] +# TODO: Consider using this for sdpa regardless of subclasses +# This only effects users of bias subclasses +# If this is set to True, we will warn the user if they are not using the fused kernels +# As well, it will raise warnings for all the reasons why the fused kernels can't be run. 
+# To set this to True, run +# torch.nn.attention.WARN_FOR_UNFUSED_KERNELS = True +WARN_FOR_UNFUSED_KERNELS = False + + +r"""An enum-like class that contains the different backends for scaled dot product attention. + This backend class is designed to be used with the sdpa_kernel context manager. + + The following Enums are available: + - ERROR: An error occurred when trying to determine the backend. + - MATH: The math backend for scaled dot product attention. + - FLASH_ATTENTION: The flash attention backend for scaled dot product attention. + - EFFICIENT_ATTENTION: The efficient attention backend for scaled dot product attention. + - CUDNN_ATTENTION: The cuDNN backend for scaled dot product attention. + - OVERRIDEABLE: The overridable backend for extension. + + See :func:`torch.nn.attention.sdpa_kernel` for more details. + + .. warning:: This class is in beta and subject to change. +""" +SDPBackend.__module__ = __name__ +SDPBackend.__name__ = "SDPBackend" + + +def _raise_kernel_warnings(params: SDPAParams) -> None: + """ + If WARN_FOR_UNFUSED_KERNELS is set to True, this will raise warnings + for all the reasons why the fused kernels can't be run. 
If using subclasses + """ + if WARN_FOR_UNFUSED_KERNELS: + if not can_use_efficient_attention(params): + warn("Efficient attention can't be used because:", stacklevel=2) + can_use_efficient_attention(params, True) + if not can_use_flash_attention(params): + warn("Flash attention can't be used because:", stacklevel=2) + can_use_flash_attention(params, True) + + +_backend_names = { + "cudnn": "CUDNN_ATTENTION", + "flash": "FLASH_ATTENTION", + "mem_efficient": "EFFICIENT_ATTENTION", + "math": "MATH", + "overrideable": "OVERRIDEABLE", +} + + +def _backend_from_string(name: str): + return getattr(SDPBackend, name) + + +def _cur_sdpa_kernel_backends(with_priority: bool = False): + backends = [] + for name, val in _backend_names.items(): + if getattr(torch._C, f"_get_{name}_sdp_enabled")(): + backends.append(getattr(SDPBackend, val)) + if with_priority: + curr_priority = torch._C._get_sdp_priority_order() + backends = sorted( + backends, key=lambda backend: curr_priority.index(int(backend)) + ) + return backends + + +def _sdpa_kernel(backends: Iterable, set_priority: bool = False) -> None: + for name, val in _backend_names.items(): + enabled = getattr(SDPBackend, val) in backends + getattr(torch._C, f"_set_sdp_use_{name}")(enabled) + if set_priority: + # backends should be a unique list + user_priority = [int(backend) for backend in backends] + previous_priority = torch._C._get_sdp_priority_order() + for backend in previous_priority: + if backend not in user_priority: + user_priority.append(int(backend)) + torch._C._set_sdp_priority_order(user_priority) + + +@contextlib.contextmanager +def sdpa_kernel(backends: list[SDPBackend] | SDPBackend, set_priority: bool = False): + r""" + Context manager to select which backend to use for scaled dot product attention. + + .. warning:: This function is beta and subject to change. + + Args: + backends (Union[List[SDPBackend], SDPBackend]): A backend or list of backends for scaled dot product attention. 
+ set_priority_order (bool=False): Whether the ordering of the backends is interpreted as their priority order. + + Example: + + .. code-block:: python + + from torch.nn.functional import scaled_dot_product_attention + from torch.nn.attention import SDPBackend, sdpa_kernel + + # Only enable flash attention backend + with sdpa_kernel(SDPBackend.FLASH_ATTENTION): + scaled_dot_product_attention(...) + + # Enable the Math or Efficient attention backends + with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]): + scaled_dot_product_attention(...) + + This context manager can be used to select which backend to use for scaled dot product attention. + Upon exiting the context manager, the previous state of the flags will be restored, enabling all backends. + """ + assert isinstance(backends, (list, SDPBackend)), ( + "Backend must be an instance of SDPBackend or a list of SDPBackend instances" + ) + + if isinstance(backends, SDPBackend): + backends = [backends] + + backends = list(dict.fromkeys(backends)) + + previous_backends = _cur_sdpa_kernel_backends(with_priority=set_priority) + try: + _sdpa_kernel(backends, set_priority) + yield {} + finally: + _sdpa_kernel(previous_backends, set_priority) + + +# variadic version of sdpa_kernel for dynamo to use while reconstructing +@contextlib.contextmanager +def _sdpa_kernel_variadic(*backends: SDPBackend): + with sdpa_kernel(list(backends)): + yield + + +def _get_flash_version() -> str: + """This returns the closest matching tag for the flash attention backend""" + return "2.5.7" + + +from . 
import _registry + + +# Re-export registry types and functions for public API +_FlashAttentionImpl = _registry._FlashAttentionImpl +_RegisterFn = _registry._RegisterFn +register_flash_attention_impl = _registry.register_flash_attention_impl +activate_flash_attention_impl = _registry.activate_flash_attention_impl +list_flash_attention_impls = _registry.list_flash_attention_impls +current_flash_attention_impl = _registry.current_flash_attention_impl + +register_flash_attention_impl.__module__ = __name__ +activate_flash_attention_impl.__module__ = __name__ +list_flash_attention_impls.__module__ = __name__ +current_flash_attention_impl.__module__ = __name__ + +# Import built-in implementations to trigger self-registration +from . import _fa4 # noqa: F401 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.py new file mode 100644 index 0000000000000000000000000000000000000000..1be960ee53218e4fe01ac1f16c7416ecc0ff3822 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.py @@ -0,0 +1,456 @@ +"""UBER PROTOTYPE!!!""" +# mypy: allow-untyped-defs + +from __future__ import annotations + +import importlib +from dataclasses import dataclass +from functools import cache +from typing import Any, TYPE_CHECKING +from typing_extensions import TypeVarTuple, Unpack + +from . 
import _registry + + +if TYPE_CHECKING: + from types import ModuleType + +import torch +from torch.library import Library + + +__all__ = [ + "register_flash_attention_fa4", +] + + +_FA4_MODULE_PATH: str | None = None + + +@dataclass +class _FA4Handle: + library: Library | None + + def remove(self) -> None: + self.library = None + + +@cache +def _get_device_major(device: torch.device) -> int: + major, _ = torch.cuda.get_device_capability(device) + return major + + +def register_flash_attention_fa4( + module_path: str = "flash_attn.cute.interface", +) -> _FA4Handle: + """ + Register FA4 flash attention kernels with the PyTorch dispatcher. + + Args: + module_path: Python module path to the FA4 implementation. + """ + global _FA4_MODULE_PATH + _ = _fa4_import_module(module_path) + _FA4_MODULE_PATH = module_path + return _FA4Handle(_fa4_register_kernels()) + + +@cache +def _fa4_import_module(module_path: str) -> ModuleType: + module = importlib.import_module(module_path) + if not hasattr(module, "_flash_attn_fwd") or not hasattr(module, "_flash_attn_bwd"): + raise RuntimeError(f"Module '{module_path}' does not expose FA4 kernels") + return module + + +def _fa4_register_kernels() -> Library: + lib = Library("aten", "IMPL", "CUDA") # noqa: TOR901 + lib.impl("_flash_attention_forward", _fa4_flash_attention_forward_impl, "CUDA") + lib.impl("_flash_attention_backward", _fa4_flash_attention_backward_impl, "CUDA") + lib.impl( + "_scaled_dot_product_flash_attention", + _fa4_scaled_dot_product_flash_attention_forward_impl, + "CUDA", + ) + lib.impl( + "_scaled_dot_product_flash_attention_backward", + _fa4_scaled_dot_product_flash_attention_backward_impl, + "CUDA", + ) + return lib + + +def _fa4_common_support_error( + query: torch.Tensor, + tensors: tuple[torch.Tensor, ...], + cum_seq_q: torch.Tensor | None, + require_fp32: tuple[tuple[str, torch.Tensor], ...] 
= (), +) -> str | None: + if not all(t.is_cuda for t in tensors): + return "inputs must be CUDA tensors" + if len({t.device for t in tensors}) != 1: + return "inputs must share device" + if query.dtype not in (torch.float16, torch.bfloat16): + return "query dtype must be float16 or bfloat16" + for name, tensor in require_fp32: + if tensor.dtype != torch.float32: + return f"{name} dtype must be float32" + if cum_seq_q is None and query.dim() != 4: + return "dense query must be 4D" + if cum_seq_q is not None and query.dim() != 3: + return "ragged query must be 3D" + if not torch.cuda.is_available(): + return "CUDA not available" + if _get_device_major(query.device) not in (9, 10): + return "FA4 requires compute capability 9.0 or 10.0" + return None + + +def _fa4_forward_support_error( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + dropout_p: float, + return_debug_mask: bool, + alibi_slopes: torch.Tensor | None, + seqused_k: torch.Tensor | None, + cum_seq_q: torch.Tensor | None, +) -> str | None: + if dropout_p != 0.0: + return "dropout_p must be 0" + if return_debug_mask: + return "return_debug_mask must be False" + if alibi_slopes is not None: + return "alibi_slopes not supported" + if seqused_k is not None: + if seqused_k.dtype != torch.int32: + return "seqused_k must be int32" + if not seqused_k.is_cuda: + return "seqused_k must be CUDA" + error = _fa4_common_support_error( + query, + (query, key, value), + cum_seq_q, + ) + if error is not None: + if error == "inputs must share device": + return "query, key, value must be on same device" + return error + return None + + +def _fa4_backward_support_error( + grad_out: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + logsumexp: torch.Tensor, + dropout_p: float, + cum_seq_q: torch.Tensor | None, + window_size_left: int | None, + window_size_right: int | None, +) -> str | None: + if dropout_p != 0.0: + return "dropout_p must be 0" + if 
window_size_left is not None or window_size_right is not None: + return "windowed attention not supported" + error = _fa4_common_support_error( + query, + (grad_out, query, key, value, out, logsumexp), + cum_seq_q, + require_fp32=(("logsumexp", logsumexp),), + ) + if error is not None: + return error + return None + + +Ts = TypeVarTuple("Ts") + + +def _transpose_dense(*tensors: Unpack[Ts]) -> tuple[Unpack[Ts]]: + return tuple(t.transpose(1, 2) for t in tensors) # type: ignore[attr-defined] + + +def _fa4_run_forward( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seq_q: torch.Tensor | None, + cu_seq_k: torch.Tensor | None, + scale: float | None, + is_causal: bool, + window_size_left: int | None, + window_size_right: int | None, + seqused_k: torch.Tensor | None, + out: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + if _FA4_MODULE_PATH is None: + raise RuntimeError("FA4 not registered") + module = _fa4_import_module(_FA4_MODULE_PATH) + + kwargs: dict[str, Any] = { + "softmax_scale": scale, + "causal": is_causal, + "window_size_left": window_size_left, + "window_size_right": window_size_right, + "return_lse": True, + "cu_seqlens_q": cu_seq_q, + "cu_seqlens_k": cu_seq_k, + "seqused_k": seqused_k.contiguous() if seqused_k is not None else None, + } + if out is not None: + kwargs["out"] = out + out, lse = module._flash_attn_fwd(query, key, value, **kwargs) + return out, lse.contiguous() + + +def _fa4_run_backward( + grad_out: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + logsumexp: torch.Tensor, + cu_seq_q: torch.Tensor | None, + cu_seq_k: torch.Tensor | None, + scale: float | None, + is_causal: bool, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if _FA4_MODULE_PATH is None: + raise RuntimeError("FA4 not registered") + module = _fa4_import_module(_FA4_MODULE_PATH) + dq, dk, dv = module._flash_attn_bwd( + query, + key, + value, + out, + grad_out, + 
logsumexp.contiguous(), + softmax_scale=scale, + causal=is_causal, + cu_seqlens_q=cu_seq_q, + cu_seqlens_k=cu_seq_k, + ) + return dq, dk, dv + + +def _fa4_flash_attention_forward_impl( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cum_seq_q: torch.Tensor | None, + cum_seq_k: torch.Tensor | None, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + return_debug_mask: bool, + *, + scale: float | None = None, + window_size_left: int | None = None, + window_size_right: int | None = None, + seqused_k: torch.Tensor | None = None, + alibi_slopes: torch.Tensor | None = None, + out: torch.Tensor | None = None, +): + error = _fa4_forward_support_error( + query, + key, + value, + dropout_p, + return_debug_mask, + alibi_slopes, + seqused_k, + cum_seq_q, + ) + if error is not None: + raise RuntimeError(f"FA4 flash_attention forward unsupported: {error}") + out, lse = _fa4_run_forward( + query, + key, + value, + cum_seq_q, + cum_seq_k, + scale, + is_causal, + window_size_left, + window_size_right, + seqused_k, + out, + ) + rng_state = torch.zeros((2,), dtype=torch.uint64, device=query.device) + philox_offset = torch.zeros((), dtype=torch.uint64, device=query.device) + debug_mask = torch.empty(0, dtype=query.dtype, device=query.device) + return out, lse, rng_state, philox_offset, debug_mask + + +def _fa4_flash_attention_backward_impl( + grad_out: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + logsumexp: torch.Tensor, + cum_seq_q: torch.Tensor | None, + cum_seq_k: torch.Tensor | None, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + rng_state: torch.Tensor, + unused: torch.Tensor, + *, + scale: float | None = None, + window_size_left: int | None = None, + window_size_right: int | None = None, +): + error = _fa4_backward_support_error( + grad_out, + query, + key, + value, + out, + logsumexp, + dropout_p, + cum_seq_q, + window_size_left, + window_size_right, + ) + if 
error is not None: + raise RuntimeError(f"FA4 flash_attention backward unsupported: {error}") + dq, dk, dv = _fa4_run_backward( + grad_out, + query, + key, + value, + out, + logsumexp, + cum_seq_q, + cum_seq_k, + scale, + is_causal, + ) + return dq, dk, dv + + +def _fa4_scaled_dot_product_flash_attention_forward_impl( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + dropout_p: float = 0.0, + is_causal: bool = False, + return_debug_mask: bool = False, + *, + scale: float | None = None, +): + error = _fa4_forward_support_error( + query, + key, + value, + dropout_p, + return_debug_mask, + None, + None, + None, + ) + if error is not None: + raise RuntimeError(f"FA4 SDPA forward unsupported: {error}") + q, k, v = _transpose_dense(query, key, value) + + # Pre-allocate output with query's strides (BHSD layout), then create + # a BSHD view for the kernel. This ensures the returned output has + # the same memory layout as the input query. + out_bhsd = torch.empty_like(query) + out_bshd = out_bhsd.transpose(1, 2) + + max_q_flash = q.size(1) + max_k_flash = k.size(1) + _, lse, rng_state, philox_offset, debug_mask = _fa4_flash_attention_forward_impl( + q, + k, + v, + None, + None, + max_q_flash, + max_k_flash, + dropout_p, + is_causal, + return_debug_mask, + scale=scale, + out=out_bshd, + ) + max_q = query.size(2) + max_k = key.size(2) + return ( + out_bhsd, + lse, + None, + None, + max_q, + max_k, + rng_state, + philox_offset, + debug_mask, + ) + + +def _fa4_scaled_dot_product_flash_attention_backward_impl( + grad_out: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + logsumexp: torch.Tensor, + cum_seq_q: torch.Tensor | None, + cum_seq_k: torch.Tensor | None, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + philox_seed: torch.Tensor, + philox_offset: torch.Tensor, + *, + scale: float | None = None, +): + error = _fa4_backward_support_error( + grad_out, + query, + key, + value, + out, 
+ logsumexp, + dropout_p, + None, + None, + None, + ) + if error is not None: + raise RuntimeError(f"FA4 SDPA backward unsupported: {error}") + q, k, v, o, go = _transpose_dense(query, key, value, out, grad_out) + max_q = query.size(2) + max_k = key.size(2) + dq, dk, dv = _fa4_flash_attention_backward_impl( + go, + q, + k, + v, + o, + logsumexp, + None, + None, + max_q, + max_k, + dropout_p, + is_causal, + philox_seed, + philox_offset, + scale=scale, + ) + dq, dk, dv = _transpose_dense(dq, dk, dv) + return dq, dk, dv + + +_registry.register_flash_attention_impl("FA4", register_fn=register_flash_attention_fa4) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_registry.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_registry.py new file mode 100644 index 0000000000000000000000000000000000000000..883252d56f8b65cfa258d9d77ed463b374fd77ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_registry.py @@ -0,0 +1,109 @@ +# mypy: allow-untyped-defs +"""Registry for flash attention implementations. + +This module contains the registration system for flash attention implementations. +It has no torch dependencies to avoid circular imports during initialization. +""" + +from collections.abc import Callable +from typing import Literal, Protocol + + +class FlashAttentionHandle(Protocol): + def remove(self) -> None: ... + + +_RegisterFn = Callable[..., FlashAttentionHandle | None] +_FlashAttentionImpl = Literal["FA4"] + +_FLASH_ATTENTION_IMPLS: dict[str, _RegisterFn] = {} + +_FLASH_ATTENTION_ACTIVE: str | None = None +_FLASH_ATTENTION_HANDLES: dict[str, FlashAttentionHandle] = {} + + +def register_flash_attention_impl( + impl: str | _FlashAttentionImpl, + *, + register_fn: _RegisterFn, +) -> None: + """ + Register the callable that activates a flash attention impl. + + .. 
note:: + This function is intended for SDPA backend providers to register their + implementations. End users should use :func:`activate_flash_attention_impl` + to activate a registered implementation. + + Args: + impl: Implementation identifier (e.g., ``"FA4"``). + register_fn: Callable that performs the actual dispatcher registration. + This function will be invoked by :func:`activate_flash_attention_impl` + and should register custom kernels with the PyTorch dispatcher. + It may optionally return a handle implementing + :class:`FlashAttentionHandle` to keep any necessary state alive. + + Example: + >>> def my_impl_register(module_path: str = "my_flash_impl"): + ... # Register custom kernels with torch dispatcher + ... pass # doctest: +SKIP + >>> register_flash_attention_impl( + ... "MyImpl", register_fn=my_impl_register + ... ) # doctest: +SKIP + """ + _FLASH_ATTENTION_IMPLS[impl] = register_fn + + +def activate_flash_attention_impl( + impl: str | _FlashAttentionImpl, +) -> None: + """ + Activate into the dispatcher a previously registered flash attention impl. + + .. note:: + Backend providers should NOT automatically activate their implementation + on import. Users should explicitly opt-in by calling this function or via + environment variables to ensure multiple provider libraries can coexist. + + Args: + impl: Implementation identifier to activate. See + :func:`~torch.nn.attention.list_flash_attention_impls` for available + implementations. + If the backend's :func:`register_flash_attention_impl` callable + returns a :class:`FlashAttentionHandle`, the registry keeps that + handle alive for the lifetime of the process (until explicit + uninstall support exists). + + Example: + >>> activate_flash_attention_impl("FA4") # doctest: +SKIP + """ + global _FLASH_ATTENTION_ACTIVE + register_fn = _FLASH_ATTENTION_IMPLS.get(impl) + if register_fn is None: + raise ValueError( + f"Unknown flash attention impl '{impl}'. 
" + f"Available implementations: {list_flash_attention_impls()}" + ) + # TODO: The only way to actually register a new impl is to unregister the current impl + # reinstall the default impl and then register the new impl + if _FLASH_ATTENTION_ACTIVE == impl: + return + + handle = register_fn() + if handle is not None: + _FLASH_ATTENTION_HANDLES[impl] = handle + _FLASH_ATTENTION_ACTIVE = impl + + +def list_flash_attention_impls() -> list[str]: + """Return the names of all available flash attention implementations.""" + return sorted(_FLASH_ATTENTION_IMPLS.keys()) + + +def current_flash_attention_impl() -> str | None: + """ + Return the currently activated flash attention impl name, if any. + + ``None`` indicates that no custom impl has been activated. + """ + return _FLASH_ATTENTION_ACTIVE diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cd530bb675e8fce9164b7de0d75fd9dce90edec8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_utils.py @@ -0,0 +1,59 @@ +# mypy: allow-untyped-defs +"""Defines utilities for interacting with scaled_dot_product_attention""" + +import math + +import torch + + +__all__: list[str] = [] + + +def _input_requires_grad(*tensors: torch.Tensor) -> bool: + """Returns True if any of the tensors requires grad""" + return any(t.requires_grad for t in tensors) + + +def _postprocess_flash_output(inpt_tensor: torch.Tensor, og_size: int) -> torch.Tensor: + """Handles the unpad of the last dimension""" + if inpt_tensor.size(-1) != og_size: + return inpt_tensor[..., :og_size] + return inpt_tensor + + +def _calculate_scale(head_dim_size: int, scale: float | None) -> float: + """ + For FlashAttention we pad the head dimension to be a multiple of 8 so we need to scale the output + by the original head 
size and not the padded. + """ + if scale is not None: + return scale + return 1.0 / math.sqrt(head_dim_size) + + +def _validate_sdpa_input( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: torch.Tensor | None = None, + dropout_p=0.0, + is_causal=False, + scale=None, +) -> None: + if query.dtype != key.dtype or query.dtype != value.dtype: + raise ValueError( + f"Expected query, key, and value to have the same dtype, " + f"but got query.dtype: {query.dtype}, key.dtype: {key.dtype}, " + f"and value.dtype: {value.dtype} instead." + ) + if query.device != key.device or query.device != value.device: + raise ValueError( + f"Expected query, key, and value to have the same device type, " + f"but got query.device: {query.device}, key.device: {key.device}, " + f"and value.device: {value.device} instead." + ) + if query.dim() < 2 or key.dim() < 2 or value.dim() < 2: + raise ValueError( + f"Expected query, key, and value to all be at least 2 dimensional, but got query.dim: " + f"{query.dim()}, key.dim: {key.dim()} and value.dim: {value.dim()} instead." 
+ ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/bias.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/bias.py new file mode 100644 index 0000000000000000000000000000000000000000..746e04c01f3d571fc61a06a62332f229ada4e6c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/bias.py @@ -0,0 +1,371 @@ +# mypy: allow-untyped-defs +"""Defines bias subclasses that work with scaled_dot_product_attention""" + +from enum import auto, IntEnum +from warnings import warn + +import torch +import torch.nn.functional as F +from torch.backends.cuda import ( + can_use_efficient_attention, + can_use_flash_attention, + is_flash_attention_available, + SDPAParams, +) +from torch.nn.attention import _raise_kernel_warnings +from torch.nn.attention._utils import ( + _calculate_scale, + _input_requires_grad, + _postprocess_flash_output, + _validate_sdpa_input, +) + + +__all__ = ["causal_upper_left", "causal_lower_right", "CausalVariant", "CausalBias"] + + +torch._dynamo.allow_in_graph(is_flash_attention_available) +torch._dynamo.allow_in_graph(can_use_flash_attention) +torch._dynamo.allow_in_graph(can_use_efficient_attention) +torch._dynamo.allow_in_graph(SDPAParams) + + +class CausalVariant(IntEnum): + r""" + Enum for causal variants used in attention mechanisms. + + Defines two types of causal biases: + + ``UPPER_LEFT``: Represents upper-left triangular bias for standard causal attention. + The equivalent pytorch code for constructing this bias is: + + .. code-block:: python + + torch.tril(torch.ones(size, dtype=torch.bool)) + + For instance, with ``shape=(3,4)``, the materialized bias tensor will be: + + .. code-block:: text + + [[1, 0, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 0]] + + + ``LOWER_RIGHT``: Represents lower-right triangular bias, the include values are aligned to the lower + right corner of the matrix. 
+ + The equivalent pytorch code for constructing this bias is: + + .. code-block:: python + + diagonal_offset = size[1] - size[0] + torch.tril( + torch.ones(size, dtype=torch.bool), + diagonal=diagonal_offset, + ) + + For instance, with ``shape=(3,4)``, the materialized bias tensor will be: + + .. code-block:: text + + [[1, 1, 0, 0], + [1, 1, 1, 0], + [1, 1, 1, 1]] + + Note that these variants are equivalent to each other when the sequence lengths of the query and key/value + tensors are equal since the triangular matrix is square. + + .. warning:: This enum is a prototype and subject to change. + """ + + UPPER_LEFT = auto() + LOWER_RIGHT = auto() + + +class CausalBias(torch.Tensor): + """ + A bias representing causal attention patterns. For an overview of the bias structure, see the :class:`CausalVariant` enum. + + This class is used for defining causal (triangular) attention biases. For construing the bias, there exist + two factory functions: :func:`causal_upper_left` and :func:`causal_lower_right`. + + Example: + + .. code-block:: python + + from torch.nn.attention.bias import causal_lower_right + + bsz, num_heads, seqlen_q, seqlen_kv, head_dim = 32, 8, 4, 12, 8 + + # Create a lower-right causal bias + attn_bias = causal_lower_right(seqlen_q, seqlen_kv) + + q = torch.randn( + bsz, num_heads, seqlen_q, head_dim, device="cuda", dtype=torch.float16 + ) + k = torch.randn( + bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16 + ) + v = torch.randn( + bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16 + ) + + out = F.scaled_dot_product_attention(q, k, v, attn_bias) + + .. warning:: This class is a prototype and subject to change. + """ + + def __init__(self, variant: CausalVariant, seq_len_q: int, seq_len_kv: int) -> None: + """ + Initializes the CausalBias instance with a specified variant and sequence lengths. + + Args: + variant (CausalVariant): The type of causal bias to use (either UPPER_LEFT or LOWER_RIGHT). 
+ seq_len_q (int): The sequence length of the query tensor. + seq_len_kv (int): The sequence length of the key/value tensor. + + Raises a warning if the LOWER_RIGHT variant is used with seq_len_q > seq_len_kv, as it may produce NaNs. + """ + assert isinstance(variant, CausalVariant) + super().__init__() + self.variant = variant + self.seq_len_q = seq_len_q + self.seq_len_kv = seq_len_kv + if seq_len_q > seq_len_kv and variant == CausalVariant.LOWER_RIGHT: + warn( + "Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!", + stacklevel=2, + ) + + def _upper_left(self, device: torch.device) -> torch.Tensor: + """Upper left causal bias""" + return torch.tril( + torch.ones(self.seq_len_q, self.seq_len_kv, device=device, dtype=torch.bool) + ) + + def _lower_right(self, device: torch.device) -> torch.Tensor: + """Lower right causal bias""" + diagonal_offset = self.seq_len_kv - self.seq_len_q + return torch.tril( + torch.ones( + self.seq_len_q, self.seq_len_kv, device=device, dtype=torch.bool + ), + diagonal=diagonal_offset, + ) + + # pyrefly: ignore [bad-return] + def _materialize(self, device: torch.device | None = None) -> torch.Tensor: + """ + Materializes the causal bias into a tensor form. + + Depending on the variant, this method generates either an upper-left or lower-right + triangular matrix to represent the causal bias. + + Args: + device (Optional[torch.device]): The device on which to create the tensor. Defaults to CPU. + + Returns: + torch.Tensor: The materialized bias tensor. 
+ """ + if device is None: + device = torch.device("cpu") + if self.variant == CausalVariant.UPPER_LEFT: + return self._upper_left(device) + elif self.variant == CausalVariant.LOWER_RIGHT: + return self._lower_right(device) + + @staticmethod + def _dispatch( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: "CausalBias", + dropout_p: float = 0.0, + is_causal: bool = False, + scale: float | None = None, + enable_gqa: bool = False, + ) -> torch.Tensor: + r""" + Handles the logic for computing attention with the specified causal bias. + + Args: + query (Tensor): Query tensor; shape :math:`(N, ..., L, E)`. + key (Tensor): Key tensor; shape :math:`(N, ..., S, E)`. + value (Tensor): Value tensor; shape :math:`(N, ..., S, Ev)`. + attn_mask (CausalBias): The type of causal attention to apply. + A boolean mask where a value of True indicates that the element *should* take part in attention. + A float mask of the same type as query, key, value that is added to the attention score. + dropout_p (float): Dropout probability; if greater than 0.0, dropout is applied + is_causal (bool): If true, assumes upper left causal attention masking and errors if both attn_mask and is_causal + are set. + scale (optional float): Scaling factor applied prior to softmax. If None, the default value is set + to :math:`\frac{1}{\sqrt{E}}`. + enable_gqa (optional bool): If set to True, Grouped Query Attention (GQA) is enabled, by default it is set to False. + + Returns: + output (Tensor): Attention output; shape :math:`(N, ..., L, Ev)`. + + Raises: + ValueError: If the causal bias variant is not a CausalVariant type. 
+ + """ + if is_causal: + raise ValueError("CausalBias should not be used with causal=True") + + if ( + attn_mask.seq_len_q == attn_mask.seq_len_kv + or attn_mask.variant == CausalVariant.UPPER_LEFT + ): + return F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=None, + dropout_p=dropout_p, + is_causal=True, + scale=scale, + enable_gqa=enable_gqa, + ) + elif attn_mask.variant == CausalVariant.LOWER_RIGHT: + _validate_sdpa_input(query, key, value, None, dropout_p, is_causal, scale) + sdpa_params = SDPAParams( + query, key, value, None, dropout_p, is_causal, enable_gqa + ) + if can_use_flash_attention(sdpa_params): + alignment = 64 if query.device.type == "xpu" else 8 + og_head_size = query.size(-1) + og_scale = _calculate_scale(og_head_size, scale) + needs_padding = og_head_size % alignment != 0 + if needs_padding: + pad_len = alignment - (og_head_size % alignment) + query = torch.nn.functional.pad(query, (0, pad_len)) + key = torch.nn.functional.pad(key, (0, pad_len)) + value = torch.nn.functional.pad(value, (0, pad_len)) + out = torch.ops.aten._scaled_dot_product_flash_attention( + query, + key, + value, + dropout_p, + is_causal=True, # TODO: Flash accepts causal = True and for this particular op it means lower right + return_debug_mask=False, + scale=og_scale, + )[0] + return _postprocess_flash_output(out, og_head_size) + if can_use_efficient_attention(sdpa_params): + compute_log_sumexp = False + if _input_requires_grad(query, key, value): + compute_log_sumexp = True + return torch.ops.aten._efficient_attention_forward( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + bias=None, + cu_seqlens_q=None, + cu_seqlens_k=None, + max_seqlen_q=None, + max_seqlen_k=None, + dropout_p=dropout_p, + custom_mask_type=int(attn_mask.variant), + compute_log_sumexp=compute_log_sumexp, + scale=scale, + seqlen_k=None, + )[0].transpose(1, 2) + else: + _raise_kernel_warnings(sdpa_params) + # We can't use efficient attention the only support 
for lower right is via materialization + return F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attn_mask._materialize(query.device), + dropout_p=dropout_p, + is_causal=False, + scale=scale, + enable_gqa=enable_gqa, + ) + else: + raise ValueError( + f"CausalBias.variant must be a CausalVariant type, but found: {attn_mask.variant}" + ) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + """Defines the behavior of torch.nn.functional.scaled_dot_product_attention when the attn_bias is an AttnBias""" + if kwargs is None: + kwargs = {} + if func is torch.nn.functional.scaled_dot_product_attention: + return cls._dispatch(*args, **kwargs) + return super().__torch_function__(func, types, args, kwargs) + + def __repr__(self) -> str: # type:ignore[override] + return self._materialize().__repr__() + + +def causal_upper_left(*size) -> CausalBias: + """ + Creates an upper-left triangular causal bias. + + This function generates a upper-left triangular matrix to represent causal attention bias with a + diagonal offset set so that the inclusive values are aligned to the upper left corner of the matrix. + This equivalent to the `is_causal=True` argument in `scaled_dot_product_attention`. + + The equivalent pytorch code for constructing this bias is: + + .. code-block:: python + + torch.tril(torch.ones(size, dtype=torch.bool)) + + For instance, with `shape=(3,4)`, the materialized bias tensor will be: + + .. code-block:: text + + [[1, 0, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 0]] + + Args: + size: The size of the bias matrix. + + Returns: + CausalBias: The UPPER_LEFT triangular causal bias variant. + """ + assert len(size) == 2, "causal_upper_left only supports 2D tensors" + seq_len_q, seq_len_kv = size + return CausalBias(CausalVariant.UPPER_LEFT, seq_len_q, seq_len_kv) + + +def causal_lower_right(*size) -> CausalBias: + """ + Creates a lower-right triangular causal bias. 
+ + This function generates a lower-right triangular matrix to represent causal attention bias with a + diagonal offset set so that the inclusive values are aligned to the lower right corner of the matrix. + + The equivalent pytorch code for constructing this bias is: + + .. code-block:: python + + diagonal_offset = size[1] - size[0] + torch.tril( + torch.ones(size, dtype=torch.bool), + diagonal=diagonal_offset, + ) + + For instance, with `shape=(3,4)`, the materialized bias tensor will be: + + .. code-block:: text + + [[1, 1, 0, 0], + [1, 1, 1, 0], + [1, 1, 1, 1]] + + Args: + size: The size of the bias matrix. + + Returns: + CausalBias: The LOWER_RIGHT triangular causal bias variant. + """ + assert len(size) == 2, "causal_lower_right only supports 2D tensors" + seq_len_q, seq_len_kv = size + return CausalBias(CausalVariant.LOWER_RIGHT, seq_len_q, seq_len_kv) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..ad922227ccff80de42fcefe74c52ea861124add4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py @@ -0,0 +1,1676 @@ +# mypy: allow-untyped-defs +# flake8: noqa: B950 +"""This module implements the user facing API for flex_attention in PyTorch.""" + +import functools +import inspect +import itertools +import math +import operator +import typing +import warnings +from collections.abc import Callable +from enum import Enum +from typing import Any, Literal, NamedTuple, TypeAlias + +import torch +from torch import Tensor + + +try: + from typing import TypedDict +except ImportError: + from typing_extensions import TypedDict + +try: + from typing import NotRequired +except ImportError: + from typing_extensions import NotRequired + +from torch._higher_order_ops.flex_attention import 
flex_attention as flex_attention_hop +from torch._higher_order_ops.utils import _set_compilation_env +from torch._prims_common import DeviceLikeType +from torch.fx.experimental.proxy_tensor import ( + _temp_remove_metadata_torch_function_mode, + _temp_remove_pre_dispatch_torch_function_mode, +) +from torch.nn.attention._utils import _validate_sdpa_input +from torch.utils._pytree import GetAttrKey, tree_map_only + + +# Private debug flag to disable internal compilation wrapping for debugging purposes. +# WARNING: This is intended ONLY for debugging score_mod and mask_mod functions. +# When enabled, this bypasses the required internal compilation that ensures correctness +# and performance. Only use this temporarily when you need to set breakpoints +# in your score_mod/mask_mod functions during development. +# +# This flag only affects the internal compilation when flex_attention is called directly. +# If you have already wrapped flex_attention in torch.compile(), this flag has no effect +# and the user's compilation will still occur. 
+# +# Usage: +# import torch.nn.attention.flex_attention as fa +# fa._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = True +# # Now you can set breakpoints in your score_mod/mask_mod +# output = fa.flex_attention(q, k, v, score_mod=my_score_mod) +# +_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = False + +_WARNINGS_SHOWN: set[str] = set() + + +def _warn_once( + warning_id: str, message: str, category: type[Warning] = UserWarning +) -> None: + """Helper to ensure each warning is shown only once per process.""" + if warning_id not in _WARNINGS_SHOWN: + warnings.warn(message, category, stacklevel=2) + _WARNINGS_SHOWN.add(warning_id) + + +__all__ = [ + "BlockMask", + "flex_attention", + "AuxOutput", + "AuxRequest", + "FlexKernelOptions", + "create_block_mask", + "create_mask", + "or_masks", + "and_masks", + "noop_mask", +] + +_score_mod_signature = Callable[[Tensor, Tensor, Tensor, Tensor, Tensor], Tensor] +_mask_mod_signature = Callable[[Tensor, Tensor, Tensor, Tensor], Tensor] +_Backend: TypeAlias = Literal["AUTO", "TRITON", "FLASH", "TRITON_DECODE"] + + +# pyrefly: ignore [invalid-inheritance] +class FlexKernelOptions(TypedDict, total=False): + """Options for controlling the behavior of FlexAttention kernels. + + These options are passed to the underlying Triton kernels to control performance + and numerical behavior. Most users will not need to specify these options as the + default autotuning provides good performance. + + The options can be prefixed with ``fwd_`` or ``bwd_`` to apply only to forward or + backward pass respectively. For example: ``fwd_BLOCK_M`` and ``bwd_BLOCK_M1``. + + Note: + We currently do not provide any backward compatibility guarantees for these options. + That being said most of these have remained pretty stable since their introduction. But + We do not consider this part of the public API just yet. We think that some documentation + Is better than secret hidden flags, but we may change these options in the future. + + Example Usage: + .. 
code-block:: python + + # Using dictionary (backward compatible) + kernel_opts = {"BLOCK_M": 64, "BLOCK_N": 64, "PRESCALE_QK": True} + output = flex_attention(q, k, v, kernel_options=kernel_opts) + + # Using TypedDict (recommended for type safety) + from torch.nn.attention.flex_attention import FlexKernelOptions + + kernel_opts: FlexKernelOptions = { + "BLOCK_M": 64, + "BLOCK_N": 64, + "PRESCALE_QK": True, + } + output = flex_attention(q, k, v, kernel_options=kernel_opts) + + # Forward/backward specific options + kernel_opts: FlexKernelOptions = { + "fwd_BLOCK_M": 64, + "bwd_BLOCK_M1": 32, + "PRESCALE_QK": False, + } + output = flex_attention(q, k, v, kernel_options=kernel_opts) + """ + + # Performance tuning options + # pyrefly: ignore [invalid-annotation] + num_warps: NotRequired[int] + """Number of warps to use in the CUDA kernel. Higher values may improve performance + but increase register pressure. Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + num_stages: NotRequired[int] + """Number of pipeline stages in the CUDA kernel. Higher values may improve performance + but increase shared memory usage. Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + BLOCK_M: NotRequired[int] + """Thread block size for the sequence length dimension of Q in forward pass. + Must be a power of 2. Common values: 16, 32, 64, 128. Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + BLOCK_N: NotRequired[int] + """Thread block size for the sequence length dimension of K/V in forward pass. + Must be a power of 2. Common values: 16, 32, 64, 128. Default is determined by autotuning.""" + + # Backward-specific block sizes (when prefixed with 'bwd_') + # pyrefly: ignore [invalid-annotation] + BLOCK_M1: NotRequired[int] + """Thread block size for Q dimension in backward pass. Use as 'bwd_BLOCK_M1'. 
+ Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + BLOCK_N1: NotRequired[int] + """Thread block size for K/V dimension in backward pass. Use as 'bwd_BLOCK_N1'. + Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + BLOCK_M2: NotRequired[int] + """Thread block size for second Q dimension in backward pass. Use as 'bwd_BLOCK_M2'. + Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + BLOCK_N2: NotRequired[int] + """Thread block size for second K/V dimension in backward pass. Use as 'bwd_BLOCK_N2'. + Default is determined by autotuning.""" + + # pyrefly: ignore [invalid-annotation] + PRESCALE_QK: NotRequired[bool] + """Whether to pre-scale QK by 1/sqrt(d) and change of base. This is slightly faster but + may have more numerical error. Default: False.""" + + # pyrefly: ignore [invalid-annotation] + ROWS_GUARANTEED_SAFE: NotRequired[bool] + """If True, guarantees that at least one value in each row is not masked out. + Allows skipping safety checks for better performance. Only set this if you are certain + your mask guarantees this property. For example, causal attention is guaranteed safe + because each query has at least 1 key-value to attend to. Default: False.""" + + # pyrefly: ignore [invalid-annotation] + BLOCKS_ARE_CONTIGUOUS: NotRequired[bool] + """If True, guarantees that all blocks in the mask are contiguous. + Allows optimizing block traversal. For example, causal masks would satisfy this, + but prefix_lm + sliding window would not. Default: False.""" + + # pyrefly: ignore [invalid-annotation] + WRITE_DQ: NotRequired[bool] + """Controls whether gradient scatters are done in the DQ iteration loop of the backward pass. + Setting this to False will force this to happen in the DK loop which depending on your + specific score_mod and mask_mod might be faster. 
Default: True.""" + + # pyrefly: ignore [invalid-annotation] + FORCE_USE_FLEX_ATTENTION: NotRequired[bool] + """If True, forces the use of the flex attention kernel instead of potentially using + the more optimized flex-decoding kernel for short sequences. This can be a helpful + option for debugging. Default: False.""" + + # pyrefly: ignore [invalid-annotation] + USE_TMA: NotRequired[bool] + """Whether to use Tensor Memory Accelerator (TMA) on supported hardware. + This is experimental and may not work on all hardware, currently specific + to NVIDIA GPUs Hopper+. Default: False.""" + + # ROCm-specific options + # pyrefly: ignore [invalid-annotation] + kpack: NotRequired[int] + """ROCm-specific kernel packing parameter.""" + + # pyrefly: ignore [invalid-annotation] + matrix_instr_nonkdim: NotRequired[int] + """ROCm-specific matrix instruction non-K dimension.""" + + # pyrefly: ignore [invalid-annotation] + waves_per_eu: NotRequired[int] + """ROCm-specific waves per execution unit.""" + + # pyrefly: ignore [invalid-annotation] + BACKEND: NotRequired[_Backend] + """Selects a specific kernel backend. + + Options: + - "AUTO": Use current heuristics (typically Triton-based kernels with + automatic selection between flex_attention and flex_decoding) + - "TRITON": Standard Triton flex_attention kernel + - "TRITON_DECODE": Triton flex_decoding kernel, only available for short sequence lengths with specific configurations + - "FLASH": Experimental: Flash Attention kernel (cute-dsl), user needs to have flash installed + + This option cannot be combined with legacy knobs such as ``FORCE_USE_FLEX_ATTENTION``. + Raises an error if the requested backend cannot be used. Default: "AUTO" + """ + + +class AuxRequest(NamedTuple): + """Request which auxiliary outputs to compute from flex_attention. + + Each field is a boolean indicating whether that auxiliary output should be computed. 
+ """ + + lse: bool = False + max_scores: bool = False + + +class AuxOutput(NamedTuple): + """Auxiliary outputs from flex_attention operation. + + Fields will be None if not requested, or contain the tensor if requested. + """ + + lse: Tensor | None = None + max_scores: Tensor | None = None + + +class _ModificationType(Enum): + """Enum for the type of modification function. + - SCORE_MOD: score_mod function which accepts a score as the first argument + - mask_mod: mask function which does not accept a score and is only used for generating + block mask + """ + + SCORE_MOD = 1 + MASK_MOD = 2 + UNKNOWN = 3 + + +def _get_mod_type(fn: Callable) -> _ModificationType: + """Get the type of modification function. + This function inspects the number of positional arguments of the function to determine + the type of modification function. If the function has 5 positional arguments, it is + considered as a score_mod function. If the function has 4 positional arguments, it is + considered as a mask function. + """ + if hasattr(fn, "__code__"): + code = fn.__code__ + num_positional_total = code.co_argcount + defaults = () + if hasattr(fn, "__defaults__"): + defaults = fn.__defaults__ or () + num_defaults = len(defaults) + num_positional_args = num_positional_total - num_defaults + else: + num_positional_args = sum( + 1 + for param in inspect.signature(fn).parameters.values() + if param.default is inspect.Parameter.empty + ) + assert num_positional_args == 5 or num_positional_args == 4 + if num_positional_args == 5: + return _ModificationType.SCORE_MOD + elif num_positional_args == 4: + return _ModificationType.MASK_MOD + else: + return _ModificationType.UNKNOWN + + +# Need to define it here so that Dynamo doesn't skip it +def _vmap_for_bhqkv( + fn: Callable, + prefix: tuple[int | None, ...], + suffix: tuple[int | None, ...] 
= (), + out_dims: int | list[int | None] = 0, + group_dim: bool = False, +): + """Used to vmap both score_mods and mask_mods over 4-dimensional/5-dimension inputs. + Mapping over the [b, hq, q_idx, kv_idx] or [b, hkv, g, q_idx, kv_idx] dimensions. + + Args: + fn (callable): The function to vmap. + prefix (tuple): The prefix of the vmap. For score mod functions, + this should be set to (0,). For mask_mods = () + suffix (tuple): We need to add (0,) if gradOut is being mapped over, + and (None,) * len(other_buffers). + out_dims (tuple): For forward cases, keep this as the default 0 since + we are only returning 1 output. For backwards, the joint + graph returns grads for B, H, Q_idx, KV_idx and other_buffers, + so we set this to (0, None, None, None, None) + (None,) * len(other_buffers). + + Returns: + callable: The vmapped function. + """ + # We vamp a function 4 times, broadcasting the [b, h, q_idx, kv_idx] dimensions + dimensions: list[tuple[None | int, None | int, None | int, None | int]] = [] + dimensions = [ + (None, None, None, 0), + (None, None, 0, None), + (None, 0, None, None), + ] + + if group_dim: + dimensions += [ + (None, 0, None, None), + ] + + dimensions += [ + (0, None, None, None), + ] + + for dims in dimensions: + fn = torch.vmap(fn, in_dims=prefix + dims + suffix, out_dims=out_dims) # type: ignore[arg-type] + return fn + + +def _identity( + score: Tensor, + batch: Tensor, + head: Tensor, + token_q: Tensor, + token_kv: Tensor, +) -> Tensor: + return score + + +def noop_mask( + batch: Tensor, + head: Tensor, + token_q: Tensor, + token_kv: Tensor, +) -> Tensor: + """Returns a noop mask_mod""" + return batch.new_ones(size=(), dtype=torch.bool, device=batch.device) + + +def _sliced_mask_mod_error( + batch: Tensor, + head: Tensor, + token_q: Tensor, + token_kv: Tensor, +) -> Tensor: + """ + Raises helpful error when using mask_mod from a sliced BlockMask. + + After slicing a BlockMask, the mask_mod is reset and cannot be used directly. 
+ Users must reassign mask_mod from the original (unsliced) BlockMask. + """ + raise RuntimeError( + "Cannot use mask_mod from a sliced BlockMask. " + "When you slice a BlockMask using [], the mask_mod attribute is reset. " + "You must set it from the original BlockMask's mask_mod." + "\n\nIncorrect usage:" + "\n base_mask = create_block_mask(my_mask_fn, ...)" + "\n sliced_mask = base_mask[:, :, block_idx]" + "\n sliced_mask.mask_mod = apply_offset(sliced_mask.mask_mod, offset) # WRONG!" + "\n\nCorrect usage:" + "\n base_mask = create_block_mask(my_mask_fn, ...)" + "\n sliced_mask = base_mask[:, :, block_idx]" + "\n sliced_mask.mask_mod = apply_offset(base_mask.mask_mod, offset) # Use base_mask!" + ) + + +_DEFAULT_SPARSE_BLOCK_SIZE = 128 +_LARGE_SPARSE_BLOCK_SIZE = 1 << 30 + + +def _ordered_to_dense(num_blocks_in_row: Tensor, col_indices: Tensor): + num_rows = col_indices.shape[-2] + num_cols = col_indices.shape[-1] + batch_dims = num_blocks_in_row.shape[:-1] + device = num_blocks_in_row.device + + def create_dense_one(kv_num_blocks, kv_indices): + dense_mask = kv_indices.new_zeros(num_rows, num_cols + 1, dtype=torch.int32) + + row_indices = torch.arange(num_rows, dtype=torch.int, device=device).unsqueeze( + -1 + ) + col_range = torch.arange(num_cols, dtype=torch.int, device=device) + index_mask = col_range < kv_num_blocks.unsqueeze(-1) + + # We write to one spot "out of bounds" + valid_indices = torch.where(index_mask, kv_indices, num_cols) + + # set the values in 'a' to 1 where the indices are valid + dense_mask[row_indices, valid_indices] = dense_mask.new_ones(()) + return dense_mask[:, :num_cols].contiguous() + + create_dense_batched = create_dense_one + for _ in range(len(batch_dims)): + create_dense_batched = torch.vmap(create_dense_batched, in_dims=(0, 0)) + + out = create_dense_batched(num_blocks_in_row, col_indices) + return out + + +def _dense_to_ordered(dense_mask) -> tuple[Tensor, Tensor]: + dense_mask = dense_mask.to(dtype=torch.int32) + 
num_blocks_in_row = dense_mask.sum(dim=-1) + col_indices = torch.argsort(dense_mask, dim=-1, descending=True, stable=True) + return ( + num_blocks_in_row.to(torch.int32, memory_format=torch.contiguous_format), + col_indices.to(torch.int32, memory_format=torch.contiguous_format), + ) + + +def _transpose_ordered(num_blocks_in_row: Tensor, col_indices: Tensor): + dense = _ordered_to_dense(num_blocks_in_row, col_indices) + return _dense_to_ordered(dense.transpose(-2, -1)) + + +def _adjust_num_blocks_and_indices( + num_blocks: Tensor, + indices: Tensor, + new_num_rows: int, + new_num_cols: int, +): + indices = indices[:, :, :new_num_rows, :new_num_cols] + num_blocks = num_blocks[:, :, :new_num_rows] + num_blocks = torch.where(num_blocks < new_num_cols, num_blocks, new_num_cols) + num_blocks = torch.sum(indices < num_blocks[:, :, :, None], dim=-1).to(torch.int32) + return num_blocks, indices + + +class BlockMask: + r""" + BlockMask is our format for representing a block-sparse attention mask. + It is somewhat of a cross in-between BCSR and a non-sparse format. + + **Basics** + + A block-sparse mask means that instead of representing the sparsity of + individual elements in the mask, a KV_BLOCK_SIZE x Q_BLOCK_SIZE block is + considered sparse only if every element within that block is sparse. + This aligns well with hardware, which generally expects to perform + contiguous loads and computation. + + This format is primarily optimized for 1. simplicity, and 2. kernel + efficiency. Notably, it is *not* optimized for size, as this mask is always + reduced by a factor of KV_BLOCK_SIZE * Q_BLOCK_SIZE. If the size is a + concern, the tensors can be reduced in size by increasing the block size. + + The essentials of our format are: + + num_blocks_in_row: Tensor[ROWS]: + Describes the number of blocks present in each row. + + col_indices: Tensor[ROWS, MAX_BLOCKS_IN_COL]: + `col_indices[i]` is the sequence of block positions for row i. 
The values of + this row after `col_indices[i][num_blocks_in_row[i]]` are undefined. + + For example, to reconstruct the original tensor from this format: + + .. code-block:: python + + dense_mask = torch.zeros(ROWS, COLS) + for row in range(ROWS): + for block_idx in range(num_blocks_in_row[row]): + dense_mask[row, col_indices[row, block_idx]] = 1 + + Notably, this format makes it easier to implement a reduction along the + *rows* of the mask. + + **Details** + + The basics of our format require only kv_num_blocks and kv_indices. But, we + have up to 8 tensors on this object. This represents 4 pairs: + + 1. (kv_num_blocks, kv_indices): Used for the forwards pass of attention, as + we reduce along the KV dimension. + + 2. [OPTIONAL] (full_kv_num_blocks, full_kv_indices): This is optional and + purely an optimization. As it turns out, applying masking to every block + is quite expensive! If we specifically know which blocks are "full" and + don't require masking at all, then we can skip applying mask_mod to these + blocks. This requires the user to split out a separate mask_mod from the + score_mod. For causal masks, this is about a 15% speedup. + + 3. [GENERATED] (q_num_blocks, q_indices): Required for the backwards pass, + as computing dKV requires iterating along the mask along the Q dimension. These are autogenerated from 1. + + 4. [GENERATED] (full_q_num_blocks, full_q_indices): Same as above, but for + the backwards pass. These are autogenerated from 2. 
+ """ + + seq_lengths: tuple[int, int] + kv_num_blocks: Tensor + kv_indices: Tensor + full_kv_num_blocks: Tensor | None + full_kv_indices: Tensor | None + q_num_blocks: Tensor | None + q_indices: Tensor | None + full_q_num_blocks: Tensor | None + full_q_indices: Tensor | None + BLOCK_SIZE: tuple[int, int] + mask_mod: _mask_mod_signature + + # Attribute lists for pytree flatten/unflatten + _TENSOR_ATTRS = [ + "kv_num_blocks", + "kv_indices", + "full_kv_num_blocks", + "full_kv_indices", + "q_num_blocks", + "q_indices", + "full_q_num_blocks", + "full_q_indices", + ] + + _CONTEXT_ATTRS = [ + "seq_lengths", + "BLOCK_SIZE", + "mask_mod", + ] + + def __init__( + self, + seq_lengths: tuple[int, int], + kv_num_blocks: Tensor, + kv_indices: Tensor, + full_kv_num_blocks: Tensor | None, + full_kv_indices: Tensor | None, + q_num_blocks: Tensor | None, + q_indices: Tensor | None, + full_q_num_blocks: Tensor | None, + full_q_indices: Tensor | None, + BLOCK_SIZE: tuple[int, int], + mask_mod: _mask_mod_signature, + ) -> None: + if kv_indices.dim() < 2: + raise RuntimeError("BlockMask must have at least 2 dimensions") + assert kv_num_blocks is not None, "kv_num_blocks must be provided" + assert kv_indices is not None, "kv_indices must be provided" + assert (full_kv_num_blocks is None) == (full_kv_indices is None), ( + "full_kv_num_blocks and full_kv_indices must be both provided or omitted" + ) + assert (full_q_num_blocks is None) == (full_q_indices is None), ( + "full_q_num_blocks and full_q_indices must be both provided or omitted" + ) + + self.seq_lengths = seq_lengths + self.kv_num_blocks = kv_num_blocks + self.kv_indices = kv_indices + self.full_kv_num_blocks = full_kv_num_blocks + self.full_kv_indices = full_kv_indices + self.q_num_blocks = q_num_blocks + self.q_indices = q_indices + self.full_q_num_blocks = full_q_num_blocks + self.full_q_indices = full_q_indices + self.BLOCK_SIZE = BLOCK_SIZE + self.mask_mod = mask_mod + + @classmethod + def from_kv_blocks( + cls, + 
kv_num_blocks: Tensor, + kv_indices: Tensor, + full_kv_num_blocks: Tensor | None = None, + full_kv_indices: Tensor | None = None, + BLOCK_SIZE: int | tuple[int, int] = _DEFAULT_SPARSE_BLOCK_SIZE, + mask_mod: _mask_mod_signature | None = None, + seq_lengths: tuple[int, int] | None = None, + compute_q_blocks: bool = True, + ): + """ + Creates a BlockMask instance from key-value block information. + + Args: + kv_num_blocks (Tensor): Number of kv_blocks in each Q_BLOCK_SIZE row tile. + kv_indices (Tensor): Indices of key-value blocks in each Q_BLOCK_SIZE row tile. + full_kv_num_blocks (Optional[Tensor]): Number of full kv_blocks in each Q_BLOCK_SIZE row tile. + full_kv_indices (Optional[Tensor]): Indices of full key-value blocks in each Q_BLOCK_SIZE row tile. + BLOCK_SIZE (Union[int, tuple[int, int]]): Size of KV_BLOCK_SIZE x Q_BLOCK_SIZE tiles. + mask_mod (Optional[Callable]): Function to modify the mask. + + Returns: + BlockMask: Instance with full Q information generated via _transposed_ordered + + Raises: + RuntimeError: If kv_indices has < 2 dimensions. + AssertionError: If only one of full_kv_* args is provided. 
+ """ + if kv_indices.dim() < 2: + raise RuntimeError("BlockMask must have at least 2 dimensions") + + assert (full_kv_num_blocks is None) == (full_kv_indices is None), ( + "full_kv_num_blocks and full_kv_indices must be both provided or omitted" + ) + + # Generate q_num_blocks and q_indices + if compute_q_blocks: + q_num_blocks, q_indices = _transpose_ordered(kv_num_blocks, kv_indices) + if full_kv_num_blocks is not None: + assert full_kv_indices is not None + full_q_num_blocks, full_q_indices = _transpose_ordered( + full_kv_num_blocks, full_kv_indices + ) + else: + full_q_num_blocks, full_q_indices = None, None + else: + q_num_blocks, q_indices = None, None + full_q_num_blocks, full_q_indices = None, None + + if isinstance(BLOCK_SIZE, int): + BLOCK_SIZE = (BLOCK_SIZE, BLOCK_SIZE) + + mask_mod = mask_mod if mask_mod is not None else noop_mask + if seq_lengths is None: + q_length = kv_indices.shape[-2] * BLOCK_SIZE[0] + kv_length = kv_indices.shape[-1] * BLOCK_SIZE[1] + seq_lengths = (q_length, kv_length) + + return cls( + seq_lengths=seq_lengths, + kv_num_blocks=kv_num_blocks, + kv_indices=kv_indices, + full_kv_num_blocks=full_kv_num_blocks, + full_kv_indices=full_kv_indices, + q_num_blocks=q_num_blocks, + q_indices=q_indices, + full_q_num_blocks=full_q_num_blocks, + full_q_indices=full_q_indices, + BLOCK_SIZE=BLOCK_SIZE, + mask_mod=mask_mod, + ) + + def as_tuple(self, flatten: bool = True): + """ + Returns a tuple of the attributes of the BlockMask. 
+ + Args: + flatten (bool): If True, it will flatten the tuple of (KV_BLOCK_SIZE, Q_BLOCK_SIZE) + """ + if flatten: + block_size = (self.BLOCK_SIZE[0], self.BLOCK_SIZE[1]) # type: ignore[assignment] + seq_lengths = (self.seq_lengths[0], self.seq_lengths[1]) # type: ignore[assignment] + else: + block_size = (self.BLOCK_SIZE,) # type: ignore[assignment] + seq_lengths = (self.seq_lengths,) # type: ignore[assignment] + + # pyrefly: ignore [not-iterable] + return ( + *seq_lengths, + self.kv_num_blocks, + self.kv_indices, + self.full_kv_num_blocks, + self.full_kv_indices, + self.q_num_blocks, + self.q_indices, + self.full_q_num_blocks, + self.full_q_indices, + *block_size, + self.mask_mod, + ) + + @property + def shape(self): + *batch_dims, _, _ = self.kv_indices.shape + return tuple(batch_dims) + self.seq_lengths + + def __str__(self) -> str: + s = f"BlockMask(shape={self.shape}, sparsity={self.sparsity():.2f}%, \n" + mask_str = self.to_string().strip() + s += mask_str + s += "\n)" + return s + + def __getitem__(self, index) -> "BlockMask": + """ + Returns a new BlockMask instance by getting the mask for the given index position. + + Args: + index: Index to apply to all attributes. + + Example Usage: + .. 
code-block:: python + + def causal_mask(b, h, q_idx, kv_idx): + return q_idx >= kv_idx + + + block_mask = create_block_mask( + causal_mask, 4, 2, 512, 512, device="cuda" + ) + assert block_mask.kv_num_blocks.shape == (4, 2, 4) + assert block_mask.kv_indices.shape == (4, 2, 4, 4) + + # Index on batch dimension + new_block_mask = block_mask[0] + assert new_block_mask.kv_num_blocks.shape == (2, 4) + assert new_block_mask.kv_indices.shape == (2, 4, 4) + + # Index on batch and head dimension + new_block_mask = block_mask[0, 1] + assert new_block_mask.kv_num_blocks.shape == (4,) + assert new_block_mask.kv_indices.shape == (4, 4) + + # slicing on batch and head dimension + new_block_mask = block_mask[0:2, 1:2] + assert new_block_mask.kv_num_blocks.shape == (2, 1, 4) + assert new_block_mask.kv_indices.shape == (2, 1, 4, 4) + + # slicing on batch, head, and query dimension + new_block_mask = block_mask[ + 0:2, 1:2, torch.tensor([1], dtype=torch.int32) + ] + assert new_block_mask.kv_num_blocks.shape == (2, 1, 1) + assert new_block_mask.kv_indices.shape == (2, 1, 1, 4) + """ + index = (index,) if not isinstance(index, tuple) else index + padded = (*index, slice(None), slice(None), slice(None))[:3] + sizes = self.kv_num_blocks.shape[:3] + index = tuple( + (slice(i + n, i + n + 1) if -n <= i < 0 else slice(i, i + 1)) + if isinstance(i, int) + else i + for i, n in zip(padded, sizes, strict=True) + ) + new_kv_num_blocks = self.kv_num_blocks[index] + new_kv_indices = self.kv_indices[index] + if self.full_kv_num_blocks is not None: + assert self.full_kv_indices is not None + new_full_kv_num_blocks = self.full_kv_num_blocks[index] + new_full_kv_indices = self.full_kv_indices[index] + else: + new_full_kv_num_blocks = None + new_full_kv_indices = None + return BlockMask.from_kv_blocks( + new_kv_num_blocks, + new_kv_indices, + new_full_kv_num_blocks, + new_full_kv_indices, + BLOCK_SIZE=self.BLOCK_SIZE, + mask_mod=_sliced_mask_mod_error, + seq_lengths=self.seq_lengths, + 
compute_q_blocks=self.q_indices is not None, + ) + + def __repr__(self) -> str: + def shape_or_none(x: torch.Tensor | None): + return x.shape if x is not None else None + + return ( + f"BlockMask(\n" + f" kv_num_blocks={self.kv_num_blocks.shape},\n" + f" kv_indices={self.kv_indices.shape},\n" + f" full_kv_num_blocks={shape_or_none(self.full_kv_num_blocks)},\n" + f" full_kv_indices={shape_or_none(self.full_kv_indices)},\n" + f" q_num_blocks={shape_or_none(self.q_num_blocks)},\n" + f" q_indices={shape_or_none(self.q_indices)},\n" + f" full_q_num_blocks={shape_or_none(self.full_q_num_blocks)},\n" + f" full_q_indices={shape_or_none(self.full_q_indices)},\n" + f" BLOCK_SIZE={self.BLOCK_SIZE},\n" + f" shape={self.shape},\n" + f" sparsity={self.sparsity():.2f}%,\n" + f" mask_mod={self.mask_mod.__name__ if hasattr(self.mask_mod, '__name__') else self.mask_mod}\n" + f")" + ) + + def _adjust(self, new_q_len: int, new_kv_len: int): + new_num_rows = (new_q_len + self.BLOCK_SIZE[0] - 1) // self.BLOCK_SIZE[0] + new_num_cols = (new_kv_len + self.BLOCK_SIZE[1] - 1) // self.BLOCK_SIZE[1] + new_kv_num_blocks, new_kv_indices = _adjust_num_blocks_and_indices( + self.kv_num_blocks, self.kv_indices, new_num_rows, new_num_cols + ) + if self.full_kv_num_blocks is not None: + assert self.full_kv_indices is not None + ( + new_full_kv_num_blocks, + new_full_kv_indices, + ) = _adjust_num_blocks_and_indices( + self.full_kv_num_blocks, + self.full_kv_indices, + new_num_rows, + new_num_cols, + ) + else: + new_full_kv_num_blocks = None + new_full_kv_indices = None + return self.from_kv_blocks( + new_kv_num_blocks, + new_kv_indices, + new_full_kv_num_blocks, + new_full_kv_indices, + self.BLOCK_SIZE, + self.mask_mod, + ) + + def numel(self): + """Returns the number of elements (not accounting for sparsity) in the mask.""" + shape = self.shape + + def _prod(xs): + return functools.reduce(operator.mul, xs, 1) + + return _prod(shape) + + def sparsity(self) -> float: + """Computes the percentage of 
blocks that are sparse (i.e. not computed)""" + total_size = self.numel() + computed_blocks = self.kv_num_blocks.sum() + if self.full_kv_num_blocks is not None: + computed_blocks += self.full_kv_num_blocks.sum() + + computed_size = computed_blocks.item() * self.BLOCK_SIZE[0] * self.BLOCK_SIZE[1] + dense_ratio = computed_size / total_size + return 100 * (1 - dense_ratio) + + def to_dense(self) -> Tensor: + """Returns a dense block that is equivalent to the block mask.""" + partial_dense = _ordered_to_dense(self.kv_num_blocks, self.kv_indices) + if self.full_kv_num_blocks is not None: + assert self.full_kv_indices is not None + # pyrefly: ignore [bad-return] + return partial_dense | _ordered_to_dense( + self.full_kv_num_blocks, self.full_kv_indices + ) + return partial_dense + + def to_string(self, grid_size=(20, 20), limit=4): + """Returns a string representation of the block mask. Quite nifty. + + If grid_size is -1, prints out an uncompressed version. Warning, it can be quite big! + """ + dense_mask = self.to_dense() + *batch_dims, num_rows, num_cols = dense_mask.shape + if isinstance(grid_size, int): + max_rows = grid_size + max_cols = grid_size + elif grid_size == -1: + max_rows = num_rows + max_cols = num_cols + else: + max_rows, max_cols = grid_size + + def create_block_vis(*batch_idx): + descriptors = [] + + descriptors.append(f"{batch_idx}") + + vis = ", ".join(reversed(descriptors)) + "\n" + + def summarize_section(section) -> str: + percentage = section.float().mean().item() + if percentage == 1: + return "â–ˆ" + elif percentage == 0: + return " " + else: + return "â–‘" + + def cdiv(a, b): + return (a + (b - 1)) // b + + row_step = max(1, cdiv(num_rows, max_rows)) + col_step = max(1, cdiv(num_cols, max_cols)) + + for r in range(0, num_rows, row_step): + for c in range(0, num_cols, col_step): + cur_mask = dense_mask + for idx in batch_idx: + cur_mask = cur_mask[idx] + char = summarize_section( + cur_mask[r : r + row_step, c : c + col_step] + ) + vis += char 
* 2 + vis += "\n" + return vis + + total_vis = [] + for idx, batch_idx in enumerate( + itertools.product(*[range(i) for i in batch_dims]) + ): + if idx == limit: + total_vis.append("...") + total_vis.append("To print out more, set BlockMask.to_string(limit=N)") + total_vis.append( + "You can also index (BlockMask[batch, head]) to choose a specific batch or head" + ) + break + block_vis = create_block_vis(*batch_idx) + total_vis.append(block_vis) + + return "\n".join(total_vis) + + def to(self, device: torch.device | str) -> "BlockMask": + """Moves the BlockMask to the specified device. + + Args: + device (torch.device or str): The target device to move the BlockMask to. + Can be a torch.device object or a string (e.g., 'cpu', 'cuda:0'). + + Returns: + BlockMask: A new BlockMask instance with all tensor components moved + to the specified device. + + Note: + This method does not modify the original BlockMask in-place. + Instead, it returns a new BlockMask instance where individual tensor attributes + may or may not be moved to the specified device, depending on their + current device placement. 
+ """ + mapped_attributes = tree_map_only( + torch.Tensor, + lambda x: x.to(device), + self.as_tuple(flatten=False), + ) + return BlockMask(*mapped_attributes) + + def _flatten(self): + """Flatten BlockMask into a list of tensors and context.""" + tensors = tuple(getattr(self, attr) for attr in self._TENSOR_ATTRS) + context = tuple(getattr(self, attr) for attr in self._CONTEXT_ATTRS) + return tensors, context + + @classmethod + def _unflatten(cls, tensors, context): + """Unflatten tensors and context back into a BlockMask.""" + kwargs = { + **dict(zip(cls._CONTEXT_ATTRS, context)), + **dict(zip(cls._TENSOR_ATTRS, tensors)), + } + # pyrefly: ignore [bad-argument-type] + return cls(**kwargs) + + def _flatten_with_keys(self): + """Flatten BlockMask with keys for better tracing.""" + tensors = tuple( + (GetAttrKey(attr), getattr(self, attr)) for attr in self._TENSOR_ATTRS + ) + context = tuple( + (GetAttrKey(attr), getattr(self, attr)) for attr in self._CONTEXT_ATTRS + ) + return tensors, context + + +def _broadcast_to_dim(x, dim): + while x.dim() < dim: + x = x.unsqueeze(0) + return x + + +def _round_up_to_multiple(x, multiple): + return (x + multiple - 1) // multiple * multiple + + +def _convert_mask_to_block_mask( + mask: Tensor, + Q_BLOCK_SIZE=_DEFAULT_SPARSE_BLOCK_SIZE, + KV_BLOCK_SIZE=_DEFAULT_SPARSE_BLOCK_SIZE, + separate_full_blocks: bool = False, +) -> tuple[Tensor, Tensor | None]: + assert mask.dtype == torch.bool + mask = _broadcast_to_dim(mask, 4) + + def padding_needed_for_multiple(x, multiple): + return _round_up_to_multiple(x, multiple) - x + + mask = torch.nn.functional.pad( + mask, + ( + 0, + padding_needed_for_multiple(mask.shape[-1], KV_BLOCK_SIZE), + 0, + padding_needed_for_multiple(mask.shape[-2], Q_BLOCK_SIZE), + ), + ) + B, H, Q, KV = mask.shape + assert Q % Q_BLOCK_SIZE == 0 + assert KV % KV_BLOCK_SIZE == 0 + mask = mask.view( + B, H, Q // Q_BLOCK_SIZE, Q_BLOCK_SIZE, KV // KV_BLOCK_SIZE, KV_BLOCK_SIZE + ) # [B, H, Q//Q_BLOCK_SIZE, Q_BLOCK_SIZE, 
def or_masks(*mask_mods: _mask_mod_signature) -> _mask_mod_signature:
    """Combine mask_mods into one that allows a position if any of them does.

    Raises:
        RuntimeError: If any argument is not callable.
    """
    if any(not callable(candidate) for candidate in mask_mods):
        raise RuntimeError(f"All inputs should be callable mask_mods: {mask_mods}")

    def or_mask(b, h, q_idx, kv_idx):
        # Start from scalar False on b's device; an empty union masks everything.
        combined = b.new_zeros((), dtype=torch.bool)
        for mask_fn in mask_mods:
            combined = combined | mask_fn(b, h, q_idx, kv_idx)
        return combined

    return or_mask


def and_masks(*mask_mods: _mask_mod_signature) -> _mask_mod_signature:
    """Combine mask_mods into one that allows a position only if all of them do.

    Raises:
        RuntimeError: If any argument is not callable.
    """
    if any(not callable(candidate) for candidate in mask_mods):
        raise RuntimeError(f"All inputs should be callable mask_mods: {mask_mods}")

    def and_mask(b, h, q_idx, kv_idx):
        # Start from scalar True on b's device; an empty intersection allows everything.
        combined = b.new_ones((), dtype=torch.bool)
        for mask_fn in mask_mods:
            combined = combined & mask_fn(b, h, q_idx, kv_idx)
        return combined

    return and_mask
def _create_sparse_block_from_block_mask(
    block_mask: tuple[Tensor, Tensor | None],
    mask_mod: Callable | None,
    seq_lengths: tuple[int, int],
    Q_BLOCK_SIZE: int = _DEFAULT_SPARSE_BLOCK_SIZE,
    KV_BLOCK_SIZE: int = _DEFAULT_SPARSE_BLOCK_SIZE,
) -> BlockMask:
    """Build a BlockMask from dense per-block masks.

    Args:
        block_mask: ``(partial_blocks, full_blocks)``; ``full_blocks`` may be
            ``None``, in which case no full-block tensors are produced.
        mask_mod: Forwarded to ``BlockMask.from_kv_blocks``.
        seq_lengths: Forwarded to ``BlockMask.from_kv_blocks``.
        Q_BLOCK_SIZE: First entry of the ``BLOCK_SIZE`` pair.
        KV_BLOCK_SIZE: Second entry of the ``BLOCK_SIZE`` pair.

    Returns:
        The BlockMask created by ``BlockMask.from_kv_blocks``.
    """
    partial_dense, full_dense = block_mask

    kv_num_blocks, kv_indices = _dense_to_ordered(partial_dense)
    if full_dense is None:
        full_kv_num_blocks, full_kv_indices = None, None
    else:
        full_kv_num_blocks, full_kv_indices = _dense_to_ordered(full_dense)

    return BlockMask.from_kv_blocks(
        kv_num_blocks,
        kv_indices,
        full_kv_num_blocks,
        full_kv_indices,
        BLOCK_SIZE=(Q_BLOCK_SIZE, KV_BLOCK_SIZE),
        mask_mod=mask_mod,
        seq_lengths=seq_lengths,
    )
def create_block_mask(
    mask_mod: _mask_mod_signature,
    B: int | None,
    H: int | None,
    Q_LEN: int,
    KV_LEN: int,
    device: DeviceLikeType | None = None,
    BLOCK_SIZE: int | tuple[int, int] = _DEFAULT_SPARSE_BLOCK_SIZE,
    _compile=False,
) -> BlockMask:
    r"""This function creates a BlockMask from a mask_mod function.

    Args:
        mask_mod (Callable): mask_mod function. This is a callable that defines the
            masking pattern for the attention mechanism. It takes four arguments:
            b (batch index), h (head index), q_idx (query index), and kv_idx (key/value index).
            It should return a boolean tensor indicating which attention connections are allowed (True)
            or masked out (False).
        B (int, optional): Batch size. ``None`` is treated as 1.
        H (int, optional): Number of query heads. ``None`` is treated as 1.
        Q_LEN (int): Sequence length of query.
        KV_LEN (int): Sequence length of key/value.
        device (optional): Device to run the mask creation on. If ``None``, the
            current accelerator is used, falling back to ``"cpu"``.
        BLOCK_SIZE (int or tuple[int, int]): Block size for the block mask. If a single int is
            provided it is used for both query and key/value; a tuple is unpacked as
            ``(Q_BLOCK_SIZE, KV_BLOCK_SIZE)``.
        _compile (bool): Deprecated. When true, a ``DeprecationWarning`` is emitted and the
            call is forwarded to ``torch.compile(create_block_mask)``.

    Returns:
        BlockMask:  A BlockMask object that contains the block mask information.

    Raises:
        AssertionError: If ``mask_mod`` is not recognised as a mask_mod function.

    Example Usage:
        .. code-block:: python

            def causal_mask(b, h, q_idx, kv_idx):
                return q_idx >= kv_idx


            block_mask = create_block_mask(causal_mask, 1, 1, 8192, 8192, device="cuda")
            query = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
            key = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
            value = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
            output = flex_attention(query, key, value, block_mask=block_mask)
    """
    if device is None:
        device = torch.accelerator.current_accelerator() or "cpu"
    mod_type = _get_mod_type(mask_mod)
    assert mod_type == _ModificationType.MASK_MOD, (
        f"create_block_mask requires a mask_mod function! Got {mask_mod}"
    )
    if B is None:
        B = 1
    if H is None:
        H = 1
    if isinstance(BLOCK_SIZE, int):
        Q_BLOCK_SIZE = BLOCK_SIZE
        KV_BLOCK_SIZE = BLOCK_SIZE
    else:
        Q_BLOCK_SIZE, KV_BLOCK_SIZE = BLOCK_SIZE

    if _compile:
        warnings.warn(
            "_compile flag on create_block_mask was originally added to work around a torch.compile limitation. "
            "That limitation has since been addressed. "
            "So, to compile create_block_mask, we suggest doing torch.compile(create_block_mask). "
            "This still works for now, but will be removed in the future.",
            DeprecationWarning,
            stacklevel=2,
        )
        # _compile is deliberately not forwarded, so the compiled call takes
        # the eager path below instead of recursing into this branch.
        return torch.compile(create_block_mask)(
            mask_mod, B, H, Q_LEN, KV_LEN, device, BLOCK_SIZE
        )

    # Materialize the dense mask, then reduce it to per-block partial/full flags.
    mask_tensor = create_mask(mask_mod, B, H, Q_LEN, KV_LEN, device)
    partial_block_mask, full_block_mask = _convert_mask_to_block_mask(
        mask_tensor,
        Q_BLOCK_SIZE=Q_BLOCK_SIZE,
        KV_BLOCK_SIZE=KV_BLOCK_SIZE,
        separate_full_blocks=True,
    )
    block_mask = _create_sparse_block_from_block_mask(
        (partial_block_mask, full_block_mask),
        mask_mod,
        (Q_LEN, KV_LEN),
        Q_BLOCK_SIZE,
        KV_BLOCK_SIZE,
    )
    return block_mask
+ ) + + if "BACKEND" in kernel_options: + valid_backends = typing.get_args(_Backend) + if kernel_options["BACKEND"] not in valid_backends: + raise ValueError( + f"Invalid BACKEND value '{kernel_options['BACKEND']}'. " + f"Must be one of {valid_backends}" + ) + + kernel_options.setdefault("BACKEND", "AUTO") + kernel_options.setdefault("PRESCALE_QK", False) + kernel_options.setdefault("ROWS_GUARANTEED_SAFE", False) + kernel_options.setdefault("BLOCKS_ARE_CONTIGUOUS", False) + # This forces all biases grad scatters to be done in the DQ iteration loop of the backwards + kernel_options.setdefault("WRITE_DQ", True) + + any_inputs_on_cpu_device = ( + query.device.type == "cpu" + or key.device.type == "cpu" + or value.device.type == "cpu" + ) + + # Determine what auxiliary outputs are needed + output_lse = return_lse + output_max = False + + if return_aux is not None: + # New API takes precedence over legacy parameters + output_lse = return_aux.lse + output_max = return_aux.max_scores + + # If forward kernel needs to return logsumexp is decided by this rule internally. + assert "OUTPUT_LOGSUMEXP" not in kernel_options + kernel_options["OUTPUT_LOGSUMEXP"] = True + if not output_lse: + # We used to check if q,k,v required grads but since captured buffers can require grad + # we always write unless in no_grad + kernel_options["OUTPUT_LOGSUMEXP"] = torch.is_grad_enabled() + if any_inputs_on_cpu_device: + # CPU with torch.compile now supports inference, and will not return lse + # TODO: support CPU for training and return lse + kernel_options["OUTPUT_LOGSUMEXP"] = False + + # If forward kernel needs to return max is decided by this rule internally. 
+ assert "OUTPUT_MAX" not in kernel_options + kernel_options["OUTPUT_MAX"] = output_max + if any_inputs_on_cpu_device and output_max: + # CPU doesn't support returning max yet + # TODO: support CPU for returning max + raise NotImplementedError("Returning max scores is not supported on CPU.") + kernel_options["OUTPUT_MAX"] = False + + return kernel_options + + +def _validate_embed_dim(query: Tensor, key: Tensor, value: Tensor) -> None: + if query.size(-1) != key.size(-1): + raise ValueError( + f"Expect query and key/value to have the same embedding dimension " + f"but got E={query.size(-1)} and E={key.size(-1)}." + ) + + +def _validate_device(query: Tensor, key: Tensor, value: Tensor) -> None: + """TODO: Remove once non cuda/cpu devices support is added + We only need to check query since we have already that q,k,v are on the same device + """ + supported_devices = {"cuda", "cpu", "xpu", "hpu"} + if query.device.type not in supported_devices: + raise ValueError( + "FlexAttention is only supported on CUDA, CPU or HPU devices. " + f"Found input tensors on {query.device.type} device." + ) + + +def _enforce_mem_layouts( + query: Tensor, key: Tensor, value: Tensor +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Enforce memory layouts for query, key, and value tensors. + + For non-FP8 dtypes, no action is taken. + + For FP8 dtypes, we enforce the following memory layouts: + - Query tensor must be in row-major memory layout, as it will be the left-operand in the FP8 GEMM `q @ k.T`. + - Key tensor must be in row-major memory layout, as it will be transposed when used as the right-operand + in the FP8 GEMM `q @ k.T`, meaning it will correctly be in column-major memory layout for the GEMM. + - Value tensor must be in column-major memory layout, as it will be the right-operand in the FP8 GEMM `softmax_scores @ v`. + + Returns the query, key, and value tensors with the enforced memory layouts. 
+ """ + + def is_row_major(tensor: Tensor) -> bool: + return tensor.stride()[-1] == 1 + + def is_col_major(tensor: Tensor) -> bool: + return tensor.stride()[-2] == 1 + + # These memory layout constraint are only for FP8 GEMMs on NVIDIA GPU architectures >= SM89 and < SM100. + # This is because GPU arch < SM89 does not not support FP8 GEMMs, and + # SM100 has support for TN, NT, TT, NN layouts for FP8 GEMMs + # (i.e., left and right operands can be in row or column major layouts) + # so this check is only needed for older architectures. + # See: https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/blackwell_functionality.md + fp8_dtypes = ( + torch.float8_e4m3fn, + torch.float8_e5m2, + ) + gemm_precision = query.dtype + + should_enforce_mem_layout = ( + gemm_precision in fp8_dtypes + and torch.version.cuda is not None + and torch.cuda.get_device_capability("cuda") >= (8, 9) + and torch.cuda.get_device_capability("cuda") < (10, 0) + ) + if not should_enforce_mem_layout: + return query, key, value + + # Query must be in row-major memory layout as the left-operand in the FP8 GEMM `q @ k.T` + if not is_row_major(query): + query = query.contiguous() + + # Key must be in row-major memory layout as it will be transposed when used as the right-operand + # in the FP8 GEMM `q @ k.T`, meaning it will correctly be in column-major memory layout for the GEMM. 
+ if not is_row_major(key): + key = key.contiguous() + + # Value must be in column-major memory layout as the right-operand in the FP8 GEMM `softmax_scores @ v` + if not is_col_major(value): + value = value.transpose(-2, -1).contiguous().transpose(-2, -1) + return query, key, value + + +def flex_attention( + query: Tensor, + key: Tensor, + value: Tensor, + score_mod: _score_mod_signature | None = None, + block_mask: BlockMask | None = None, + scale: float | None = None, + enable_gqa: bool = False, + return_lse: bool = False, + kernel_options: FlexKernelOptions | None = None, + *, + return_aux: AuxRequest | None = None, +) -> Tensor | tuple[Tensor, Tensor] | tuple[Tensor, AuxOutput]: + r"""This function implements scaled dot product attention with an arbitrary attention score modification function. + + This function computes the scaled dot product attention between query, key, and value tensors with a user-defined + attention score modification function. The attention score modification function will be applied after the attention + scores have been calculated between the query and key tensors. The attention scores are calculated as follows: + + The ``score_mod`` function should have the following signature: + + .. code-block:: python + + def score_mod( + score: Tensor, + batch: Tensor, + head: Tensor, + q_idx: Tensor, + k_idx: Tensor + ) -> Tensor: + + Where: + - ``score``: A scalar tensor representing the attention score, + with the same data type and device as the query, key, and value tensors. + - ``batch``, ``head``, ``q_idx``, ``k_idx``: Scalar tensors indicating + the batch index, query head index, query index, and key/value index, respectively. + These should have the ``torch.int`` data type and be located on the same device as the score tensor. + + Args: + query (Tensor): Query tensor; shape :math:`(B, Hq, L, E)`. For FP8 dtypes, should be in row-major memory layout for optimal performance. + key (Tensor): Key tensor; shape :math:`(B, Hkv, S, E)`. 
For FP8 dtypes, should be in row-major memory layout for optimal performance. + value (Tensor): Value tensor; shape :math:`(B, Hkv, S, Ev)`. For FP8 dtypes, should be in column-major memory layout for optimal performance. + score_mod (Optional[Callable]): Function to modify attention scores. By default no score_mod is applied. + block_mask (Optional[BlockMask]): BlockMask object that controls the blocksparsity pattern of the attention. + scale (Optional[float]): Scaling factor applied prior to softmax. If none, the default value is set to :math:`\frac{1}{\sqrt{E}}`. + enable_gqa (bool): If set to True, enables Grouped Query Attention (GQA) and broadcasts key/value heads to query heads. + return_lse (bool): Whether to return the logsumexp of the attention scores. Default is False. **Deprecated**: Use ``return_aux=AuxRequest(lse=True)`` instead. + kernel_options (Optional[FlexKernelOptions]): + Options to control the behavior of the underlying Triton kernels. + See :class:`FlexKernelOptions` for available options and usage examples. + return_aux (Optional[AuxRequest]): Specifies which auxiliary outputs to compute and return. + If None, only the attention output is returned. Use ``AuxRequest(lse=True, max_scores=True)`` + to request both auxiliary outputs. + + Returns: + output (Tensor): Attention output; shape :math:`(B, Hq, L, Ev)`. + + When ``return_aux`` is not None: + aux (AuxOutput): Auxiliary outputs with requested fields populated. + + When ``return_aux`` is None (deprecated paths): + lse (Tensor): Log-sum-exp of attention scores; shape :math:`(B, Hq, L)`. Only returned if ``return_lse=True``. + + Shape legend: + - :math:`N: \text{Batch size} ... : \text{Any number of other batch dimensions (optional)}` + - :math:`S: \text{Source sequence length}` + - :math:`L: \text{Target sequence length}` + - :math:`E: \text{Embedding dimension of the query and key}` + - :math:`Ev: \text{Embedding dimension of the value}` + + .. 
warning:: + `torch.nn.attention.flex_attention` is a prototype feature in PyTorch. + Please look forward to a more stable implementation in a future version of PyTorch. + Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype + + """ + # Some basic input validation + _validate_sdpa_input(query, key, value) + _validate_embed_dim(query, key, value) + _validate_device(query, key, value) + query, key, value = _enforce_mem_layouts(query, key, value) + if query.dim() != 4 or key.dim() != 4 or value.dim() != 4: + raise NotImplementedError("NYI: query, key, and value must be 4D tensors") + if (not enable_gqa) and query.size(-3) != key.size(-3): + raise ValueError( + f"Expect query and key/value to have the same number of heads " + f"but got Hq={query.size(-3)} and Hkv={key.size(-3)}. " + f"Try setting enable_gqa=True for GQA." + ) + if enable_gqa: + Hq = query.size(1) + Hkv = key.size(1) + if Hq % Hkv != 0: + raise ValueError( + f"Expect number of query heads to be a multiple of kv heads for GQA " + f"but got Hq={Hq} and Hkv={Hkv}." + ) + if query.size(0) != key.size(0): + if block_mask is None: + raise ValueError( + f"Expect query and key/value to have the same batch size, " + f"or non-none block_mask, " + f"but got block_mask=None, Bq={query.size(0)}, and Bkv={key.size(0)}." + ) + + if block_mask.kv_num_blocks.size(0) != query.size(0): + raise ValueError( + f"Expect query and key/value to have the same batch size, " + f"or block_mask and query to have the same batch size, " + f"but got Bq={query.size(0)}, Bkv={key.size(0)}, B_block_mask={block_mask.kv_num_blocks.size(0)}." + ) + + if score_mod is None: + score_mod = _identity + + if block_mask is None: + block_mask = _create_empty_block_mask(query, key) + + # If BlockMask was sliced, its mask_mod is intentionally replaced with an error-raising stub. + # This guard ensures we surface the intended error message before any shape-based checks. 
+ if getattr(block_mask, "mask_mod", None) is _sliced_mask_mod_error: + raise RuntimeError("Cannot use mask_mod from a sliced BlockMask") + + if ( + block_mask.BLOCK_SIZE[0] == _LARGE_SPARSE_BLOCK_SIZE + and block_mask.BLOCK_SIZE[1] == _LARGE_SPARSE_BLOCK_SIZE + ): + # This corresponds to the case where we essentially have a "no-op" block mask. + pass + else: + block_mask_q_len = block_mask.shape[-2] + block_mask_kv_len = block_mask.shape[-1] + if query.size(-2) > block_mask_q_len or key.size(-2) > block_mask_kv_len: + raise ValueError( + f"block_mask was created for block_mask.shape={block_mask.shape} but got q_len={query.size(-2)} and kv_len={key.size(-2)}. " + "As the block mask was created for a smaller length than you're using it for, you likely need to create a new block mask." + ) + elif ( + query.size(-2) < block_mask_q_len and key.size(-2) <= block_mask_kv_len + ) or (query.size(-2) <= block_mask_q_len and key.size(-2) < block_mask_kv_len): + raise ValueError( + f"block_mask was created for block_mask.shape={block_mask.shape} but got q_len={query.size(-2)} and kv_len={key.size(-2)}. " + "As the block mask was created for a larger length than you're using it for, you can either 1. create a new block mask with the correct length, or 2. 'adjust' the existing block mask to the correct length by calling block_mask._adjust(q_len, kv_len). This essentially 'crops' the block mask to the upper left corner, which does not work for all mask_mods!" + ) + assert query.size(-2) == block_mask_q_len + assert key.size(-2) == block_mask_kv_len + + if scale is None: + scale = 1.0 / math.sqrt(query.size(-1)) + + if query.device != block_mask.kv_num_blocks.device: # type: ignore[union-attr] + raise RuntimeError( + f"Expect q/k/v and block_mask to be on the same device " + f"but got {query.device} and {block_mask.kv_num_blocks.device}." 
# type: ignore[union-attr] + ) + + # Handle deprecation warnings for old parameters + if return_lse and return_aux is not None: + raise ValueError( + "Cannot specify both return_lse and return_aux. " + "return_lse is deprecated, please use return_aux=AuxRequest(lse=True) instead." + ) + elif return_lse and return_aux is None: + _warn_once( + "deprecated_return_lse", + "return_lse is deprecated and will be removed in v2.10. " + "Please use return_aux=AuxRequest(lse=True) instead.", + category=FutureWarning, + ) + + kernel_options = _apply_kernel_options( + query, + key, + value, + return_lse, + kernel_options, + return_aux, + ) + + def _finalize_outputs( + out, + lse, + max_scores, + *, + return_aux: AuxRequest | None, + return_lse: bool, + ): + """Normalize stats and build return value (aux-aware, legacy-compatible).""" + ln2 = math.log(2.0) + return_lse = return_lse or return_aux is not None and return_aux.lse + return_max = return_aux is not None and return_aux.max_scores + + lse_scaled = lse * ln2 if (return_lse and lse.numel() > 0) else None + max_scaled = ( + max_scores * ln2 if (return_max and max_scores.numel() > 0) else None + ) + + if return_aux is not None: + return out, AuxOutput( + lse=lse_scaled, + max_scores=max_scaled, + ) + + if return_lse: + return out, lse_scaled + + return out + + if torch.compiler.is_dynamo_compiling(): + # mark head_dim and number of heads to be static + for x in [query, key, value]: + torch._dynamo.mark_static(x, -3) + torch._dynamo.mark_static(x, -1) + + out, lse, max_scores = flex_attention_hop( + query, + key, + value, + score_mod, + block_mask.as_tuple(), + scale, + kernel_options, # type: ignore[union-attr] + ) + return _finalize_outputs( + out, lse, max_scores, return_aux=return_aux, return_lse=return_lse + ) + + if not _FLEX_ATTENTION_DISABLE_COMPILE_DEBUG: + _warn_once( + warning_id="flex_attention_performance", + message=( + "flex_attention called without torch.compile() - this will use an unfused implementation that 
materializes the full scores matrix instead of generating a fused kernel.\n\n" + "SOLUTION: Use torch.compile(flex_attention)(...)\n\n" + "If you want to debug your score_mod/mask_mod, you can set:\n" + "torch.nn.attention.flex_attention._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = True\n\n" + "This will allow you to use print statements or breakpoints. Note: This doesn't work with the backwards pass and may produce incorrect results." + ), + ) + + if not torch._dynamo.is_dynamo_supported(): + raise RuntimeError("flex_attention requires dynamo support") + + from torch._dynamo.backends.debugging import ( + make_eager_backend_with_torch_function_mode, + ) + + # Dynamo is expecting a callable with "__code__" attribute. + # We cannot directly pass hop to it. So we wrap it in a dummy function. + def _flex_attention_hop_wrapper(*args, **kwargs): + return flex_attention_hop(*args, **kwargs) + + with _set_compilation_env(): + with torch._dynamo.utils.disable_cache_limit(): + with _temp_remove_pre_dispatch_torch_function_mode(): + with _temp_remove_metadata_torch_function_mode() as metadata_mode: + if metadata_mode: + backend: str | Callable[..., Any] = ( + make_eager_backend_with_torch_function_mode(metadata_mode) + ) + else: + backend = "eager" + + if _FLEX_ATTENTION_DISABLE_COMPILE_DEBUG: + flex_fn = _flex_attention_hop_wrapper + else: + flex_fn = torch.compile( + _flex_attention_hop_wrapper, backend=backend, fullgraph=True + ) + + out, lse, max_scores = flex_fn( + query, + key, + value, + score_mod, + block_mask.as_tuple(), # type: ignore[union-attr] + scale, + kernel_options, + ) + return _finalize_outputs( + out, lse, max_scores, return_aux=return_aux, return_lse=return_lse + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py new file mode 100644 index 0000000000000000000000000000000000000000..b20c1b4b2e49a37cf0e29603f20ef50e0caf6146 
--- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py @@ -0,0 +1,326 @@ +""" +Variable-length attention implementation using Flash Attention. + +This module provides a high-level Python interface for variable-length attention +that calls into the optimized Flash Attention kernels. +""" + +import logging +from functools import lru_cache +from typing import Any, NamedTuple + +import torch + + +log = logging.getLogger(__name__) + +__all__ = ["varlen_attn", "AuxRequest"] + + +@lru_cache(maxsize=8) +def _should_use_cudnn(device_index: int) -> bool: + """Cache device capability check to avoid repeated CUDA calls.""" + return False + + +class AuxRequest(NamedTuple): + """ + Request which auxiliary outputs to compute from varlen_attn. + + Each field is a boolean indicating whether that auxiliary output should be computed. + """ + + lse: bool = False + + +@torch.library.custom_op("torch_attn::_varlen_attn", mutates_args={}) +def _varlen_attn( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seq_q: torch.Tensor, + cu_seq_k: torch.Tensor, + max_q: int, + max_k: int, + is_causal: bool = False, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Private custom op for variable-length attention. + + This is the internal implementation. Users should use the public varlen_attn function instead. 
+ """ + + use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index) + + if use_cudnn: + log.info("Using cuDNN backend for varlen_attn") + result = torch.ops.aten._cudnn_attention_forward( + query, + key, + value, + None, # attn_bias + cu_seq_q, + cu_seq_k, + max_q, + max_k, + True, # compute_log_sumexp + 0.0, # dropout_p hardcoded to 0.0 + is_causal, + False, # return_debug_mask + ) + # cuDNN returns: (output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, philox_seed, philox_offset, debug_attn_mask) + output, softmax_lse, rng_state = result[0], result[1], result[6] + else: + log.info("Using Flash Attention backend for varlen_attn") + output, softmax_lse, rng_state, _, _ = torch.ops.aten._flash_attention_forward( + query, + key, + value, + cu_seq_q, + cu_seq_k, + max_q, + max_k, + 0.0, # dropout_p hardcoded to 0.0 + is_causal, + return_debug_mask=False, + ) + + rng_state_ = torch.zeros( + (2,), dtype=torch.uint64, device=query.device + ) # hardcoded since dropout is hardcoded to 0 + return output, softmax_lse, rng_state_ + + +@_varlen_attn.register_fake +def _varlen_attn_fake( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seq_q: torch.Tensor, + cu_seq_k: torch.Tensor, + max_q: int, + max_k: int, + is_causal: bool = False, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Fake implementation for meta tensor computation and tracing. 
+ + Based on the 3D varlen path from meta__flash_attention_forward: + - query shape: (total, num_heads, head_dim) + - logsumexp shape: (num_heads, total_q) + """ + # Output has same shape as query + output = torch.empty_like(query) + + # For varlen path: logsumexp shape is (num_heads, total_q) + total_q = query.size(0) + num_heads = query.size(1) + logsumexp = torch.empty( + (num_heads, total_q), dtype=torch.float, device=query.device + ) + + rng_state = torch.empty((2,), dtype=torch.uint64, device=query.device) + + return output, logsumexp, rng_state + + +def varlen_attn( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seq_q: torch.Tensor, + cu_seq_k: torch.Tensor, + max_q: int, + max_k: int, + is_causal: bool = False, + return_aux: AuxRequest | None = None, +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """ + Compute variable-length attention using Flash Attention. + This function is similar to scaled_dot_product_attention but optimized for + variable-length sequences using cumulative sequence position tensors. + Args: + - query (Tensor): Query tensor; shape :math:`(T_q, H, D)` + - key (Tensor): Key tensor; shape :math:`(T_k, H, D)` + - value (Tensor): Value tensor; shape :math:`(T_k, H, D)` + - cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)` + - cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)` + - max_q (int): Maximum query sequence length in the batch. + - max_k (int): Maximum key/value sequence length in the batch. + - is_causal (bool, optional): If set to True, applies causal masking (default: False). + - return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor. 
+ + Shape legend: + - :math:`N`: Batch size + - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths) + - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths) + - :math:`H`: Number of attention heads + - :math:`D`: Head dimension + + Returns: + - Tensor: Output tensor from attention computation + - If ``return_aux`` is not None and ``return_aux.lse`` is True, returns a tuple of Tensors: + (output, lse), where lse is the logsumexp + + Example:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16 + >>> head_dim = embed_dim // num_heads + >>> seq_lengths = [] + >>> for _ in range(batch_size): + ... length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64 + ... seq_lengths.append(min(length, max_seq_len)) + >>> seq_lengths = torch.tensor(seq_lengths, device="cuda") + >>> total_tokens = seq_lengths.sum().item() + >>> + >>> # Create packed query, key, value tensors + >>> query = torch.randn( + ... total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda" + ... ) + >>> key = torch.randn( + ... total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda" + ... ) + >>> value = torch.randn( + ... total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda" + ... ) + >>> + >>> # Build cumulative sequence tensor + >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + >>> cu_seq[1:] = seq_lengths.cumsum(0) + >>> max_len = seq_lengths.max().item() + >>> + >>> # Call varlen_attn + >>> output = varlen_attn( + ... query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False + ... 
) + """ + out, lse, _ = torch.ops.torch_attn._varlen_attn( + query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal + ) + if return_aux is not None and return_aux.lse: + return out, lse + return out + + +def _setup_context(ctx: Any, inputs: tuple[Any, ...], output: Any) -> None: + query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal = inputs + out, lse, rng_state = output + + ctx.save_for_backward(query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state) + + ctx.max_q = max_q + ctx.max_k = max_k + ctx.is_causal = is_causal + + +@torch.library.custom_op("torch_attn::_varlen_attn_backward", mutates_args={}) +def _varlen_attn_backward( + grad_out: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + lse: torch.Tensor, + cu_seq_q: torch.Tensor, + cu_seq_k: torch.Tensor, + max_q: int, + max_k: int, + is_causal: bool, + rng_state: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + unused = torch.empty(0, device=query.device) + + use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index) + if use_cudnn: + log.info("Using cuDNN backend for varlen_attn") + dq, dk, dv = torch.ops.aten._cudnn_attention_backward( + grad_out, + query, + key, + value, + out, + lse, + cu_seq_q, + cu_seq_k, + max_q, + max_k, + 0.0, + is_causal, + rng_state, + unused, + ) + else: + log.info("Using Flash Attention backend for varlen_attn") + dq, dk, dv = torch.ops.aten._flash_attention_backward( + grad_out, + query, + key, + value, + out, + lse, + cu_seq_q, + cu_seq_k, + max_q, + max_k, + 0.0, + is_causal, + rng_state, + unused, + ) + return dq, dk, dv + + +@_varlen_attn_backward.register_fake +def _varlen_attn_backward_fake( + grad_out: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + lse: torch.Tensor, + cu_seq_q: torch.Tensor, + cu_seq_k: torch.Tensor, + max_q: int, + max_k: int, + is_causal: bool, + rng_state: torch.Tensor, +) -> 
tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Fake implementation for meta tensor computation and tracing. + """ + + grad_query = torch.empty_like(query) + grad_key = torch.empty_like(key) + grad_value = torch.empty_like(value) + + return grad_query, grad_key, grad_value + + +def _backward( + ctx: Any, grad_out: torch.Tensor, grad_lse: torch.Tensor, grad_rng: torch.Tensor +) -> tuple[torch.Tensor | None, ...]: + query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state = ctx.saved_tensors + + max_q = ctx.max_q + max_k = ctx.max_k + is_causal = ctx.is_causal + + dq, dk, dv = torch.ops.torch_attn._varlen_attn_backward( + grad_out, + query, + key, + value, + out, + lse, + cu_seq_q, + cu_seq_k, + max_q, + max_k, + is_causal, + rng_state, + ) + return dq, dk, dv, None, None, None, None, None, None + + +_varlen_attn.register_autograd(_backward, setup_context=_setup_context) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/thnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/thnn.py new file mode 100644 index 0000000000000000000000000000000000000000..c56e923a84383a79c2a3f7ebddb3dfa1ce1f0953 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/thnn.py @@ -0,0 +1,6 @@ +# mypy: allow-untyped-defs +# this is for historical pickle deserialization, it is not used otherwise + + +def _get_thnn_function_backend() -> None: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/intrinsic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/intrinsic/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..fe9a09aa31464fd4e88b2e46b4210561a70e42e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/intrinsic/__init__.py @@ -0,0 +1,36 @@ +from torch.ao.nn.intrinsic import ( + BNReLU2d, + BNReLU3d, + ConvBn1d, + ConvBn2d, + ConvBn3d, + ConvBnReLU1d, + ConvBnReLU2d, + ConvBnReLU3d, + ConvReLU1d, + ConvReLU2d, + ConvReLU3d, + LinearBn1d, + LinearReLU, +) +from torch.ao.nn.intrinsic.modules.fused import _FusedModule # noqa: F401 + +# Include the subpackages in case user imports from it directly +from torch.nn.intrinsic import modules, qat, quantized # noqa: F401 + + +__all__ = [ + "ConvBn1d", + "ConvBn2d", + "ConvBn3d", + "ConvBnReLU1d", + "ConvBnReLU2d", + "ConvBnReLU3d", + "ConvReLU1d", + "ConvReLU2d", + "ConvReLU3d", + "LinearReLU", + "BNReLU2d", + "BNReLU3d", + "LinearBn1d", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..99260ad43fc477c36a9780c057824f57d4914719 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/__init__.py @@ -0,0 +1,334 @@ +from .module import Module # usort: skip +from .linear import Bilinear, Identity, LazyLinear, Linear # usort: skip +from .activation import ( + CELU, + ELU, + GELU, + GLU, + Hardshrink, + Hardsigmoid, + Hardswish, + Hardtanh, + LeakyReLU, + LogSigmoid, + LogSoftmax, + Mish, + MultiheadAttention, + PReLU, + ReLU, + ReLU6, + RReLU, + SELU, + Sigmoid, + SiLU, + Softmax, + Softmax2d, + Softmin, + Softplus, + Softshrink, + Softsign, + Tanh, + Tanhshrink, + Threshold, +) +from .adaptive import AdaptiveLogSoftmaxWithLoss +from .batchnorm import ( + BatchNorm1d, + BatchNorm2d, + BatchNorm3d, + LazyBatchNorm1d, + LazyBatchNorm2d, + LazyBatchNorm3d, + SyncBatchNorm, +) +from .channelshuffle import 
ChannelShuffle +from .container import ( + Container, + ModuleDict, + ModuleList, + ParameterDict, + ParameterList, + Sequential, +) +from .conv import ( + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, + LazyConv1d, + LazyConv2d, + LazyConv3d, + LazyConvTranspose1d, + LazyConvTranspose2d, + LazyConvTranspose3d, +) +from .distance import CosineSimilarity, PairwiseDistance +from .dropout import ( + AlphaDropout, + Dropout, + Dropout1d, + Dropout2d, + Dropout3d, + FeatureAlphaDropout, +) +from .flatten import Flatten, Unflatten +from .fold import Fold, Unfold +from .instancenorm import ( + InstanceNorm1d, + InstanceNorm2d, + InstanceNorm3d, + LazyInstanceNorm1d, + LazyInstanceNorm2d, + LazyInstanceNorm3d, +) +from .loss import ( + BCELoss, + BCEWithLogitsLoss, + CosineEmbeddingLoss, + CrossEntropyLoss, + CTCLoss, + GaussianNLLLoss, + HingeEmbeddingLoss, + HuberLoss, + KLDivLoss, + L1Loss, + MarginRankingLoss, + MSELoss, + MultiLabelMarginLoss, + MultiLabelSoftMarginLoss, + MultiMarginLoss, + NLLLoss, + NLLLoss2d, + PoissonNLLLoss, + SmoothL1Loss, + SoftMarginLoss, + TripletMarginLoss, + TripletMarginWithDistanceLoss, +) +from .normalization import ( + CrossMapLRN2d, + GroupNorm, + LayerNorm, + LocalResponseNorm, + RMSNorm, +) +from .padding import ( + CircularPad1d, + CircularPad2d, + CircularPad3d, + ConstantPad1d, + ConstantPad2d, + ConstantPad3d, + ReflectionPad1d, + ReflectionPad2d, + ReflectionPad3d, + ReplicationPad1d, + ReplicationPad2d, + ReplicationPad3d, + ZeroPad1d, + ZeroPad2d, + ZeroPad3d, +) +from .pixelshuffle import PixelShuffle, PixelUnshuffle +from .pooling import ( + AdaptiveAvgPool1d, + AdaptiveAvgPool2d, + AdaptiveAvgPool3d, + AdaptiveMaxPool1d, + AdaptiveMaxPool2d, + AdaptiveMaxPool3d, + AvgPool1d, + AvgPool2d, + AvgPool3d, + FractionalMaxPool2d, + FractionalMaxPool3d, + LPPool1d, + LPPool2d, + LPPool3d, + MaxPool1d, + MaxPool2d, + MaxPool3d, + MaxUnpool1d, + MaxUnpool2d, + MaxUnpool3d, +) +from .rnn import 
GRU, GRUCell, LSTM, LSTMCell, RNN, RNNBase, RNNCell, RNNCellBase +from .sparse import Embedding, EmbeddingBag +from .transformer import ( + Transformer, + TransformerDecoder, + TransformerDecoderLayer, + TransformerEncoder, + TransformerEncoderLayer, +) +from .upsampling import Upsample, UpsamplingBilinear2d, UpsamplingNearest2d + + +__all__ = [ + "AdaptiveAvgPool1d", + "AdaptiveAvgPool2d", + "AdaptiveAvgPool3d", + "AdaptiveLogSoftmaxWithLoss", + "AdaptiveMaxPool1d", + "AdaptiveMaxPool2d", + "AdaptiveMaxPool3d", + "AlphaDropout", + "AvgPool1d", + "AvgPool2d", + "AvgPool3d", + "BCELoss", + "BCEWithLogitsLoss", + "BatchNorm1d", + "BatchNorm2d", + "BatchNorm3d", + "Bilinear", + "CELU", + "CTCLoss", + "ChannelShuffle", + "CircularPad1d", + "CircularPad2d", + "CircularPad3d", + "ConstantPad1d", + "ConstantPad2d", + "ConstantPad3d", + "Container", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "CosineEmbeddingLoss", + "CosineSimilarity", + "CrossEntropyLoss", + "CrossMapLRN2d", + "Dropout", + "Dropout1d", + "Dropout2d", + "Dropout3d", + "ELU", + "Embedding", + "EmbeddingBag", + "FeatureAlphaDropout", + "Flatten", + "Fold", + "FractionalMaxPool2d", + "FractionalMaxPool3d", + "GELU", + "GLU", + "GRU", + "GRUCell", + "GaussianNLLLoss", + "GroupNorm", + "Hardshrink", + "Hardsigmoid", + "Hardswish", + "Hardtanh", + "HingeEmbeddingLoss", + "HuberLoss", + "Identity", + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", + "KLDivLoss", + "L1Loss", + "LPPool1d", + "LPPool2d", + "LPPool3d", + "LSTM", + "LSTMCell", + "LayerNorm", + "LazyBatchNorm1d", + "LazyBatchNorm2d", + "LazyBatchNorm3d", + "LazyConv1d", + "LazyConv2d", + "LazyConv3d", + "LazyConvTranspose1d", + "LazyConvTranspose2d", + "LazyConvTranspose3d", + "LazyInstanceNorm1d", + "LazyInstanceNorm2d", + "LazyInstanceNorm3d", + "LazyLinear", + "LeakyReLU", + "Linear", + "LocalResponseNorm", + "LogSigmoid", + "LogSoftmax", + "MSELoss", + "MarginRankingLoss", + 
"MaxPool1d", + "MaxPool2d", + "MaxPool3d", + "MaxUnpool1d", + "MaxUnpool2d", + "MaxUnpool3d", + "Mish", + "Module", + "ModuleDict", + "ModuleList", + "MultiLabelMarginLoss", + "MultiLabelSoftMarginLoss", + "MultiMarginLoss", + "MultiheadAttention", + "NLLLoss", + "NLLLoss2d", + "PReLU", + "PairwiseDistance", + "ParameterDict", + "ParameterList", + "PixelShuffle", + "PixelUnshuffle", + "PoissonNLLLoss", + "RMSNorm", + "RNN", + "RNNBase", + "RNNCell", + "RNNCellBase", + "RReLU", + "ReLU", + "ReLU6", + "ReflectionPad1d", + "ReflectionPad2d", + "ReflectionPad3d", + "ReplicationPad1d", + "ReplicationPad2d", + "ReplicationPad3d", + "SELU", + "Sequential", + "SiLU", + "Sigmoid", + "SmoothL1Loss", + "SoftMarginLoss", + "Softmax", + "Softmax2d", + "Softmin", + "Softplus", + "Softshrink", + "Softsign", + "SyncBatchNorm", + "Tanh", + "Tanhshrink", + "Threshold", + "Transformer", + "TransformerDecoder", + "TransformerDecoderLayer", + "TransformerEncoder", + "TransformerEncoderLayer", + "TripletMarginLoss", + "TripletMarginWithDistanceLoss", + "Unflatten", + "Unfold", + "Upsample", + "UpsamplingBilinear2d", + "UpsamplingNearest2d", + "ZeroPad1d", + "ZeroPad2d", + "ZeroPad3d", +] + +# Please keep this list sorted +assert __all__ == sorted(__all__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/_functions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..408e6ef42f12843ddbfc38d540fc68e454c9e958 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/_functions.py @@ -0,0 +1,319 @@ +# mypy: allow-untyped-defs +import torch +import torch.distributed as dist +from torch.autograd.function import Function + + +class SyncBatchNorm(Function): + @staticmethod + # pyrefly: ignore [bad-override] + def forward( + self, + input, + weight, + bias, + running_mean, + running_var, + eps, + momentum, + 
process_group, + world_size, + ): + if not ( + input.is_contiguous(memory_format=torch.channels_last) + or input.is_contiguous(memory_format=torch.channels_last_3d) + ): + input = input.contiguous() + if weight is not None: + weight = weight.contiguous() + + size = int(input.numel() // input.size(1)) + if size == 1 and world_size < 2: + raise ValueError( + f"Expected more than 1 value per channel when training, got input size {size}" + ) + + num_channels = input.shape[1] + if input.numel() > 0: + # calculate mean/invstd for input. + mean, invstd = torch.batch_norm_stats(input, eps) + + count = torch.full( + (1,), + input.numel() // input.size(1), + dtype=mean.dtype, + device=mean.device, + ) + + # C, C, 1 -> (2C + 1) + combined = torch.cat([mean, invstd, count], dim=0) + else: + # for empty input, set stats and the count to zero. The stats with + # zero count will be filtered out later when computing global mean + # & invstd, but they still needs to participate the all_gather + # collective communication to unblock other peer processes. + combined = torch.zeros( + 2 * num_channels + 1, dtype=input.dtype, device=input.device + ) + + # Use allgather instead of allreduce because count could be different across + # ranks, simple all reduce op can not give correct results. + # batch_norm_gather_stats_with_counts calculates global mean & invstd based on + # all gathered mean, invstd and count. + # for nccl backend, use the optimized version of all gather. + # The Gloo backend does not support `all_gather_into_tensor`. 
+ if process_group._get_backend_name() != "gloo": + # world_size * (2C + 1) + combined_size = combined.numel() + combined_flat = torch.empty( + 1, + combined_size * world_size, + dtype=combined.dtype, + device=combined.device, + ) + dist.all_gather_into_tensor( + combined_flat, combined, process_group, async_op=False + ) + combined = torch.reshape(combined_flat, (world_size, combined_size)) + # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1 + mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1) + else: + # world_size * (2C + 1) + combined_list = [torch.empty_like(combined) for _ in range(world_size)] + dist.all_gather(combined_list, combined, process_group, async_op=False) + combined = torch.stack(combined_list, dim=0) + # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1 + mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1) + + if not (torch.cuda.is_available() and torch.cuda.is_current_stream_capturing()): + # The lines below force a synchronization between CUDA and CPU, because + # the shape of the result count_all depends on the values in mask tensor. + # Such synchronizations break CUDA Graph capturing. + # See https://github.com/pytorch/pytorch/issues/78549 + # FIXME: https://github.com/pytorch/pytorch/issues/78656 describes + # a better longer-term solution. 
+ + # remove stats from empty inputs + mask = count_all.squeeze(-1) >= 1 + count_all = count_all[mask] + mean_all = mean_all[mask] + invstd_all = invstd_all[mask] + + # calculate global mean & invstd + counts = count_all.view(-1) + if running_mean is not None and counts.dtype != running_mean.dtype: + counts = counts.to(running_mean.dtype) + mean, invstd = torch.batch_norm_gather_stats_with_counts( + input, + mean_all, + invstd_all, + running_mean, + running_var, + momentum, + eps, + counts, + ) + + self.save_for_backward(input, weight, mean, invstd, count_all.to(torch.int32)) + self.process_group = process_group + + # apply element-wise normalization + if input.numel() > 0: + return torch.batch_norm_elemt(input, weight, bias, mean, invstd, eps) + else: + return torch.empty_like(input) + + @staticmethod + def backward(self, grad_output): + if not ( + grad_output.is_contiguous(memory_format=torch.channels_last) + or grad_output.is_contiguous(memory_format=torch.channels_last_3d) + ): + grad_output = grad_output.contiguous() + saved_input, weight, mean, invstd, count_tensor = self.saved_tensors + grad_input = grad_weight = grad_bias = None + process_group = self.process_group + + if saved_input.numel() > 0: + # calculate local stats as well as grad_weight / grad_bias + ( + sum_dy, + sum_dy_xmu, + grad_weight, + grad_bias, + ) = torch.batch_norm_backward_reduce( + grad_output, + saved_input, + mean, + invstd, + weight, + self.needs_input_grad[0], + self.needs_input_grad[1], + self.needs_input_grad[2], + ) + + if self.needs_input_grad[0]: + # synchronizing stats used to calculate input gradient. 
+ num_channels = sum_dy.shape[0] + combined = torch.cat([sum_dy, sum_dy_xmu], dim=0) + torch.distributed.all_reduce( + combined, + torch.distributed.ReduceOp.SUM, + process_group, + async_op=False, + ) + sum_dy, sum_dy_xmu = torch.split(combined, num_channels) + + # backward pass for gradient calculation + if weight is not None and weight.dtype != mean.dtype: + weight = weight.to(mean.dtype) + grad_input = torch.batch_norm_backward_elemt( + grad_output, + saved_input, + mean, + invstd, + weight, + sum_dy, + sum_dy_xmu, + count_tensor, + ) + # synchronizing of grad_weight / grad_bias is not needed as distributed + # training would handle all reduce. + if weight is None or not self.needs_input_grad[1]: + grad_weight = None + + if weight is None or not self.needs_input_grad[2]: + grad_bias = None + else: + # This process got an empty input tensor in the forward pass. + # Although this process can directly set grad_input as an empty + # tensor of zeros, it still needs to participate in the collective + # communication to unblock its peers, as other peer processes might + # have received non-empty inputs. + num_channels = saved_input.shape[1] + if self.needs_input_grad[0]: + # launch all_reduce to unblock other peer processes + combined = torch.zeros( + 2 * num_channels, dtype=saved_input.dtype, device=saved_input.device + ) + torch.distributed.all_reduce( + combined, + torch.distributed.ReduceOp.SUM, + process_group, + async_op=False, + ) + + # Leave grad_input, grad_weight and grad_bias as None, which will be + # interpreted by the autograd engine as Tensors full of zeros. 
+ + return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + + +class CrossMapLRN2d(Function): + @staticmethod + # pyrefly: ignore [bad-override] + def forward(ctx, input, size, alpha=1e-4, beta=0.75, k=1): + ctx.size = size + ctx.alpha = alpha + ctx.beta = beta + ctx.k = k + ctx.scale = None + + if input.dim() != 4: + raise ValueError( + f"CrossMapLRN2d: Expected input to be 4D, got {input.dim()}D instead." + ) + + ctx.scale = ctx.scale or input.new() + output = input.new() + channels = input.size(1) + + output.resize_as_(input) + ctx.scale.resize_as_(input) + + # use output storage as temporary buffer + input_square = output + torch.pow(input, 2, out=input_square) + + pre_pad = int((ctx.size - 1) / 2 + 1) + pre_pad_crop = min(pre_pad, channels) + + scale_first = ctx.scale.select(1, 0) + scale_first.zero_() + # compute first feature map normalization + for c in range(pre_pad_crop): + scale_first.add_(input_square.select(1, c)) + + # reuse computations for next feature maps normalization + # by adding the next feature map and removing the previous + for c in range(1, channels): + scale_previous = ctx.scale.select(1, c - 1) + scale_current = ctx.scale.select(1, c) + scale_current.copy_(scale_previous) + if c < channels - pre_pad + 1: + square_next = input_square.select(1, c + pre_pad - 1) + scale_current.add_(square_next, alpha=1) + + if c > pre_pad: + square_previous = input_square.select(1, c - pre_pad) + scale_current.add_(square_previous, alpha=-1) + + ctx.scale.mul_(ctx.alpha / ctx.size).add_(ctx.k) + + torch.pow(ctx.scale, -ctx.beta, out=output) + output.mul_(input) + + ctx.save_for_backward(input, output) + return output + + @staticmethod + # pyrefly: ignore [bad-override] + def backward(ctx, grad_output): + input, output = ctx.saved_tensors + grad_input = grad_output.new() + + batch_size = input.size(0) + channels = input.size(1) + input_height = input.size(2) + input_width = input.size(3) + + paddded_ratio = input.new(channels + 
ctx.size - 1, input_height, input_width) + accum_ratio = input.new(input_height, input_width) + + cache_ratio_value = 2 * ctx.alpha * ctx.beta / ctx.size + inversePrePad = int(ctx.size - (ctx.size - 1) / 2) + + grad_input.resize_as_(input) + torch.pow(ctx.scale, -ctx.beta, out=grad_input).mul_(grad_output) + + paddded_ratio.zero_() + padded_ratio_center = paddded_ratio.narrow(0, inversePrePad, channels) + for n in range(batch_size): + torch.mul(grad_output[n], output[n], out=padded_ratio_center) + padded_ratio_center.div_(ctx.scale[n]) + torch.sum( + paddded_ratio.narrow(0, 0, ctx.size - 1), + 0, + keepdim=False, + out=accum_ratio, + ) + for c in range(channels): + accum_ratio.add_(paddded_ratio[c + ctx.size - 1]) + grad_input[n][c].addcmul_( + input[n][c], accum_ratio, value=-cache_ratio_value + ) + accum_ratio.add_(paddded_ratio[c], alpha=-1) + + return grad_input, None, None, None, None + + +class BackwardHookFunction(torch.autograd.Function): + @staticmethod + # pyrefly: ignore [bad-override] + def forward(ctx, *args): + ctx.mark_non_differentiable(*[arg for arg in args if not arg.requires_grad]) + return args + + @staticmethod + def backward(ctx, *args): + return args diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/activation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..dac27cdb0d2464847a85e4ee8683326188875977 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/activation.py @@ -0,0 +1,1905 @@ +# mypy: allow-untyped-defs +import warnings + +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ +from torch.nn.parameter import Parameter + +from .linear import NonDynamicallyQuantizableLinear +from .module import Module + + +__all__ = [ + "Threshold", + "ReLU", + "RReLU", + 
"Hardtanh", + "ReLU6", + "Sigmoid", + "Hardsigmoid", + "Tanh", + "SiLU", + "Mish", + "Hardswish", + "ELU", + "CELU", + "SELU", + "GLU", + "GELU", + "Hardshrink", + "LeakyReLU", + "LogSigmoid", + "Softplus", + "Softshrink", + "MultiheadAttention", + "PReLU", + "Softsign", + "Tanhshrink", + "Softmin", + "Softmax", + "Softmax2d", + "LogSoftmax", +] + + +class Threshold(Module): + r"""Thresholds each element of the input Tensor. + + Threshold is defined as: + + .. math:: + y = + \begin{cases} + x, &\text{ if } x > \text{threshold} \\ + \text{value}, &\text{ otherwise } + \end{cases} + + Args: + threshold: The value to threshold at + value: The value to replace with + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Threshold.png + + Examples:: + + >>> m = nn.Threshold(0, 0.5) + >>> input = torch.arange(-3, 3) + >>> output = m(input) + """ + + __constants__ = ["threshold", "value", "inplace"] + + threshold: float + value: float + inplace: bool + + def __init__(self, threshold: float, value: float, inplace: bool = False) -> None: + super().__init__() + self.threshold = threshold + self.value = value + self.inplace = inplace + # TODO: check in THNN (if inplace == True, then assert value <= threshold) + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.threshold(input, self.threshold, self.value, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"threshold={self.threshold}, value={self.value}{inplace_str}" + + +class ReLU(Module): + r"""Applies the rectified linear unit function element-wise. + + :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)` + + Args: + inplace: can optionally do the operation in-place. 
Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ReLU.png + + Examples:: + + >>> m = nn.ReLU() + >>> input = torch.randn(2) + >>> output = m(input) + + + An implementation of CReLU - https://arxiv.org/abs/1603.05201 + + >>> m = nn.ReLU() + >>> input = torch.randn(2).unsqueeze(0) + >>> output = torch.cat((m(input), m(-input))) + """ + + __constants__ = ["inplace"] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.relu(input, inplace=self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class RReLU(Module): + r"""Applies the randomized leaky rectified linear unit function, element-wise. + + Method described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_. + + The function is defined as: + + .. math:: + \text{RReLU}(x) = + \begin{cases} + x & \text{if } x \geq 0 \\ + ax & \text{ otherwise } + \end{cases} + + where :math:`a` is randomly sampled from uniform distribution + :math:`\mathcal{U}(\text{lower}, \text{upper})` during training while during + evaluation :math:`a` is fixed with :math:`a = \frac{\text{lower} + \text{upper}}{2}`. + + Args: + lower: lower bound of the uniform distribution. Default: :math:`\frac{1}{8}` + upper: upper bound of the uniform distribution. Default: :math:`\frac{1}{3}` + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/RReLU.png + + Examples:: + + >>> m = nn.RReLU(0.1, 0.3) + >>> input = torch.randn(2) + >>> output = m(input) + + """ + + __constants__ = ["lower", "upper", "inplace"] + + lower: float + upper: float + inplace: bool + + def __init__( + self, lower: float = 1.0 / 8, upper: float = 1.0 / 3, inplace: bool = False + ) -> None: + super().__init__() + self.lower = lower + self.upper = upper + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.rrelu(input, self.lower, self.upper, self.training, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"lower={self.lower}, upper={self.upper}{inplace_str}" + + +class Hardtanh(Module): + r"""Applies the HardTanh function element-wise. + + HardTanh is defined as: + + .. math:: + \text{HardTanh}(x) = \begin{cases} + \text{max\_val} & \text{ if } x > \text{ max\_val } \\ + \text{min\_val} & \text{ if } x < \text{ min\_val } \\ + x & \text{ otherwise } \\ + \end{cases} + + Args: + min_val: minimum value of the linear region range. Default: -1 + max_val: maximum value of the linear region range. Default: 1 + inplace: can optionally do the operation in-place. Default: ``False`` + + Keyword arguments :attr:`min_value` and :attr:`max_value` + have been deprecated in favor of :attr:`min_val` and :attr:`max_val`. + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Hardtanh.png + + Examples:: + + >>> m = nn.Hardtanh(-2, 2) + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["min_val", "max_val", "inplace"] + + min_val: float + max_val: float + inplace: bool + + def __init__( + self, + min_val: float = -1.0, + max_val: float = 1.0, + inplace: bool = False, + min_value: float | None = None, + max_value: float | None = None, + ) -> None: + super().__init__() + if min_value is not None: + warnings.warn( + "keyword argument `min_value` is deprecated and rename to `min_val`", + FutureWarning, + stacklevel=2, + ) + min_val = min_value + if max_value is not None: + warnings.warn( + "keyword argument `max_value` is deprecated and rename to `max_val`", + FutureWarning, + stacklevel=2, + ) + max_val = max_value + + self.min_val = min_val + self.max_val = max_val + self.inplace = inplace + assert self.max_val > self.min_val + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.hardtanh(input, self.min_val, self.max_val, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"min_val={self.min_val}, max_val={self.max_val}{inplace_str}" + + +class ReLU6(Hardtanh): + r"""Applies the ReLU6 function element-wise. + + .. math:: + \text{ReLU6}(x) = \min(\max(0,x), 6) + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ReLU6.png + + Examples:: + + >>> m = nn.ReLU6() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def __init__(self, inplace: bool = False) -> None: + super().__init__(0.0, 6.0, inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
+ """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class Sigmoid(Module): + r"""Applies the Sigmoid function element-wise. + + .. math:: + \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)} + + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Sigmoid.png + + Examples:: + + >>> m = nn.Sigmoid() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return torch.sigmoid(input) + + +class Hardsigmoid(Module): + r"""Applies the Hardsigmoid function element-wise. + + Hardsigmoid is defined as: + + .. math:: + \text{Hardsigmoid}(x) = \begin{cases} + 0 & \text{if~} x \le -3, \\ + 1 & \text{if~} x \ge +3, \\ + x / 6 + 1 / 2 & \text{otherwise} + \end{cases} + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardsigmoid.png + + Examples:: + + >>> m = nn.Hardsigmoid() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace"] + + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.hardsigmoid(input, self.inplace) + + +class Tanh(Module): + r"""Applies the Hyperbolic Tangent (Tanh) function element-wise. + + Tanh is defined as: + + .. math:: + \text{Tanh}(x) = \tanh(x) = \frac{\exp(x) - \exp(-x)} {\exp(x) + \exp(-x)} + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Tanh.png + + Examples:: + + >>> m = nn.Tanh() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return torch.tanh(input) + + +class SiLU(Module): + r"""Applies the Sigmoid Linear Unit (SiLU) function, element-wise. + + The SiLU function is also known as the swish function. + + .. math:: + \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.} + + .. note:: + See `Gaussian Error Linear Units (GELUs) `_ + where the SiLU (Sigmoid Linear Unit) was originally coined, and see + `Sigmoid-Weighted Linear Units for Neural Network Function Approximation + in Reinforcement Learning `_ and `Swish: + a Self-Gated Activation Function `_ + where the SiLU was experimented with later. + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/SiLU.png + + Examples:: + + >>> m = nn.SiLU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace"] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.silu(input, inplace=self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class Mish(Module): + r"""Applies the Mish function, element-wise. + + Mish: A Self Regularized Non-Monotonic Neural Activation Function. + + .. math:: + \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x)) + + .. note:: + See `Mish: A Self Regularized Non-Monotonic Neural Activation Function `_ + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. 
+ - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Mish.png + + Examples:: + + >>> m = nn.Mish() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace"] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.mish(input, inplace=self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class Hardswish(Module): + r"""Applies the Hardswish function, element-wise. + + Method described in the paper: `Searching for MobileNetV3 `_. + + Hardswish is defined as: + + .. math:: + \text{Hardswish}(x) = \begin{cases} + 0 & \text{if~} x \le -3, \\ + x & \text{if~} x \ge +3, \\ + x \cdot (x + 3) /6 & \text{otherwise} + \end{cases} + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardswish.png + + Examples:: + + >>> m = nn.Hardswish() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace"] + + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.hardswish(input, self.inplace) + + +class ELU(Module): + r"""Applies the Exponential Linear Unit (ELU) function, element-wise. + + Method described in the paper: `Fast and Accurate Deep Network Learning by Exponential Linear + Units (ELUs) `__. + + ELU is defined as: + + .. 
math:: + \text{ELU}(x) = \begin{cases} + x, & \text{ if } x > 0\\ + \alpha * (\exp(x) - 1), & \text{ if } x \leq 0 + \end{cases} + + Args: + alpha: the :math:`\alpha` value for the ELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ELU.png + + Examples:: + + >>> m = nn.ELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["alpha", "inplace"] + alpha: float + inplace: bool + + def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None: + super().__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.elu(input, self.alpha, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"alpha={self.alpha}{inplace_str}" + + +class CELU(Module): + r"""Applies the CELU function element-wise. + + .. math:: + \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1)) + + More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . + + Args: + alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/CELU.png + + Examples:: + + >>> m = nn.CELU() + >>> input = torch.randn(2) + >>> output = m(input) + + .. 
_`Continuously Differentiable Exponential Linear Units`: + https://arxiv.org/abs/1704.07483 + """ + + __constants__ = ["alpha", "inplace"] + alpha: float + inplace: bool + + def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None: + super().__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.celu(input, self.alpha, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"alpha={self.alpha}{inplace_str}" + + +class SELU(Module): + r"""Applies the SELU function element-wise. + + .. math:: + \text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1))) + + with :math:`\alpha = 1.6732632423543772848170429916717` and + :math:`\text{scale} = 1.0507009873554804934193349852946`. + + .. warning:: + When using ``kaiming_normal`` or ``kaiming_normal_`` for initialisation, + ``nonlinearity='linear'`` should be used instead of ``nonlinearity='selu'`` + in order to get `Self-Normalizing Neural Networks`_. + See :func:`torch.nn.init.calculate_gain` for more information. + + More details can be found in the paper `Self-Normalizing Neural Networks`_ . + + Args: + inplace (bool, optional): can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/SELU.png + + Examples:: + + >>> m = nn.SELU() + >>> input = torch.randn(2) + >>> output = m(input) + + .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 + """ + + __constants__ = ["inplace"] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. 
+ """ + return F.selu(input, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class GLU(Module): + r"""Applies the gated linear unit function. + + :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half + of the input matrices and :math:`b` is the second half. + + Args: + dim (int): the dimension on which to split the input. Default: -1 + + Shape: + - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional + dimensions + - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2` + + .. image:: ../scripts/activation_images/GLU.png + + Examples:: + + >>> m = nn.GLU() + >>> input = torch.randn(4, 2) + >>> output = m(input) + """ + + __constants__ = ["dim"] + dim: int + + def __init__(self, dim: int = -1) -> None: + super().__init__() + self.dim = dim + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.glu(input, self.dim) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" + + +class GELU(Module): + r"""Applies the Gaussian Error Linear Units function. + + .. math:: \text{GELU}(x) = x * \Phi(x) + + where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. + + When the approximate argument is 'tanh', Gelu is estimated with: + + .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt{2 / \pi} * (x + 0.044715 * x^3))) + + Args: + approximate (str, optional): the gelu approximation algorithm to use: + ``'none'`` | ``'tanh'``. Default: ``'none'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["approximate"] + approximate: str + + def __init__(self, approximate: str = "none") -> None: + super().__init__() + self.approximate = approximate + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.gelu(input, approximate=self.approximate) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"approximate={repr(self.approximate)}" + + +class Hardshrink(Module): + r"""Applies the Hard Shrinkage (Hardshrink) function element-wise. + + Hardshrink is defined as: + + .. math:: + \text{HardShrink}(x) = + \begin{cases} + x, & \text{ if } x > \lambda \\ + x, & \text{ if } x < -\lambda \\ + 0, & \text{ otherwise } + \end{cases} + + Args: + lambd: the :math:`\lambda` value for the Hardshrink formulation. Default: 0.5 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardshrink.png + + Examples:: + + >>> m = nn.Hardshrink() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["lambd"] + lambd: float + + def __init__(self, lambd: float = 0.5) -> None: + super().__init__() + self.lambd = lambd + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.hardshrink(input, self.lambd) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.lambd}" + + +class LeakyReLU(Module): + r"""Applies the LeakyReLU function element-wise. + + .. math:: + \text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x) + + + or + + .. 
math:: + \text{LeakyReLU}(x) = + \begin{cases} + x, & \text{ if } x \geq 0 \\ + \text{negative\_slope} \times x, & \text{ otherwise } + \end{cases} + + Args: + negative_slope: Controls the angle of the negative slope (which is used for + negative input values). Default: 1e-2 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + .. image:: ../scripts/activation_images/LeakyReLU.png + + Examples:: + + >>> m = nn.LeakyReLU(0.1) + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace", "negative_slope"] + inplace: bool + negative_slope: float + + def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None: + super().__init__() + self.negative_slope = negative_slope + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.leaky_relu(input, self.negative_slope, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"negative_slope={self.negative_slope}{inplace_str}" + + +class LogSigmoid(Module): + r"""Applies the Logsigmoid function element-wise. + + .. math:: + \text{LogSigmoid}(x) = \log\left(\frac{ 1 }{ 1 + \exp(-x)}\right) + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/LogSigmoid.png + + Examples:: + + >>> m = nn.LogSigmoid() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.logsigmoid(input) + + +class Softplus(Module): + r"""Applies the Softplus function element-wise. + + .. 
math:: + \text{Softplus}(x) = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) + + SoftPlus is a smooth approximation to the ReLU function and can be used + to constrain the output of a machine to always be positive. + + For numerical stability the implementation reverts to the linear function + when :math:`input \times \beta > threshold`. + + Args: + beta: the :math:`\beta` value for the Softplus formulation. Default: 1 + threshold: values above this revert to a linear function. Default: 20 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Softplus.png + + Examples:: + + >>> m = nn.Softplus() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["beta", "threshold"] + beta: float + threshold: float + + def __init__(self, beta: float = 1.0, threshold: float = 20.0) -> None: + super().__init__() + self.beta = beta + self.threshold = threshold + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.softplus(input, self.beta, self.threshold) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"beta={self.beta}, threshold={self.threshold}" + + +class Softshrink(Module): + r"""Applies the soft shrinkage function element-wise. + + .. math:: + \text{SoftShrinkage}(x) = + \begin{cases} + x - \lambda, & \text{ if } x > \lambda \\ + x + \lambda, & \text{ if } x < -\lambda \\ + 0, & \text{ otherwise } + \end{cases} + + Args: + lambd: the :math:`\lambda` (must be no less than zero) value for the Softshrink formulation. Default: 0.5 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Softshrink.png + + Examples:: + + >>> m = nn.Softshrink() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["lambd"] + lambd: float + + def __init__(self, lambd: float = 0.5) -> None: + super().__init__() + self.lambd = lambd + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.softshrink(input, self.lambd) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return str(self.lambd) + + +def _check_arg_device(x: torch.Tensor | None) -> bool: + if x is not None: + return x.device.type in [ + "cpu", + "cuda", + torch.utils.backend_registration._privateuse1_backend_name, + ] + return True + + +def _arg_requires_grad(x: torch.Tensor | None) -> bool: + if x is not None: + return x.requires_grad + return False + + +def _is_make_fx_tracing(): + if not torch.jit.is_scripting(): + torch_dispatch_mode_stack = ( + torch.utils._python_dispatch._get_current_dispatch_mode_stack() + ) + # this can be triggered when dynamo inlining the module too. + return ( + any( + type(x) is torch.fx.experimental.proxy_tensor.ProxyTorchDispatchMode + for x in torch_dispatch_mode_stack + ) + or torch.compiler.is_exporting() + ) + else: + return False + + +class MultiheadAttention(Module): + r"""Allows the model to jointly attend to information from different representation subspaces. + + This MultiheadAttention layer implements the original architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build efficient layers from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. 
+ + Multi-Head Attention is defined as: + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,\dots,\text{head}_h)W^O + + where :math:`\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + + ``nn.MultiheadAttention`` will use the optimized implementations of + ``scaled_dot_product_attention()`` when possible. + + In addition to support for the new ``scaled_dot_product_attention()`` + function, for speeding up Inference, MHA will use + fastpath inference with support for Nested Tensors, iff: + + - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor). + - inputs are batched (3D) with ``batch_first==True`` + - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad`` + - training is disabled (using ``.eval()``) + - ``add_bias_kv`` is ``False`` + - ``add_zero_attn`` is ``False`` + - ``kdim`` and ``vdim`` are equal to ``embed_dim`` + - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` + nor ``attn_mask`` is passed + - autocast is disabled + + If the optimized inference fastpath implementation is in use, a + `NestedTensor `_ can be passed for + ``query``/``key``/``value`` to represent padding more efficiently than using a + padding mask. In this case, a `NestedTensor `_ + will be returned, and an additional speedup proportional to the fraction of the input + that is padding can be expected. + + Args: + embed_dim: Total dimension of the model. + num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split + across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). + dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). + bias: If specified, adds bias to input / output projection layers. Default: ``True``. + add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. 
+ add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. + Default: ``False``. + kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). + vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + + Examples:: + + >>> # xdoctest: +SKIP + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + + .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`: + https://arxiv.org/abs/2205.14135 + + """ + + __constants__ = ["batch_first"] + bias_k: torch.Tensor | None + bias_v: torch.Tensor | None + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + device=None, + dtype=None, + ) -> None: + if embed_dim <= 0 or num_heads <= 0: + raise ValueError( + f"embed_dim and num_heads must be greater than 0," + f" got embed_dim={embed_dim} and num_heads={num_heads} instead" + ) + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, ( + "embed_dim must be divisible by num_heads" + ) + + if not self._qkv_same_embed_dim: + self.q_proj_weight = Parameter( + torch.empty((embed_dim, embed_dim), **factory_kwargs) + ) + self.k_proj_weight = Parameter( + torch.empty((embed_dim, self.kdim), **factory_kwargs) + ) + 
self.v_proj_weight = Parameter( + torch.empty((embed_dim, self.vdim), **factory_kwargs) + ) + self.register_parameter("in_proj_weight", None) + else: + self.in_proj_weight = Parameter( + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + ) + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) + else: + self.register_parameter("in_proj_bias", None) + self.out_proj = NonDynamicallyQuantizableLinear( + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self._reset_parameters() + + def _reset_parameters(self) -> None: + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.0) + constant_(self.out_proj.bias, 0.0) + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if "_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True + + super().__setstate__(state) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Tensor | None = None, + need_weights: bool = True, + attn_mask: Tensor | None = None, + average_attn_weights: bool = True, + is_causal: bool = False, + ) -> tuple[Tensor, Tensor | None]: + r"""Compute attention outputs using query, key, and value embeddings. 
+ + Supports optional parameters for padding, masks and attention weights. + + Args: + query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` + or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, + :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. + Queries are compared against key-value pairs to produce the output. + See "Attention Is All You Need" for more details. + key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` + or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, + :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. + See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when + ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source + sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. + See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. + Binary and float masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention`` + and achieve the best performance for MHA. + Default: ``True``. 
+ attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. For a float mask, the mask values will be added to + the attention weight. + If both attn_mask and key_padding_mask are supplied, their types should match. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across + heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an + effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) + is_causal: If specified, applies a causal mask as attention mask. + Default: ``False``. + Warning: + ``is_causal`` provides a hint that ``attn_mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, + :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, + where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the + embedding dimension ``embed_dim``. + - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, + returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. 
If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. + + .. note:: + `batch_first` argument is ignored for unbatched inputs. + """ # noqa: B950 + why_not_fast_path = "" + if ( + (attn_mask is not None and torch.is_floating_point(attn_mask)) + or (key_padding_mask is not None) + and torch.is_floating_point(key_padding_mask) + ): + why_not_fast_path = "floating-point masks are not supported for fast path." + + is_batched = query.dim() == 3 + + key_padding_mask = F._canonical_mask( + mask=key_padding_mask, + mask_name="key_padding_mask", + other_type=F._none_or_dtype(attn_mask), + other_name="attn_mask", + target_type=query.dtype, + ) + + attn_mask = F._canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=query.dtype, + check_other=False, + ) + + is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled() + + if not is_fastpath_enabled: + why_not_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True" + elif not is_batched: + why_not_fast_path = ( + f"input not batched; expected query.dim() of 3 but got {query.dim()}" + ) + elif query is not key or key is not value: + # When lifting this restriction, don't forget to either + # enforce that the dtypes all match or test cases where + # they don't! + why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" + elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: + why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + elif self.in_proj_weight is None: + why_not_fast_path = "in_proj_weight was None" + elif query.dtype != self.in_proj_weight.dtype: + # this case will fail anyway, but at least they'll get a useful error message. 
+ why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + elif self.training: + why_not_fast_path = "training is enabled" + elif (self.num_heads % 2) != 0: + why_not_fast_path = "self.num_heads is not even" + elif not self.batch_first: + why_not_fast_path = "batch_first was not True" + elif self.bias_k is not None: + why_not_fast_path = "self.bias_k was not None" + elif self.bias_v is not None: + why_not_fast_path = "self.bias_v was not None" + elif self.add_zero_attn: + why_not_fast_path = "add_zero_attn was enabled" + elif not self._qkv_same_embed_dim: + why_not_fast_path = "_qkv_same_embed_dim was not True" + elif query.is_nested and ( + key_padding_mask is not None or attn_mask is not None + ): + why_not_fast_path = ( + "supplying both src_key_padding_mask and src_mask at the same time \ + is not supported with NestedTensor input" + ) + elif torch.is_autocast_enabled(): + why_not_fast_path = "autocast is enabled" + + if not why_not_fast_path: + tensor_args = ( + query, + key, + value, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + ) + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. 
+ if torch.overrides.has_torch_function(tensor_args): + why_not_fast_path = "some Tensor argument has_torch_function" + elif _is_make_fx_tracing(): + why_not_fast_path = "we are running make_fx tracing" + elif not all(_check_arg_device(x) for x in tensor_args): + why_not_fast_path = ( + "some Tensor argument's device is neither one of " + f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}" + ) + elif torch.is_grad_enabled() and any( + _arg_requires_grad(x) for x in tensor_args + ): + why_not_fast_path = ( + "grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad" + ) + if not why_not_fast_path: + merged_mask, mask_type = self.merge_masks( + attn_mask, key_padding_mask, query + ) + + if self.in_proj_bias is not None and self.in_proj_weight is not None: + return torch._native_multi_head_attention( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + merged_mask, + need_weights, + average_attn_weights, + mask_type, + ) + + any_nested = query.is_nested or key.is_nested or value.is_nested + assert not any_nested, ( + "MultiheadAttention does not support NestedTensor outside of its fast path. 
" + + f"The fast path was not hit because {why_not_fast_path}" + ) + + if self.batch_first and is_batched: + # make sure that the transpose op does not affect the "is" property + if key is value: + if query is key: + query = key = value = query.transpose(1, 0) + else: + query, key = (x.transpose(1, 0) for x in (query, key)) + value = key + else: + query, key, value = (x.transpose(1, 0) for x in (query, key, value)) + + if not self._qkv_same_embed_dim: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + average_attn_weights=average_attn_weights, + is_causal=is_causal, + ) + else: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + average_attn_weights=average_attn_weights, + is_causal=is_causal, + ) + if self.batch_first and is_batched: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights + + def merge_masks( + self, + attn_mask: Tensor | None, + key_padding_mask: Tensor | None, + query: Tensor, + ) -> tuple[Tensor | None, int | None]: + r"""Determine mask type and combine masks if necessary. 
+ + If only one mask is provided, that mask + and the corresponding mask type will be returned. If both masks are provided, they will be both + expanded to shape ``(batch_size, num_heads, seq_len, seq_len)``, combined with logical ``or`` + and mask type 2 will be returned + Args: + attn_mask: attention mask of shape ``(seq_len, seq_len)``, mask type 0 + key_padding_mask: padding mask of shape ``(batch_size, seq_len)``, mask type 1 + query: query embeddings of shape ``(batch_size, seq_len, embed_dim)`` + Returns: + merged_mask: merged mask + mask_type: merged mask type (0, 1, or 2) + """ + mask_type: int | None = None + merged_mask: Tensor | None = None + + if key_padding_mask is not None: + mask_type = 1 + merged_mask = key_padding_mask + + if attn_mask is not None: + # In this branch query can't be a nested tensor, so it has a shape + batch_size, seq_len, _ = query.shape + mask_type = 2 + + # Always expands attn_mask to 4D + if attn_mask.dim() == 3: + attn_mask_expanded = attn_mask.view(batch_size, -1, seq_len, seq_len) + else: # attn_mask.dim() == 2: + attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand( + batch_size, self.num_heads, -1, -1 + ) + merged_mask = attn_mask_expanded + + if key_padding_mask is not None: + key_padding_mask_expanded = key_padding_mask.view( + batch_size, 1, 1, seq_len + ).expand(-1, self.num_heads, -1, -1) + merged_mask = attn_mask_expanded + key_padding_mask_expanded + + # no attn_mask and no key_padding_mask, returns None, None + return merged_mask, mask_type + + +class PReLU(Module): + r"""Applies the element-wise PReLU function. + + .. math:: + \text{PReLU}(x) = \max(0,x) + a * \min(0,x) + + or + + .. math:: + \text{PReLU}(x) = + \begin{cases} + x, & \text{ if } x \ge 0 \\ + ax, & \text{ otherwise } + \end{cases} + + Here :math:`a` is a learnable parameter. When called without arguments, `nn.PReLU()` uses a single + parameter :math:`a` across all input channels. 
If called with `nn.PReLU(nChannels)`, + a separate :math:`a` is used for each input channel. + + + .. note:: + weight decay should not be used when learning :math:`a` for good performance. + + .. note:: + Channel dim is the 2nd dim of input. When input has dims < 2, then there is + no channel dim and the number of channels = 1. + + Args: + num_parameters (int): number of :math:`a` to learn. + Although it takes an int as input, there is only two values are legitimate: + 1, or the number of channels at input. Default: 1 + init (float): the initial value of :math:`a`. Default: 0.25 + + Shape: + - Input: :math:`( *)` where `*` means, any number of additional + dimensions. + - Output: :math:`(*)`, same shape as the input. + + Attributes: + weight (Tensor): the learnable weights of shape (:attr:`num_parameters`). + + .. image:: ../scripts/activation_images/PReLU.png + + Examples:: + + >>> m = nn.PReLU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["num_parameters"] + num_parameters: int + + def __init__( + self, num_parameters: int = 1, init: float = 0.25, device=None, dtype=None + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + self.num_parameters = num_parameters + super().__init__() + self.init = init + self.weight = Parameter(torch.empty(num_parameters, **factory_kwargs)) + self.reset_parameters() + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + torch.nn.init.constant_(self.weight, self.init) + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.prelu(input, self.weight) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"num_parameters={self.num_parameters}" + + +class Softsign(Module): + r"""Applies the element-wise Softsign function. + + .. 
math:: + \text{SoftSign}(x) = \frac{x}{ 1 + |x|} + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Softsign.png + + Examples:: + + >>> m = nn.Softsign() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.softsign(input) + + +class Tanhshrink(Module): + r"""Applies the element-wise Tanhshrink function. + + .. math:: + \text{Tanhshrink}(x) = x - \tanh(x) + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Tanhshrink.png + + Examples:: + + >>> m = nn.Tanhshrink() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.tanhshrink(input) + + +class Softmin(Module): + r"""Applies the Softmin function to an n-dimensional input Tensor. + + Rescales them so that the elements of the n-dimensional output Tensor + lie in the range `[0, 1]` and sum to 1. + + Softmin is defined as: + + .. math:: + \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)} + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Args: + dim (int): A dimension along which Softmin will be computed (so every slice + along dim will sum to 1). 
+ + Returns: + a Tensor of the same dimension and shape as the input, with + values in the range [0, 1] + + Examples:: + + >>> m = nn.Softmin(dim=1) + >>> input = torch.randn(2, 3) + >>> output = m(input) + """ + + __constants__ = ["dim"] + dim: int | None + + def __init__(self, dim: int | None = None) -> None: + super().__init__() + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "dim"): + self.dim = None + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.softmin(input, self.dim, _stacklevel=5) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" + + +class Softmax(Module): + r"""Applies the Softmax function to an n-dimensional input Tensor. + + Rescales them so that the elements of the n-dimensional output Tensor + lie in the range [0,1] and sum to 1. + + Softmax is defined as: + + .. math:: + \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} + + When the input Tensor is a sparse tensor then the unspecified + values are treated as ``-inf``. + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [0, 1] + + Args: + dim (int): A dimension along which Softmax will be computed (so every slice + along dim will sum to 1). + + .. note:: + This module doesn't work directly with NLLLoss, + which expects the Log to be computed between the Softmax and itself. + Use `LogSoftmax` instead (it's faster and has better numerical properties). 
+ + Examples:: + + >>> m = nn.Softmax(dim=1) + >>> input = torch.randn(2, 3) + >>> output = m(input) + + """ + + __constants__ = ["dim"] + dim: int | None + + def __init__(self, dim: int | None = None) -> None: + super().__init__() + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "dim"): + self.dim = None + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.softmax(input, self.dim, _stacklevel=5) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" + + +class Softmax2d(Module): + r"""Applies SoftMax over features to each spatial location. + + When given an image of ``Channels x Height x Width``, it will + apply `Softmax` to each location :math:`(Channels, h_i, w_j)` + + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`. + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) + + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [0, 1] + + Examples:: + + >>> m = nn.Softmax2d() + >>> # you softmax over the 2nd dimension + >>> input = torch.randn(2, 3, 12, 13) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + if input.dim() not in (3, 4): + raise ValueError( + f"Softmax2d: expected input to be 3D or 4D, got {input.dim()}D instead" + ) + return F.softmax(input, -3, _stacklevel=5) + + +class LogSoftmax(Module): + r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor. + + The LogSoftmax formulation can be simplified as: + + .. math:: + \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right) + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Args: + dim (int): A dimension along which LogSoftmax will be computed. 
+ + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [-inf, 0) + + Examples:: + + >>> m = nn.LogSoftmax(dim=1) + >>> input = torch.randn(2, 3) + >>> output = m(input) + """ + + __constants__ = ["dim"] + dim: int | None + + def __init__(self, dim: int | None = None) -> None: + super().__init__() + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "dim"): + self.dim = None + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.log_softmax(input, self.dim, _stacklevel=5) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/adaptive.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/adaptive.py new file mode 100644 index 0000000000000000000000000000000000000000..4267ed9993bff1ff69d57028308f4a3121ef2050 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/adaptive.py @@ -0,0 +1,339 @@ +# mypy: allow-untyped-defs + +import itertools +from collections import namedtuple +from collections.abc import Sequence + +import torch +import torch.nn.functional as F +from torch import Tensor + +from .container import ModuleList, Sequential +from .linear import Linear +from .module import Module + + +__all__ = ["AdaptiveLogSoftmaxWithLoss"] + +_ASMoutput = namedtuple("_ASMoutput", ["output", "loss"]) + + +class AdaptiveLogSoftmaxWithLoss(Module): + ( + """Efficient softmax approximation. + + As described in + `Efficient softmax approximation for GPUs by Edouard Grave, Armand Joulin, + Moustapha Ciss\u00e9, David Grangier, and Herv\u00e9 J\u00e9gou + `__. +""" + r""" + Adaptive softmax is an approximate strategy for training models with large + output spaces. 
It is most effective when the label distribution is highly + imbalanced, for example in natural language modelling, where the word + frequency distribution approximately follows the `Zipf's law`_. + + Adaptive softmax partitions the labels into several clusters, according to + their frequency. These clusters may contain different number of targets + each. + Additionally, clusters containing less frequent labels assign lower + dimensional embeddings to those labels, which speeds up the computation. + For each minibatch, only clusters for which at least one target is + present are evaluated. + + The idea is that the clusters which are accessed frequently + (like the first one, containing most frequent labels), should also be cheap + to compute -- that is, contain a small number of assigned labels. + + We highly recommend taking a look at the original paper for more details. + + * :attr:`cutoffs` should be an ordered Sequence of integers sorted + in the increasing order. + It controls number of clusters and the partitioning of targets into + clusters. For example setting ``cutoffs = [10, 100, 1000]`` + means that first `10` targets will be assigned + to the 'head' of the adaptive softmax, targets `11, 12, ..., 100` will be + assigned to the first cluster, and targets `101, 102, ..., 1000` will be + assigned to the second cluster, while targets + `1001, 1002, ..., n_classes - 1` will be assigned + to the last, third cluster. + + * :attr:`div_value` is used to compute the size of each additional cluster, + which is given as + :math:`\left\lfloor\frac{\texttt{in\_features}}{\texttt{div\_value}^{idx}}\right\rfloor`, + where :math:`idx` is the cluster index (with clusters + for less frequent words having larger indices, + and indices starting from :math:`1`). + + * :attr:`head_bias` if set to True, adds a bias term to the 'head' of the + adaptive softmax. See paper for details. Set to False in the official + implementation. + + .. 
warning:: + Labels passed as inputs to this module should be sorted according to + their frequency. This means that the most frequent label should be + represented by the index `0`, and the least frequent + label should be represented by the index `n_classes - 1`. + + .. note:: + This module returns a ``NamedTuple`` with ``output`` + and ``loss`` fields. See further documentation for details. + + .. note:: + To compute log-probabilities for all classes, the ``log_prob`` + method can be used. + + Args: + in_features (int): Number of features in the input tensor + n_classes (int): Number of classes in the dataset + cutoffs (Sequence): Cutoffs used to assign targets to their buckets + div_value (float, optional): value used as an exponent to compute sizes + of the clusters. Default: 4.0 + head_bias (bool, optional): If ``True``, adds a bias term to the 'head' of the + adaptive softmax. Default: ``False`` + + Returns: + ``NamedTuple`` with ``output`` and ``loss`` fields: + * **output** is a Tensor of size ``N`` containing computed target + log probabilities for each example + * **loss** is a Scalar representing the computed negative + log likelihood loss + + Shape: + - input: :math:`(N, \texttt{in\_features})` or :math:`(\texttt{in\_features})` + - target: :math:`(N)` or :math:`()` where each value satisfies :math:`0 <= \texttt{target[i]} <= \texttt{n\_classes}` + - output1: :math:`(N)` or :math:`()` + - output2: ``Scalar`` + + .. 
_Zipf's law: https://en.wikipedia.org/wiki/Zipf%27s_law + """ + ) + + in_features: int + n_classes: int + cutoffs: list[int] + div_value: float + head_bias: bool + head: Linear + tail: ModuleList + + def __init__( + self, + in_features: int, + n_classes: int, + cutoffs: Sequence[int], + div_value: float = 4.0, + head_bias: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + cutoffs = list(cutoffs) + + if len(cutoffs) == 0: + raise ValueError("cutoffs should be a sequence of length larger than 0") + + if ( + (cutoffs != sorted(cutoffs)) + or (min(cutoffs) <= 0) + or (max(cutoffs) > (n_classes - 1)) + or (len(set(cutoffs)) != len(cutoffs)) + or any(int(c) != c for c in cutoffs) + ): + raise ValueError( + "cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1" + ) + + self.in_features = in_features + self.n_classes = n_classes + self.cutoffs = cutoffs + [n_classes] + self.div_value = div_value + self.head_bias = head_bias + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + self.head = Linear( + self.in_features, self.head_size, bias=self.head_bias, **factory_kwargs + ) + self.tail = ModuleList() + + for i in range(self.n_clusters): + hsz = int(self.in_features // (self.div_value ** (i + 1))) + osz = self.cutoffs[i + 1] - self.cutoffs[i] + + projection = Sequential( + Linear(self.in_features, hsz, bias=False, **factory_kwargs), + Linear(hsz, osz, bias=False, **factory_kwargs), + ) + + self.tail.append(projection) + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. 
+ """ + self.head.reset_parameters() + for i2h, h2o in self.tail: # type: ignore[misc] + i2h.reset_parameters() # type: ignore[has-type] + h2o.reset_parameters() # type: ignore[has-type] + + def forward(self, input_: Tensor, target_: Tensor) -> _ASMoutput: + """ + Runs the forward pass. + """ + targ_dim = target_.dim() + + if targ_dim == 1: + if input_.size(0) != target_.size(0): + raise RuntimeError( + "Input and target should have the same size in the batch dimension." + ) + if input_.dim() != 2: + raise RuntimeError( + "1D target tensor expects 2D input tensors, " + "but found inputs with size", + input_.size(), + ) + elif targ_dim == 0: + if input_.dim() != 1: + raise RuntimeError( + "0D target tensor expects 1D input tensors, " + "but found inputs with size", + input_.size(), + ) + else: + raise RuntimeError( + "0D or 1D target tensor expected, multi-target not supported" + ) + + is_batched = targ_dim > 0 + input = input_ if is_batched else input_.unsqueeze(0) + target = target_ if is_batched else target_.unsqueeze(0) + + used_rows = 0 + batch_size = target.size(0) + + output = input.new_zeros(batch_size) + gather_inds = target.new_empty(batch_size) + + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + low_idx = cutoff_values[i] + high_idx = cutoff_values[i + 1] + + target_mask = (target >= low_idx) & (target < high_idx) + row_indices = target_mask.nonzero().squeeze() + + if row_indices.numel() == 0: + continue + + if i == 0: + gather_inds.index_copy_(0, row_indices, target[target_mask]) + + else: + relative_target = target[target_mask] - low_idx + input_subset = input.index_select(0, row_indices) + + cluster_output = self.tail[i - 1](input_subset) + cluster_index = self.shortlist_size + i - 1 + + gather_inds.index_fill_(0, row_indices, cluster_index) + cluster_logprob = F.log_softmax(cluster_output, dim=1) + local_logprob = cluster_logprob.gather(1, relative_target.unsqueeze(1)) + output.index_copy_(0, row_indices, 
local_logprob.squeeze(1)) + + used_rows += row_indices.numel() + + if used_rows != batch_size: + raise RuntimeError( + f"Target values should be in [0, {self.n_classes - 1}], " + f"but values in range [{target.min().item()}, {target.max().item()}] " + "were found. " + ) + + head_output = self.head(input) + head_logprob = F.log_softmax(head_output, dim=1) + output += head_logprob.gather(1, gather_inds.unsqueeze(1)).squeeze() + loss = (-output).mean() + + if not is_batched: + output = output.squeeze(0) + + return _ASMoutput(output, loss) + + def _get_full_log_prob(self, input, head_output): + """Given input tensor, and output of ``self.head``, compute the log of the full distribution.""" + out = input.new_empty((head_output.size(0), self.n_classes)) + head_logprob = F.log_softmax(head_output, dim=1) + + out[:, : self.shortlist_size] = head_logprob[:, : self.shortlist_size] + + for i, (start_idx, stop_idx) in enumerate(itertools.pairwise(self.cutoffs)): + cluster_output = self.tail[i](input) + cluster_logprob = F.log_softmax(cluster_output, dim=1) + output_logprob = cluster_logprob + head_logprob[ + :, self.shortlist_size + i + ].unsqueeze(1) + + out[:, start_idx:stop_idx] = output_logprob + + return out + + def log_prob(self, input: Tensor) -> Tensor: + r"""Compute log probabilities for all :math:`\texttt{n\_classes}`. + + Args: + input (Tensor): a minibatch of examples + + Returns: + log-probabilities of for each class :math:`c` + in range :math:`0 <= c <= \texttt{n\_classes}`, where :math:`\texttt{n\_classes}` is a + parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. + + Shape: + - Input: :math:`(N, \texttt{in\_features})` + - Output: :math:`(N, \texttt{n\_classes})` + + """ + head_output = self.head(input) + return self._get_full_log_prob(input, head_output) + + def predict(self, input: Tensor) -> Tensor: + r"""Return the class with the highest probability for each example in the input minibatch. 
+ + This is equivalent to ``self.log_prob(input).argmax(dim=1)``, but is more efficient in some cases. + + Args: + input (Tensor): a minibatch of examples + + Returns: + output (Tensor): a class with the highest probability for each example + + Shape: + - Input: :math:`(N, \texttt{in\_features})` + - Output: :math:`(N)` + """ + head_output = self.head(input) + output = torch.argmax(head_output, dim=1) + not_in_shortlist = output >= self.shortlist_size + all_in_shortlist = not (not_in_shortlist.any()) + + if all_in_shortlist: + return output + + elif not_in_shortlist.all(): + log_prob = self._get_full_log_prob(input, head_output) + return torch.argmax(log_prob, dim=1) + + else: + log_prob = self._get_full_log_prob( + input[not_in_shortlist], head_output[not_in_shortlist] + ) + output[not_in_shortlist] = torch.argmax(log_prob, dim=1) + return output diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/batchnorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/batchnorm.py new file mode 100644 index 0000000000000000000000000000000000000000..40a912b4f05682792b1a3126b6df53230ced88c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/batchnorm.py @@ -0,0 +1,902 @@ +# mypy: allow-untyped-defs +from typing import Any + +import torch +from torch import Tensor +from torch.nn import functional as F, init +from torch.nn.parameter import Parameter, UninitializedBuffer, UninitializedParameter + +from ._functions import SyncBatchNorm as sync_batch_norm +from .lazy import LazyModuleMixin +from .module import Module + + +__all__ = [ + "BatchNorm1d", + "LazyBatchNorm1d", + "BatchNorm2d", + "LazyBatchNorm2d", + "BatchNorm3d", + "LazyBatchNorm3d", + "SyncBatchNorm", +] + + +class _NormBase(Module): + """Common base of _InstanceNorm and _BatchNorm.""" + + _version = 2 + __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"] + num_features: int + 
eps: float + momentum: float | None + affine: bool + track_running_stats: bool + # WARNING: weight and bias purposely not defined here. + # See https://github.com/pytorch/pytorch/issues/39670 + + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float | None = 0.1, + affine: bool = True, + track_running_stats: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.num_features = num_features + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + if self.affine: + self.weight = Parameter(torch.empty(num_features, **factory_kwargs)) + self.bias = Parameter(torch.empty(num_features, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if self.track_running_stats: + self.register_buffer( + "running_mean", torch.zeros(num_features, **factory_kwargs) + ) + self.register_buffer( + "running_var", torch.ones(num_features, **factory_kwargs) + ) + self.running_mean: Tensor | None + self.running_var: Tensor | None + self.register_buffer( + "num_batches_tracked", + torch.tensor( + 0, + dtype=torch.long, + # pyrefly: ignore [bad-argument-type] + **{k: v for k, v in factory_kwargs.items() if k != "dtype"}, + ), + ) + self.num_batches_tracked: Tensor | None + else: + self.register_buffer("running_mean", None) + self.register_buffer("running_var", None) + self.register_buffer("num_batches_tracked", None) + self.reset_parameters() + + def reset_running_stats(self) -> None: + if self.track_running_stats: + # running_mean/running_var/num_batches... 
are registered at runtime depending + # if self.track_running_stats is on + self.running_mean.zero_() # type: ignore[union-attr] + self.running_var.fill_(1) # type: ignore[union-attr] + self.num_batches_tracked.zero_() # type: ignore[union-attr,operator] + + def reset_parameters(self) -> None: + self.reset_running_stats() + if self.affine: + init.ones_(self.weight) + init.zeros_(self.bias) + + def _check_input_dim(self, input): + raise NotImplementedError + + def extra_repr(self): + return ( + "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, " + "track_running_stats={track_running_stats}".format(**self.__dict__) + ) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) -> None: + version = local_metadata.get("version", None) + + if (version is None or version < 2) and self.track_running_stats: + # at version 2: added num_batches_tracked buffer + # this should have a default value of 0 + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key not in state_dict: + state_dict[num_batches_tracked_key] = ( + self.num_batches_tracked + if self.num_batches_tracked is not None + and self.num_batches_tracked.device != torch.device("meta") + else torch.tensor(0, dtype=torch.long) + ) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + +class _BatchNorm(_NormBase): + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float | None = 0.1, + affine: bool = True, + track_running_stats: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + + def forward(self, input: Tensor) -> Tensor: + self._check_input_dim(input) + + # exponential_average_factor is set to self.momentum + # 
(when it is available) only so that it gets updated + # in ONNX graph when this node is exported to ONNX. + if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + # TODO: if statement only here to tell the jit to skip emitting this when it is None + if self.num_batches_tracked is not None: # type: ignore[has-type] + self.num_batches_tracked.add_(1) # type: ignore[has-type] + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / float(self.num_batches_tracked) + else: # use exponential moving average + exponential_average_factor = self.momentum + + r""" + Decide whether the mini-batch stats should be used for normalization rather than the buffers. + Mini-batch stats are used in training mode, and in eval mode when buffers are None. + """ + if self.training: + bn_training = True + else: + bn_training = (self.running_mean is None) and (self.running_var is None) + + r""" + Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be + passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are + used for normalization (i.e. in eval mode when buffers are not None). 
+ """ + return F.batch_norm( + input, + # If buffers are not to be tracked, ensure that they won't be updated + ( + self.running_mean + if not self.training or self.track_running_stats + else None + ), + self.running_var if not self.training or self.track_running_stats else None, + self.weight, + self.bias, + bn_training, + exponential_average_factor, + self.eps, + ) + + +class _LazyNormBase(LazyModuleMixin, _NormBase): + weight: UninitializedParameter # type: ignore[assignment] + bias: UninitializedParameter # type: ignore[assignment] + + def __init__( + self, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + # affine and track_running_stats are hardcoded to False to + # avoid creating tensors that will soon be overwritten. + 0, + eps, + momentum, + False, + False, + **factory_kwargs, + ) + self.affine = affine + self.track_running_stats = track_running_stats + if self.affine: + # pyrefly: ignore [bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + # pyrefly: ignore [bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + if self.track_running_stats: + # pyrefly: ignore [bad-argument-type] + self.running_mean = UninitializedBuffer(**factory_kwargs) + # pyrefly: ignore [bad-argument-type] + self.running_var = UninitializedBuffer(**factory_kwargs) + self.num_batches_tracked = torch.tensor( + 0, + dtype=torch.long, + # pyrefly: ignore [bad-argument-type] + **{k: v for k, v in factory_kwargs.items() if k != "dtype"}, + ) + + def reset_parameters(self) -> None: + # pyrefly: ignore [bad-argument-type] + if not self.has_uninitialized_params() and self.num_features != 0: + super().reset_parameters() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + # pyrefly: ignore [bad-argument-type] + if self.has_uninitialized_params(): 
+ self.num_features = input.shape[1] + if self.affine: + assert isinstance(self.weight, UninitializedParameter) + assert isinstance(self.bias, UninitializedParameter) + self.weight.materialize((self.num_features,)) + self.bias.materialize((self.num_features,)) + if self.track_running_stats: + self.running_mean.materialize( # type:ignore[union-attr] + (self.num_features,) + ) + self.running_var.materialize( # type:ignore[union-attr] + (self.num_features,) + ) + self.reset_parameters() + + +class BatchNorm1d(_BatchNorm): + r"""Applies Batch Normalization over a 2D or 3D input. + + Method described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the number of features or channels of the input). By default, the + elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0. + At train time in the forward pass, the variance is calculated via the biased estimator, + equivalent to ``torch.var(input, correction=0)``. However, the value stored in the + moving average of the variance is calculated via the unbiased estimator, equivalent to + ``torch.var(input, correction=1)``. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. 
note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done over the `C` dimension, computing statistics + on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization. + + Args: + num_features: number of features or channels :math:`C` of the input + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. 
Default: ``True`` + + Shape: + - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size, + :math:`C` is the number of features or channels, and :math:`L` is the sequence length + - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input) + + Examples:: + + >>> # With Learnable Parameters + >>> m = nn.BatchNorm1d(100) + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm1d(100, affine=False) + >>> input = torch.randn(20, 100) + >>> output = m(input) + """ + + def _check_input_dim(self, input) -> None: + if input.dim() != 2 and input.dim() != 3: + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +# pyrefly: ignore [inconsistent-inheritance] +class LazyBatchNorm1d(_LazyNormBase, _BatchNorm): + r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization. + + Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred + from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. 
Default: ``True`` + """ + + cls_to_become = BatchNorm1d # type: ignore[assignment] + + def _check_input_dim(self, input) -> None: + if input.dim() != 2 and input.dim() != 3: + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +class BatchNorm2d(_BatchNorm): + r"""Applies Batch Normalization over a 4D input. + + 4D is a mini-batch of 2D inputs + with additional channel dimension. Method described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set + to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the + standard-deviation is calculated via the biased estimator, equivalent to + ``torch.var(input, correction=0)``. However, the value stored in the moving average of the + standard-deviation is calculated via the unbiased estimator, equivalent to + ``torch.var(input, correction=1)``. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. 
Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done over the `C` dimension, computing statistics + on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, H, W)` + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + + Shape: + - Input: :math:`(N, C, H, W)` + - Output: :math:`(N, C, H, W)` (same shape as input) + + Examples:: + + >>> # With Learnable Parameters + >>> m = nn.BatchNorm2d(100) + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm2d(100, affine=False) + >>> input = torch.randn(20, 100, 35, 45) + >>> output = m(input) + """ + + def _check_input_dim(self, input) -> None: + if input.dim() != 4: + raise ValueError(f"expected 4D input (got {input.dim()}D input)") + + +# pyrefly: ignore [inconsistent-inheritance] +class LazyBatchNorm2d(_LazyNormBase, _BatchNorm): + r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization. 
+ + Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred + from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + """ + + cls_to_become = BatchNorm2d # type: ignore[assignment] + + def _check_input_dim(self, input) -> None: + if input.dim() != 4: + raise ValueError(f"expected 4D input (got {input.dim()}D input)") + + +class BatchNorm3d(_BatchNorm): + r"""Applies Batch Normalization over a 5D input. + + 5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the input size). 
By default, the elements of :math:`\gamma` are set + to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the + standard-deviation is calculated via the biased estimator, equivalent to + ``torch.var(input, correction=0)``. However, the value stored in the moving average of the + standard-deviation is calculated via the unbiased estimator, equivalent to + ``torch.var(input, correction=1)``. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done over the `C` dimension, computing statistics + on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization + or Spatio-temporal Batch Normalization. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, D, H, W)` + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. 
Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + + Shape: + - Input: :math:`(N, C, D, H, W)` + - Output: :math:`(N, C, D, H, W)` (same shape as input) + + Examples:: + + >>> # With Learnable Parameters + >>> m = nn.BatchNorm3d(100) + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm3d(100, affine=False) + >>> input = torch.randn(20, 100, 35, 45, 10) + >>> output = m(input) + """ + + def _check_input_dim(self, input) -> None: + if input.dim() != 5: + raise ValueError(f"expected 5D input (got {input.dim()}D input)") + + +# pyrefly: ignore [inconsistent-inheritance] +class LazyBatchNorm3d(_LazyNormBase, _BatchNorm): + r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization. + + Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred + from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. 
Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + """ + + cls_to_become = BatchNorm3d # type: ignore[assignment] + + def _check_input_dim(self, input) -> None: + if input.dim() != 5: + raise ValueError(f"expected 5D input (got {input.dim()}D input)") + + +class SyncBatchNorm(_BatchNorm): + r"""Applies Batch Normalization over a N-Dimensional input. + + The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over all + mini-batches of the same process groups. :math:`\gamma` and :math:`\beta` + are learnable parameter vectors of size `C` (where `C` is the input size). + By default, the elements of :math:`\gamma` are sampled from + :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0. + The standard-deviation is calculated via the biased estimator, equivalent to + `torch.var(input, correction=0)`. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. 
note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done for each channel in the ``C`` dimension, computing + statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch + Normalization or Spatio-temporal Batch Normalization. + + Currently :class:`SyncBatchNorm` only supports + :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use + :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert + :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping + Network with DDP. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, +)` + eps: a value added to the denominator for numerical stability. + Default: ``1e-5`` + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + process_group: synchronization of stats happen within each process group + individually. 
Default behavior is synchronization across the whole + world + + Shape: + - Input: :math:`(N, C, +)` + - Output: :math:`(N, C, +)` (same shape as input) + + .. note:: + Synchronization of batchnorm statistics occurs only while training, i.e. + synchronization is disabled when ``model.eval()`` is set or if + ``self.training`` is otherwise ``False``. + + Examples:: + + >>> # xdoctest: +SKIP + >>> # With Learnable Parameters + >>> m = nn.SyncBatchNorm(100) + >>> # creating process group (optional) + >>> # ranks is a list of int identifying rank ids. + >>> ranks = list(range(8)) + >>> r1, r2 = ranks[:4], ranks[4:] + >>> # Note: every rank calls into new_group for every + >>> # process group created, even if that rank is not + >>> # part of the group. + >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] + >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group) + >>> input = torch.randn(20, 100, 35, 45, 10) + >>> output = m(input) + + >>> # network is nn.BatchNorm layer + >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group) + >>> # only single gpu per process is currently supported + >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel( + >>> sync_bn_network, + >>> device_ids=[args.local_rank], + >>> output_device=args.local_rank) + """ + + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float | None = 0.1, + affine: bool = True, + track_running_stats: bool = True, + process_group: Any | None = None, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + self.process_group = process_group + + def _check_input_dim(self, input) -> None: + if input.dim() < 2: + raise ValueError(f"expected at least 2D input (got 
{input.dim()}D input)") + + def _check_non_zero_input_channels(self, input) -> None: + if input.size(1) == 0: + raise ValueError( + "SyncBatchNorm number of input channels should be non-zero" + ) + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + self._check_input_dim(input) + self._check_non_zero_input_channels(input) + + # exponential_average_factor is set to self.momentum + # (when it is available) only so that it gets updated + # in ONNX graph when this node is exported to ONNX. + if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + assert self.num_batches_tracked is not None + self.num_batches_tracked.add_(1) + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / self.num_batches_tracked.item() + else: # use exponential moving average + exponential_average_factor = self.momentum + + r""" + Decide whether the mini-batch stats should be used for normalization rather than the buffers. + Mini-batch stats are used in training mode, and in eval mode when buffers are None. + """ + if self.training: + bn_training = True + else: + bn_training = (self.running_mean is None) and (self.running_var is None) + + r""" + Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be + passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are + used for normalization (i.e. in eval mode when buffers are not None). + """ + # If buffers are not to be tracked, ensure that they won't be updated + running_mean = ( + self.running_mean if not self.training or self.track_running_stats else None + ) + running_var = ( + self.running_var if not self.training or self.track_running_stats else None + ) + + # Don't sync batchnorm stats in inference mode (model.eval()). 
+ need_sync = ( + bn_training + and self.training + and torch.distributed.is_available() + and torch.distributed.is_initialized() + ) + if need_sync: + # currently only GPU/PrivateUse1 input is supported + if input.device.type not in [ + "cuda", + "hpu", + "xpu", + torch._C._get_privateuse1_backend_name(), + ]: + raise ValueError( + "SyncBatchNorm expected input tensor to be on GPU or XPU or " + f"{torch._C._get_privateuse1_backend_name()}" + ) + + process_group = torch.distributed.group.WORLD + if self.process_group: + process_group = self.process_group + world_size = torch.distributed.get_world_size(process_group) + need_sync = world_size > 1 + + # fallback to framework BN when synchronization is not necessary + if not need_sync: + return F.batch_norm( + input, + running_mean, + running_var, + self.weight, + self.bias, + bn_training, + exponential_average_factor, + self.eps, + ) + else: + assert bn_training + return sync_batch_norm.apply( + input, + self.weight, + self.bias, + running_mean, + running_var, + self.eps, + exponential_average_factor, + process_group, # type: ignore[possibly-undefined] + world_size, # type: ignore[possibly-undefined] + ) + + @classmethod + def convert_sync_batchnorm(cls, module, process_group=None): + r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers. + + Args: + module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers + process_group (optional): process group to scope synchronization, + default is the whole world + + Returns: + The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm` + layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer, + a new :class:`torch.nn.SyncBatchNorm` layer object will be returned + instead. 
+ + Example:: + + >>> # Network with nn.BatchNorm layer + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> module = torch.nn.Sequential( + >>> torch.nn.Linear(20, 100), + >>> torch.nn.BatchNorm1d(100), + >>> ).cuda() + >>> # creating process group (optional) + >>> # ranks is a list of int identifying rank ids. + >>> ranks = list(range(8)) + >>> r1, r2 = ranks[:4], ranks[4:] + >>> # Note: every rank calls into new_group for every + >>> # process group created, even if that rank is not + >>> # part of the group. + >>> # xdoctest: +SKIP("distributed") + >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] + >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] + >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group) + + """ + module_output = module + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): + module_output = torch.nn.SyncBatchNorm( + module.num_features, + module.eps, + module.momentum, + module.affine, + module.track_running_stats, + process_group, + ) + if module.affine: + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + module_output.training = module.training + if hasattr(module, "qconfig"): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module( + name, cls.convert_sync_batchnorm(child, process_group) + ) + del module + return module_output diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/channelshuffle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/channelshuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..34a48f04f853dd1c458b035635728a122e9cc4d3 --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/channelshuffle.py @@ -0,0 +1,62 @@ +import torch.nn.functional as F +from torch import Tensor + +from .module import Module + + +__all__ = ["ChannelShuffle"] + + +class ChannelShuffle(Module): + r"""Divides and rearranges the channels in a tensor. + + This operation divides the channels in a tensor of shape :math:`(N, C, *)` + into g groups as :math:`(N, \frac{C}{g}, g, *)` and shuffles them, + while retaining the original tensor shape in the final output. + + Args: + groups (int): number of groups to divide channels in. + + Examples:: + + >>> channel_shuffle = nn.ChannelShuffle(2) + >>> input = torch.arange(1, 17, dtype=torch.float32).view(1, 4, 2, 2) + >>> input + tensor([[[[ 1., 2.], + [ 3., 4.]], + [[ 5., 6.], + [ 7., 8.]], + [[ 9., 10.], + [11., 12.]], + [[13., 14.], + [15., 16.]]]]) + >>> output = channel_shuffle(input) + >>> output + tensor([[[[ 1., 2.], + [ 3., 4.]], + [[ 9., 10.], + [11., 12.]], + [[ 5., 6.], + [ 7., 8.]], + [[13., 14.], + [15., 16.]]]]) + """ + + __constants__ = ["groups"] + groups: int + + def __init__(self, groups: int) -> None: + super().__init__() + self.groups = groups + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.channel_shuffle(input, self.groups) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
+ """ + return f"groups={self.groups}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/container.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/container.py new file mode 100644 index 0000000000000000000000000000000000000000..d99151369e18e4d55ef843d6b8c6f4395d6a6453 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/container.py @@ -0,0 +1,1043 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import operator +from collections import abc as container_abcs, OrderedDict +from itertools import chain, islice +from typing import Any, overload, TYPE_CHECKING, TypeVar +from typing_extensions import deprecated, Self + +import torch +from torch._jit_internal import _copy_to_script_wrapper +from torch.nn.parameter import Parameter + +from .module import Module + + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Mapping + + +__all__ = [ + "Container", + "Sequential", + "ModuleList", + "ModuleDict", + "ParameterList", + "ParameterDict", +] + +T = TypeVar("T", bound=Module) +_V = TypeVar("_V") + + +# Copied from torch.nn.modules.module, required for a custom __repr__ for ModuleList +def _addindent(s_, numSpaces): + s = s_.split("\n") + # don't do anything for single-line stuff + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(numSpaces * " ") + line for line in s] + s = "\n".join(s) + s = first + "\n" + s + return s + + +@deprecated( + "`nn.Container` is deprecated. " + "All of it's functionality is now implemented in `nn.Module`. Subclass that instead.", + category=FutureWarning, +) +class Container(Module): + def __init__(self, **kwargs: Any) -> None: + super().__init__() + for key, value in kwargs.items(): + self.add_module(key, value) + + +class Sequential(Module): + r"""A sequential container. + + Modules will be added to it in the order they are passed in the + constructor. 
Alternatively, an ``OrderedDict`` of modules can be + passed in. The ``forward()`` method of ``Sequential`` accepts any + input and forwards it to the first module it contains. It then + "chains" outputs to inputs sequentially for each subsequent module, + finally returning the output of the last module. + + The value a ``Sequential`` provides over manually calling a sequence + of modules is that it allows treating the whole container as a + single module, such that performing a transformation on the + ``Sequential`` applies to each of the modules it stores (which are + each a registered submodule of the ``Sequential``). + + What's the difference between a ``Sequential`` and a + :class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it + sounds like--a list for storing ``Module`` s! On the other hand, + the layers in a ``Sequential`` are connected in a cascading way. + + Example:: + + # Using Sequential to create a small model. When `model` is run, + # input will first be passed to `Conv2d(1,20,5)`. The output of + # `Conv2d(1,20,5)` will be used as the input to the first + # `ReLU`; the output of the first `ReLU` will become the input + # for `Conv2d(20,64,5)`. Finally, the output of + # `Conv2d(20,64,5)` will be used as input to the second `ReLU` + model = nn.Sequential( + nn.Conv2d(1, 20, 5), nn.ReLU(), nn.Conv2d(20, 64, 5), nn.ReLU() + ) + + # Using Sequential with OrderedDict. This is functionally the + # same as the above code + model = nn.Sequential( + OrderedDict( + [ + ("conv1", nn.Conv2d(1, 20, 5)), + ("relu1", nn.ReLU()), + ("conv2", nn.Conv2d(20, 64, 5)), + ("relu2", nn.ReLU()), + ] + ) + ) + """ + + _modules: dict[str, Module] # type: ignore[assignment] + + @overload + def __init__(self, *args: Module) -> None: ... + + @overload + # pyrefly: ignore [inconsistent-overload] + def __init__(self, arg: OrderedDict[str, Module]) -> None: ... 
+ + def __init__(self, *args): + super().__init__() + if len(args) == 1 and isinstance(args[0], OrderedDict): + for key, module in args[0].items(): + self.add_module(key, module) + else: + for idx, module in enumerate(args): + self.add_module(str(idx), module) + + def _get_item_by_idx(self, iterator: Iterable[_V], idx: int) -> _V: + """Get the idx-th item of the iterator.""" + size = len(self) + idx = operator.index(idx) + if not -size <= idx < size: + raise IndexError(f"index {idx} is out of range") + idx %= size + return next(islice(iterator, idx, None)) + + @_copy_to_script_wrapper + def __getitem__(self, idx: slice | int) -> Sequential | Module: + if isinstance(idx, slice): + return self.__class__(OrderedDict(list(self._modules.items())[idx])) + else: + return self._get_item_by_idx(self._modules.values(), idx) + + def __setitem__(self, idx: int, module: Module) -> None: + key: str = self._get_item_by_idx(self._modules.keys(), idx) + return setattr(self, key, module) + + def __delitem__(self, idx: slice | int) -> None: + if isinstance(idx, slice): + for key in list(self._modules.keys())[idx]: + delattr(self, key) + else: + key = self._get_item_by_idx(self._modules.keys(), idx) + delattr(self, key) + # To preserve numbering + str_indices = [str(i) for i in range(len(self._modules))] + self._modules = OrderedDict( + zip(str_indices, self._modules.values(), strict=True) + ) + + @_copy_to_script_wrapper + def __len__(self) -> int: + return len(self._modules) + + def __add__(self, other) -> Sequential: + if isinstance(other, Sequential): + ret = Sequential() + for layer in self: + ret.append(layer) + for layer in other: + ret.append(layer) + return ret + else: + raise ValueError( + "add operator supports only objects " + f"of Sequential class, but {str(type(other))} is given." + ) + + def pop(self, key: int | slice) -> Module: + """ + Pop ``key`` from self. 
+ """ + v = self[key] + del self[key] + return v + + def __iadd__(self, other) -> Self: + if isinstance(other, Sequential): + offset = len(self) + for i, module in enumerate(other): + self.add_module(str(i + offset), module) + return self + else: + raise ValueError( + "add operator supports only objects " + f"of Sequential class, but {str(type(other))} is given." + ) + + def __mul__(self, other: int) -> Sequential: + if not isinstance(other, int): + raise TypeError( + f"unsupported operand type(s) for *: {type(self)} and {type(other)}" + ) + elif other <= 0: + raise ValueError( + f"Non-positive multiplication factor {other} for {type(self)}" + ) + else: + combined = Sequential() + offset = 0 + for _ in range(other): + for module in self: + combined.add_module(str(offset), module) + offset += 1 + return combined + + def __rmul__(self, other: int) -> Sequential: + return self.__mul__(other) + + def __imul__(self, other: int) -> Self: + if not isinstance(other, int): + raise TypeError( + f"unsupported operand type(s) for *: {type(self)} and {type(other)}" + ) + elif other <= 0: + raise ValueError( + f"Non-positive multiplication factor {other} for {type(self)}" + ) + else: + len_original = len(self) + offset = len(self) + for _ in range(other - 1): + for i in range(len_original): + self.add_module(str(i + offset), self._modules[str(i)]) + offset += len_original + return self + + @_copy_to_script_wrapper + def __dir__(self) -> list[str]: + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + @_copy_to_script_wrapper + def __iter__(self) -> Iterator[Module]: + return iter(self._modules.values()) + + # NB: We can't really type check this function as the type of input + # may change dynamically (as is tested in + # TestScript.test_sequential_intermediary_types). Cannot annotate + # with Any as TorchScript expects a more precise type + def forward(self, input): + """ + Runs the forward pass. 
+ """ + for module in self: + input = module(input) + return input + + def append(self, module: Module) -> Self: + r"""Append a given module to the end. + + Args: + module (nn.Module): module to append + + Example:: + + >>> import torch.nn as nn + >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3)) + >>> n.append(nn.Linear(3, 4)) + Sequential( + (0): Linear(in_features=1, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=3, bias=True) + (2): Linear(in_features=3, out_features=4, bias=True) + ) + + """ + self.add_module(str(len(self)), module) + return self + + def insert(self, index: int, module: Module) -> Self: + """ + Inserts a module into the Sequential container at the specified index. + + Args: + index (int): The index to insert the module. + module (Module): The module to be inserted. + + Example:: + + >>> import torch.nn as nn + >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3)) + >>> n.insert(0, nn.Linear(3, 4)) + Sequential( + (0): Linear(in_features=3, out_features=4, bias=True) + (1): Linear(in_features=1, out_features=2, bias=True) + (2): Linear(in_features=2, out_features=3, bias=True) + ) + + """ + if not isinstance(module, Module): + raise AssertionError(f"module should be of type: {Module}") + n = len(self._modules) + if not (-n <= index <= n): + raise IndexError(f"Index out of range: {index}") + if index < 0: + index += n + for i in range(n, index, -1): + self._modules[str(i)] = self._modules[str(i - 1)] + self._modules[str(index)] = module + return self + + def extend(self, sequential: Iterable[Module]) -> Self: + """ + Extends the current Sequential container with layers from another Sequential container. + + Args: + sequential (Sequential): A Sequential container whose layers will be added to the current container. 
+ + Example:: + + >>> import torch.nn as nn + >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3)) + >>> other = nn.Sequential(nn.Linear(3, 4), nn.Linear(4, 5)) + >>> n.extend(other) # or `n + other` + Sequential( + (0): Linear(in_features=1, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=3, bias=True) + (2): Linear(in_features=3, out_features=4, bias=True) + (3): Linear(in_features=4, out_features=5, bias=True) + ) + + """ + for layer in sequential: + self.append(layer) + return self + + +class ModuleList(Module): + r"""Holds submodules in a list. + + :class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but + modules it contains are properly registered, and will be visible by all + :class:`~torch.nn.Module` methods. + + Args: + modules (iterable, optional): an iterable of modules to add + + Example:: + + class MyModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)]) + + def forward(self, x): + # ModuleList can act as an iterable, or be indexed using ints + for i, l in enumerate(self.linears): + x = self.linears[i // 2](x) + l(x) + return x + """ + + _modules: dict[str, Module] # type: ignore[assignment] + + def __init__(self, modules: Iterable[Module] | None = None) -> None: + super().__init__() + if modules is not None: + self += modules + + def _get_abs_string_index(self, idx): + """Get the absolute index for the list of modules.""" + idx = operator.index(idx) + if not (-len(self) <= idx < len(self)): + raise IndexError(f"index {idx} is out of range") + if idx < 0: + idx += len(self) + return str(idx) + + @overload + def __getitem__(self, idx: slice) -> ModuleList: ... + + @overload + def __getitem__(self, idx: int) -> Module: ... 
+ + @_copy_to_script_wrapper + def __getitem__(self, idx: int | slice) -> Module | ModuleList: + if isinstance(idx, slice): + return self.__class__(list(self._modules.values())[idx]) + else: + return self._modules[self._get_abs_string_index(idx)] + + def __setitem__(self, idx: int, module: Module) -> None: + idx = self._get_abs_string_index(idx) + return setattr(self, str(idx), module) + + def __delitem__(self, idx: int | slice) -> None: + if isinstance(idx, slice): + for k in range(len(self._modules))[idx]: + delattr(self, str(k)) + else: + delattr(self, self._get_abs_string_index(idx)) + # To preserve numbering, self._modules is being reconstructed with modules after deletion + str_indices = [str(i) for i in range(len(self._modules))] + self._modules = OrderedDict( + zip(str_indices, self._modules.values(), strict=True) + ) + + @_copy_to_script_wrapper + def __len__(self) -> int: + return len(self._modules) + + @_copy_to_script_wrapper + def __iter__(self) -> Iterator[Module]: + return iter(self._modules.values()) + + def __iadd__(self, modules: Iterable[Module]) -> Self: + return self.extend(modules) + + def __add__(self, other: Iterable[Module]) -> ModuleList: + combined = ModuleList() + for i, module in enumerate(chain(self, other)): + combined.add_module(str(i), module) + return combined + + def __repr__(self) -> str: + """Return a custom repr for ModuleList that compresses repeated module representations.""" + list_of_reprs = [repr(item) for item in self] + if len(list_of_reprs) == 0: + return self._get_name() + "()" + + start_end_indices = [[0, 0]] + repeated_blocks = [list_of_reprs[0]] + for i, r in enumerate(list_of_reprs[1:], 1): + if r == repeated_blocks[-1]: + start_end_indices[-1][1] += 1 + continue + + start_end_indices.append([i, i]) + repeated_blocks.append(r) + + lines = [] + main_str = self._get_name() + "(" + for (start_id, end_id), b in zip( + start_end_indices, repeated_blocks, strict=True + ): + local_repr = f"({start_id}): {b}" # default 
repr + + if start_id != end_id: + n = end_id - start_id + 1 + local_repr = f"({start_id}-{end_id}): {n} x {b}" + + local_repr = _addindent(local_repr, 2) + lines.append(local_repr) + + main_str += "\n " + "\n ".join(lines) + "\n" + main_str += ")" + return main_str + + @_copy_to_script_wrapper + def __dir__(self) -> list[str]: + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def insert(self, index: int, module: Module) -> None: + r"""Insert a given module before a given index in the list. + + Args: + index (int): index to insert. + module (nn.Module): module to insert + """ + for i in range(len(self._modules), index, -1): + self._modules[str(i)] = self._modules[str(i - 1)] + self._modules[str(index)] = module + + def append(self, module: Module) -> Self: + r"""Append a given module to the end of the list. + + Args: + module (nn.Module): module to append + """ + self.add_module(str(len(self)), module) + return self + + def pop(self, key: int | slice) -> Module: + v = self[key] + del self[key] + return v + + def extend(self, modules: Iterable[Module]) -> Self: + r"""Append modules from a Python iterable to the end of the list. + + Args: + modules (iterable): iterable of modules to append + """ + if not isinstance(modules, container_abcs.Iterable): + raise TypeError( + "ModuleList.extend should be called with an " + "iterable, but got " + type(modules).__name__ + ) + offset = len(self) + for i, module in enumerate(modules): + self.add_module(str(offset + i), module) + return self + + # remove forward altogether to fallback on Module's _forward_unimplemented + + +class ModuleDict(Module): + r"""Holds submodules in a dictionary. + + :class:`~torch.nn.ModuleDict` can be indexed like a regular Python dictionary, + but modules it contains are properly registered, and will be visible by all + :class:`~torch.nn.Module` methods. 
+ + :class:`~torch.nn.ModuleDict` is an **ordered** dictionary that respects + + * the order of insertion, and + + * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged + ``OrderedDict``, ``dict`` (started from Python 3.6) or another + :class:`~torch.nn.ModuleDict` (the argument to + :meth:`~torch.nn.ModuleDict.update`). + + Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping + types does not preserve the order of the merged mapping. + + Args: + modules (iterable, optional): a mapping (dictionary) of (string: module) + or an iterable of key-value pairs of type (string, module) + + Example:: + + class MyModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.choices = nn.ModuleDict( + {"conv": nn.Conv2d(10, 10, 3), "pool": nn.MaxPool2d(3)} + ) + self.activations = nn.ModuleDict( + [["lrelu", nn.LeakyReLU()], ["prelu", nn.PReLU()]] + ) + + def forward(self, x, choice, act): + x = self.choices[choice](x) + x = self.activations[act](x) + return x + """ + + _modules: dict[str, Module] # type: ignore[assignment] + + def __init__(self, modules: Mapping[str, Module] | None = None) -> None: + super().__init__() + if modules is not None: + self.update(modules) + + @_copy_to_script_wrapper + def __getitem__(self, key: str) -> Module: + return self._modules[key] + + def __setitem__(self, key: str, module: Module) -> None: + self.add_module(key, module) + + def __delitem__(self, key: str) -> None: + del self._modules[key] + + @_copy_to_script_wrapper + def __len__(self) -> int: + return len(self._modules) + + @_copy_to_script_wrapper + def __iter__(self) -> Iterator[str]: + return iter(self._modules) + + @_copy_to_script_wrapper + def __contains__(self, key: str) -> bool: + return key in self._modules + + def clear(self) -> None: + """Remove all items from the ModuleDict.""" + self._modules.clear() + + def pop(self, key: str) -> Module: + r"""Remove key from the ModuleDict and return its module. 
+ + Args: + key (str): key to pop from the ModuleDict + """ + v = self[key] + del self[key] + return v + + @_copy_to_script_wrapper + def keys(self) -> container_abcs.KeysView[str]: + r"""Return an iterable of the ModuleDict keys.""" + return self._modules.keys() + + @_copy_to_script_wrapper + def items(self) -> container_abcs.ItemsView[str, Module]: + r"""Return an iterable of the ModuleDict key/value pairs.""" + return self._modules.items() + + @_copy_to_script_wrapper + def values(self) -> container_abcs.ValuesView[Module]: + r"""Return an iterable of the ModuleDict values.""" + return self._modules.values() + + def update(self, modules: Mapping[str, Module]) -> None: + r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys. + + .. note:: + If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or + an iterable of key-value pairs, the order of new elements in it is preserved. + + Args: + modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`, + or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`) + """ + if not isinstance(modules, container_abcs.Iterable): + raise TypeError( + "ModuleDict.update should be called with an " + "iterable of key/value pairs, but got " + type(modules).__name__ + ) + + if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)): + for key, module in modules.items(): + self[key] = module + else: + # modules here can be a list with two items + for j, m in enumerate(modules): + if not isinstance(m, container_abcs.Iterable): + raise TypeError( + "ModuleDict update sequence element " + "#" + str(j) + " should be Iterable; is" + type(m).__name__ + ) + # pyrefly: ignore [bad-argument-type] + if not len(m) == 2: + raise ValueError( + "ModuleDict update sequence element " + # pyrefly: ignore [bad-argument-type] + "#" + str(j) + " has length " + str(len(m)) + "; 2 is required" + ) + # modules can be Mapping 
(what it's typed at), or a list: [(name1, module1), (name2, module2)] + # that's too cumbersome to type correctly with overloads, so we add an ignore here + self[m[0]] = m[1] # type: ignore[assignment] + + # remove forward altogether to fallback on Module's _forward_unimplemented + + +class ParameterList(Module): + r"""Holds parameters in a list. + + :class:`~torch.nn.ParameterList` can be used like a regular Python + list, but Tensors that are :class:`~torch.nn.Parameter` are properly registered, + and will be visible by all :class:`~torch.nn.Module` methods. + + Note that the constructor, assigning an element of the list, the + :meth:`~torch.nn.ParameterList.append` method and the :meth:`~torch.nn.ParameterList.extend` + method will convert any :class:`~torch.Tensor` into :class:`~torch.nn.Parameter`. + + Args: + parameters (iterable, optional): an iterable of elements to add to the list. + + Example:: + + class MyModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.params = nn.ParameterList( + [nn.Parameter(torch.randn(10, 10)) for i in range(10)] + ) + + def forward(self, x): + # ParameterList can act as an iterable, or be indexed using ints + for i, p in enumerate(self.params): + x = self.params[i // 2].mm(x) + p.mm(x) + return x + """ + + def __init__(self, values: Iterable[Any] | None = None) -> None: + super().__init__() + self._size = 0 + if values is not None: + self += values + + def _get_abs_string_index(self, idx): + """Get the absolute index for the list of modules.""" + idx = operator.index(idx) + if not (-len(self) <= idx < len(self)): + raise IndexError(f"index {idx} is out of range") + if idx < 0: + idx += len(self) + return str(idx) + + @overload + def __getitem__(self, idx: int) -> Any: ... + + @overload + # pyrefly: ignore [inconsistent-overload] + def __getitem__(self: T, idx: slice) -> T: ... 
+ + def __getitem__(self, idx): + if isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + out = self.__class__() + for i in range(start, stop, step): + out.append(self[i]) + return out + else: + idx = self._get_abs_string_index(idx) + return getattr(self, str(idx)) + + def __setitem__(self, idx: int, param: Any) -> None: + # Note that all other function that add an entry to the list part of + # the ParameterList end up here. So this is the only place where we need + # to wrap things into Parameter if needed. + # Objects added via setattr() are not in the list part and thus won't + # call into this function. + idx = self._get_abs_string_index(idx) + if isinstance(param, torch.Tensor) and not isinstance(param, Parameter): + param = Parameter(param) + return setattr(self, str(idx), param) + + def __len__(self) -> int: + return self._size + + def __iter__(self) -> Iterator[Any]: + return iter(self[i] for i in range(len(self))) + + def __iadd__(self, parameters: Iterable[Any]) -> Self: + return self.extend(parameters) + + def __dir__(self) -> list[str]: + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def append(self, value: Any) -> Self: + """Append a given value at the end of the list. + + Args: + value (Any): value to append + """ + new_idx = len(self) + self._size += 1 + self[new_idx] = value + return self + + def extend(self, values: Iterable[Any]) -> Self: + """Append values from a Python iterable to the end of the list. 
+ + Args: + values (iterable): iterable of values to append + """ + # Tensor is an iterable but we never want to unpack it here + if not isinstance(values, container_abcs.Iterable) or isinstance( + values, torch.Tensor + ): + raise TypeError( + "ParameterList.extend should be called with an " + "iterable, but got " + type(values).__name__ + ) + for value in values: + self.append(value) + return self + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + child_lines = [] + for k, p in enumerate(self): + if isinstance(p, torch.Tensor): + size_str = "x".join(str(size) for size in p.size()) + if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]: + device_str = f" ({p.device})" + else: + device_str = "" + parastr = "{} containing: [{} of size {}{}]".format( + "Parameter" if isinstance(p, Parameter) else "Tensor", + p.dtype, + size_str, + device_str, + ) + # pyrefly: ignore [bad-argument-type] + child_lines.append(" (" + str(k) + "): " + parastr) + else: + child_lines.append( + # pyrefly: ignore [bad-argument-type] + " (" + str(k) + "): Object of type: " + type(p).__name__ + ) + + tmpstr = "\n".join(child_lines) + return tmpstr + + def __call__(self, *args, **kwargs): + raise RuntimeError("ParameterList should not be called.") + + +class ParameterDict(Module): + r"""Holds parameters in a dictionary. + + ParameterDict can be indexed like a regular Python dictionary, but Parameters it + contains are properly registered, and will be visible by all Module methods. + Other objects are treated as would be done by a regular Python dictionary + + :class:`~torch.nn.ParameterDict` is an **ordered** dictionary. + :meth:`~torch.nn.ParameterDict.update` with other unordered mapping + types (e.g., Python's plain ``dict``) does not preserve the order of the + merged mapping. On the other hand, ``OrderedDict`` or another :class:`~torch.nn.ParameterDict` + will preserve their ordering. 
+ + Note that the constructor, assigning an element of the dictionary and the + :meth:`~torch.nn.ParameterDict.update` method will convert any :class:`~torch.Tensor` into + :class:`~torch.nn.Parameter`. + + Args: + values (iterable, optional): a mapping (dictionary) of + (string : Any) or an iterable of key-value pairs + of type (string, Any) + + Example:: + + class MyModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.params = nn.ParameterDict( + { + "left": nn.Parameter(torch.randn(5, 10)), + "right": nn.Parameter(torch.randn(5, 10)), + } + ) + + def forward(self, x, choice): + x = self.params[choice].mm(x) + return x + """ + + def __init__(self, parameters: Any = None) -> None: + super().__init__() + self._keys: dict[str, None] = {} + if parameters is not None: + self.update(parameters) + + def _key_to_attr(self, key: str) -> str: + if not isinstance(key, str): + raise TypeError( + "Index given to ParameterDict cannot be used as a key as it is " + f"not a string (type is '{type(key).__name__}'). Open an issue on " + "github if you need non-string keys." + ) + else: + # Use the key as-is so that `.named_parameters()` returns the right thing + return key + + def __getitem__(self, key: str) -> Any: + attr = self._key_to_attr(key) + return getattr(self, attr) + + def __setitem__(self, key: str, value: Any) -> None: + # Note that all other function that add an entry to the dictionary part of + # the ParameterDict end up here. So this is the only place where we need + # to wrap things into Parameter if needed. + # Objects added via setattr() are not in the dictionary part and thus won't + # call into this function. 
+ self._keys[key] = None + attr = self._key_to_attr(key) + if isinstance(value, torch.Tensor) and not isinstance(value, Parameter): + value = Parameter(value) + setattr(self, attr, value) + + def __delitem__(self, key: str) -> None: + del self._keys[key] + attr = self._key_to_attr(key) + delattr(self, attr) + + def __len__(self) -> int: + return len(self._keys) + + def __iter__(self) -> Iterator[str]: + return iter(self._keys) + + def __reversed__(self) -> Iterator[str]: + return reversed(self._keys) + + def copy(self) -> ParameterDict: + """Return a copy of this :class:`~torch.nn.ParameterDict` instance.""" + # We have to use an OrderedDict because the ParameterDict constructor + # behaves differently on plain dict vs OrderedDict + return ParameterDict(OrderedDict((k, self[k]) for k in self._keys)) + + def __contains__(self, key: str) -> bool: + return key in self._keys + + def setdefault(self, key: str, default: Any | None = None) -> Any: + """Set the default for a key in the Parameterdict. + + If key is in the ParameterDict, return its value. + If not, insert `key` with a parameter `default` and return `default`. + `default` defaults to `None`. + + Args: + key (str): key to set default for + default (Any): the parameter set to the key + """ + if key not in self: + self[key] = default + return self[key] + + def clear(self) -> None: + """Remove all items from the ParameterDict.""" + for k in self._keys.copy(): + del self[k] + + def pop(self, key: str) -> Any: + r"""Remove key from the ParameterDict and return its parameter. 
def update(self, parameters: Mapping[str, Any] | ParameterDict) -> None:
    r"""Update the :class:`~torch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys.

    .. note::
        If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or
        an iterable of key-value pairs, the order of new elements in it is preserved.
        Any other mapping (for example a plain ``dict``) is inserted in sorted order.

    Args:
        parameters (iterable): a mapping (dictionary) from string to
            :class:`~torch.nn.Parameter`, or an iterable of
            key-value pairs of type (string, :class:`~torch.nn.Parameter`)

    Raises:
        TypeError: if ``parameters`` is not iterable, or if an element of a
            non-mapping iterable is itself not iterable.
        ValueError: if an element of a non-mapping iterable does not contain
            exactly two items.
    """
    if not isinstance(parameters, container_abcs.Iterable):
        raise TypeError(
            "ParameterDict.update should be called with an "
            "iterable of key/value pairs, but got " + type(parameters).__name__
        )

    if isinstance(parameters, (OrderedDict, ParameterDict)):
        # Ordered inputs: keep the caller's ordering.
        for key, parameter in parameters.items():
            self[key] = parameter
    elif isinstance(parameters, container_abcs.Mapping):
        # Other mappings are inserted in sorted order.
        for key, parameter in sorted(parameters.items()):
            self[key] = parameter
    else:
        for j, p in enumerate(parameters):
            if not isinstance(p, container_abcs.Iterable):
                raise TypeError(
                    f"ParameterDict update sequence element #{j} "
                    f"should be Iterable; is {type(p).__name__}"
                )
            # pyrefly: ignore [bad-argument-type]
            if len(p) != 2:
                raise ValueError(
                    # pyrefly: ignore [bad-argument-type]
                    f"ParameterDict update sequence element #{j} "
                    f"has length {len(p)}; 2 is required"
                )
            # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment
            self[p[0]] = p[1]  # type: ignore[assignment]
def __or__(self, other: ParameterDict) -> ParameterDict:
    """Return ``self | other`` as a new container; ``self`` is not modified.

    The result starts as ``self.copy()`` and is then updated with ``other``,
    so entries from ``other`` overwrite entries with the same key.
    """
    merged = self.copy()
    merged.update(other)
    return merged
+ * At groups=2, the operation becomes equivalent to having two conv + layers side by side, each seeing half the input channels + and producing half the output channels, and both subsequently + concatenated. + * At groups= :attr:`in_channels`, each input channel is convolved with + its own set of filters (of size + :math:`\frac{\text{out\_channels}}{\text{in\_channels}}`).""", + "depthwise_separable_note": r"""When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also known as a "depthwise convolution". + + In other words, for an input of size :math:`(N, C_{in}, L_{in})`, + a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments + :math:`(C_\text{in}=C_\text{in}, C_\text{out}=C_\text{in} \times \text{K}, ..., \text{groups}=C_\text{in})`.""", +} # noqa: B950 + + +class _ConvNd(Module): + __constants__ = [ + "stride", + "padding", + "dilation", + "groups", + "padding_mode", + "output_padding", + "in_channels", + "out_channels", + "kernel_size", + ] + __annotations__ = {"bias": Optional[torch.Tensor]} + + def _conv_forward( # type: ignore[empty-body] + self, input: Tensor, weight: Tensor, bias: Tensor | None + ) -> Tensor: ... + + in_channels: int + _reversed_padding_repeated_twice: list[int] + out_channels: int + kernel_size: tuple[int, ...] + stride: tuple[int, ...] + padding: str | tuple[int, ...] + dilation: tuple[int, ...] + transposed: bool + output_padding: tuple[int, ...] 
+ groups: int + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] + weight: Tensor + bias: Tensor | None + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: tuple[int, ...], + stride: tuple[int, ...], + padding: str | tuple[int, ...], + dilation: tuple[int, ...], + transposed: bool, + output_padding: tuple[int, ...], + groups: int, + bias: bool, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"], + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if groups <= 0: + raise ValueError("groups must be a positive integer") + if in_channels % groups != 0: + raise ValueError("in_channels must be divisible by groups") + if out_channels % groups != 0: + raise ValueError("out_channels must be divisible by groups") + valid_padding_strings = {"same", "valid"} + if isinstance(padding, str): + if padding not in valid_padding_strings: + raise ValueError( + f"Invalid padding string {padding!r}, should be one of {valid_padding_strings}" + ) + if padding == "same" and any(s != 1 for s in stride): + raise ValueError( + "padding='same' is not supported for strided convolutions" + ) + + valid_padding_modes = {"zeros", "reflect", "replicate", "circular"} + if padding_mode not in valid_padding_modes: + raise ValueError( + f"padding_mode must be one of {valid_padding_modes}, but got padding_mode='{padding_mode}'" + ) + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.transposed = transposed + self.output_padding = output_padding + self.groups = groups + self.padding_mode = padding_mode + # `_reversed_padding_repeated_twice` is the padding to be passed to + # `F.pad` if needed (e.g., for non-zero padding types that are + # implemented as two ops: padding + conv). `F.pad` accepts paddings in + # reverse order than the dimension. 
+ if isinstance(self.padding, str): + self._reversed_padding_repeated_twice = [0, 0] * len(kernel_size) + if padding == "same": + for d, k, i in zip( + dilation, + kernel_size, + range(len(kernel_size) - 1, -1, -1), + strict=False, + ): + total_padding = d * (k - 1) + left_pad = total_padding // 2 + self._reversed_padding_repeated_twice[2 * i] = left_pad + self._reversed_padding_repeated_twice[2 * i + 1] = ( + total_padding - left_pad + ) + else: + self._reversed_padding_repeated_twice = _reverse_repeat_tuple( + self.padding, 2 + ) + + if transposed: + self.weight = Parameter( + torch.empty( + (in_channels, out_channels // groups, *kernel_size), + **factory_kwargs, + ) + ) + else: + self.weight = Parameter( + torch.empty( + (out_channels, in_channels // groups, *kernel_size), + **factory_kwargs, + ) + ) + if bias: + self.bias = Parameter(torch.empty(out_channels, **factory_kwargs)) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with + # uniform(-1/sqrt(k), 1/sqrt(k)), where k = weight.size(1) * prod(*kernel_size) + # For more details see: https://github.com/pytorch/pytorch/issues/15314#issuecomment-477448573 + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + init.uniform_(self.bias, -bound, bound) + + def extra_repr(self): + s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}" + if self.padding != (0,) * len(self.padding): + s += ", padding={padding}" + if self.dilation != (1,) * len(self.dilation): + s += ", dilation={dilation}" + if self.output_padding != (0,) * len(self.output_padding): + s += ", output_padding={output_padding}" + if self.groups != 1: + s += ", groups={groups}" + if self.bias is None: + s += ", bias=False" + if self.padding_mode != "zeros": + 
s += ", padding_mode={padding_mode}" + return s.format(**self.__dict__) + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "padding_mode"): + self.padding_mode = "zeros" + + +class Conv1d(_ConvNd): + __doc__ = ( + r"""Applies a 1D convolution over an input signal composed of several input + planes. + + In the simplest case, the output value of the layer with input size + :math:`(N, C_{\text{in}}, L)` and output :math:`(N, C_{\text{out}}, L_{\text{out}})` can be + precisely described as: + + .. math:: + \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) + + \sum_{k = 0}^{C_{in} - 1} \text{weight}(C_{\text{out}_j}, k) + \star \text{input}(N_i, k) + + where :math:`\star` is the valid `cross-correlation`_ operator, + :math:`N` is a batch size, :math:`C` denotes a number of channels, + :math:`L` is a length of signal sequence. + """ + + r""" + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + * :attr:`stride` controls the stride for the cross-correlation, a single + number or a one-element tuple. + + * :attr:`padding` controls the amount of padding applied to the input. It + can be either a string {{'valid', 'same'}} or a tuple of ints giving the + amount of implicit padding applied on both sides. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also + known as the \u00e0 trous algorithm. It is harder to describe, but this `link`_ + has a nice visualization of what :attr:`dilation` does. +""" + r""" + {groups_note} + + Note: + {depthwise_separable_note} + Note: + {cudnn_reproducibility_note} + + Note: + ``padding='valid'`` is the same as no padding. ``padding='same'`` pads + the input so the output has the shape as the input. However, this mode + doesn't support any stride values other than 1. + + Note: + This module supports complex data types i.e. 
``complex32, complex64, complex128``. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel + elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + + """.format(**reproducibility_notes, **convolution_notes) + + r""" + + Shape: + - Input: :math:`(N, C_{in}, L_{in})` or :math:`(C_{in}, L_{in})` + - Output: :math:`(N, C_{out}, L_{out})` or :math:`(C_{out}, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor\frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor + + Attributes: + weight (Tensor): the learnable weights of the module of shape + :math:`(\text{out\_channels}, + \frac{\text{in\_channels}}{\text{groups}}, \text{kernel\_size})`. + The values of these weights are sampled from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{in} * \text{kernel\_size}}` + bias (Tensor): the learnable bias of the module of shape + (out_channels). If :attr:`bias` is ``True``, then the values of these weights are + sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{in} * \text{kernel\_size}}` + + Examples:: + + >>> m = nn.Conv1d(16, 33, 3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + + .. 
_cross-correlation: + https://en.wikipedia.org/wiki/Cross-correlation + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + ) + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: str | _size_1_t = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # we create new variables below to make mypy happy since kernel_size has + # type Union[int, Tuple[int]] and kernel_size_ has type Tuple[int] + kernel_size_ = _single(kernel_size) + stride_ = _single(stride) + padding_ = padding if isinstance(padding, str) else _single(padding) + dilation_ = _single(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride_, + padding_, + dilation_, + False, + _single(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Tensor | None): + if self.padding_mode != "zeros": + return F.conv1d( + F.pad( + input, self._reversed_padding_repeated_twice, mode=self.padding_mode + ), + weight, + bias, + self.stride, + _single(0), + self.dilation, + self.groups, + ) + + return F.conv1d( + input, weight, bias, self.stride, self.padding, self.dilation, self.groups + ) + + def forward(self, input: Tensor) -> Tensor: + return self._conv_forward(input, self.weight, self.bias) + + +class Conv2d(_ConvNd): + __doc__ = ( + r"""Applies a 2D convolution over an input signal composed of several input + planes. + + In the simplest case, the output value of the layer with input size + :math:`(N, C_{\text{in}}, H, W)` and output :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` + can be precisely described as: + + .. 
math:: + \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) + + \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k) + + + where :math:`\star` is the valid 2D `cross-correlation`_ operator, + :math:`N` is a batch size, :math:`C` denotes a number of channels, + :math:`H` is a height of input planes in pixels, and :math:`W` is + width in pixels. + """ + + r""" + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + * :attr:`stride` controls the stride for the cross-correlation, a single + number or a tuple. + + * :attr:`padding` controls the amount of padding applied to the input. It + can be either a string {{'valid', 'same'}} or an int / a tuple of ints giving the + amount of implicit padding applied on both sides. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also + known as the \u00e0 trous algorithm. It is harder to describe, but this `link`_ + has a nice visualization of what :attr:`dilation` does. +""" + r""" + + {groups_note} + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + Note: + {depthwise_separable_note} + + Note: + {cudnn_reproducibility_note} + + Note: + ``padding='valid'`` is the same as no padding. ``padding='same'`` pads + the input so the output has the shape as the input. However, this mode + doesn't support any stride values other than 1. + + Note: + This module supports complex data types i.e. ``complex32, complex64, complex128``. 
+ + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to all four sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + """.format(**reproducibility_notes, **convolution_notes) + + r""" + + Shape: + - Input: :math:`(N, C_{in}, H_{in}, W_{in})` or :math:`(C_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C_{out}, H_{out}, W_{out})` or :math:`(C_{out}, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] + \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] + \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor + + Attributes: + weight (Tensor): the learnable weights of the module of shape + :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},` + :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`. + The values of these weights are sampled from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + bias (Tensor): the learnable bias of the module of shape + (out_channels). 
If :attr:`bias` is ``True``, + then the values of these weights are + sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + + Examples: + + >>> # With square kernels and equal stride + >>> m = nn.Conv2d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> # non-square kernels and unequal stride and with padding and dilation + >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + >>> input = torch.randn(20, 16, 50, 100) + >>> output = m(input) + + .. _cross-correlation: + https://en.wikipedia.org/wiki/Cross-correlation + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + ) + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_2_t, + stride: _size_2_t = 1, + padding: str | _size_2_t = 0, + dilation: _size_2_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size_ = _pair(kernel_size) + stride_ = _pair(stride) + padding_ = padding if isinstance(padding, str) else _pair(padding) + dilation_ = _pair(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride_, + padding_, + dilation_, + False, + _pair(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Tensor | None): + if self.padding_mode != "zeros": + return F.conv2d( + F.pad( + input, self._reversed_padding_repeated_twice, mode=self.padding_mode + ), + weight, + bias, + self.stride, + _pair(0), + self.dilation, + self.groups, + ) + + return F.conv2d( + input, weight, bias, self.stride, self.padding, self.dilation, self.groups + ) + + def 
forward(self, input: Tensor) -> Tensor: + return self._conv_forward(input, self.weight, self.bias) + + +class Conv3d(_ConvNd): + __doc__ = ( + r"""Applies a 3D convolution over an input signal composed of several input + planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, D, H, W)` + and output :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` can be precisely described as: + + .. math:: + out(N_i, C_{out_j}) = bias(C_{out_j}) + + \sum_{k = 0}^{C_{in} - 1} weight(C_{out_j}, k) \star input(N_i, k) + + where :math:`\star` is the valid 3D `cross-correlation`_ operator + """ + + r""" + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + * :attr:`stride` controls the stride for the cross-correlation. + + * :attr:`padding` controls the amount of padding applied to the input. It + can be either a string {{'valid', 'same'}} or a tuple of ints giving the + amount of implicit padding applied on both sides. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. +""" + r""" + + {groups_note} + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: + + - a single ``int`` -- in which case the same value is used for the depth, height and width dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + Note: + {depthwise_separable_note} + + Note: + {cudnn_reproducibility_note} + + Note: + ``padding='valid'`` is the same as no padding. ``padding='same'`` pads + the input so the output has the shape as the input. 
However, this mode + doesn't support any stride values other than 1. + + Note: + This module supports complex data types i.e. ``complex32, complex64, complex128``. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to all six sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + """.format(**reproducibility_notes, **convolution_notes) + + r""" + + Shape: + - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` or :math:`(C_{out}, D_{out}, H_{out}, W_{out})`, + where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] + \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] + \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor + + .. 
math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] + \times (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor + + Attributes: + weight (Tensor): the learnable weights of the module of shape + :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},` + :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`. + The values of these weights are sampled from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, + then the values of these weights are + sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + + Examples:: + + >>> # With square kernels and equal stride + >>> m = nn.Conv3d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) + >>> input = torch.randn(20, 16, 10, 50, 100) + >>> output = m(input) + + .. _cross-correlation: + https://en.wikipedia.org/wiki/Cross-correlation + + .. 
_link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + ) + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: str | _size_3_t = 0, + dilation: _size_3_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size_ = _triple(kernel_size) + stride_ = _triple(stride) + padding_ = padding if isinstance(padding, str) else _triple(padding) + dilation_ = _triple(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride_, + padding_, + dilation_, + False, + _triple(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Tensor | None): + if self.padding_mode != "zeros": + return F.conv3d( + F.pad( + input, self._reversed_padding_repeated_twice, mode=self.padding_mode + ), + weight, + bias, + self.stride, + _triple(0), + self.dilation, + self.groups, + ) + + return F.conv3d( + input, weight, bias, self.stride, self.padding, self.dilation, self.groups + ) + + def forward(self, input: Tensor) -> Tensor: + return self._conv_forward(input, self.weight, self.bias) + + +class _ConvTransposeNd(_ConvNd): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + device=None, + dtype=None, + ) -> None: + if padding_mode != "zeros": + raise ValueError( + f'Only "zeros" padding mode is supported for {self.__class__.__name__}' + ) + + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + # dilation being an optional 
parameter is for backwards + # compatibility + def _output_padding( + self, + input: Tensor, + output_size: list[int] | None, + stride: list[int], + padding: list[int], + kernel_size: list[int], + num_spatial_dims: int, + dilation: list[int] | None = None, + ) -> list[int]: + if output_size is None: + ret = _single(self.output_padding) # converting to list if was not already + else: + has_batch_dim = input.dim() == num_spatial_dims + 2 + num_non_spatial_dims = 2 if has_batch_dim else 1 + if len(output_size) == num_non_spatial_dims + num_spatial_dims: + output_size = output_size[num_non_spatial_dims:] + if len(output_size) != num_spatial_dims: + raise ValueError( + f"ConvTranspose{num_spatial_dims}D: for {input.dim()}D input, output_size must have {num_spatial_dims} " + f"or {num_non_spatial_dims + num_spatial_dims} elements (got {len(output_size)})" + ) + + min_sizes = torch.jit.annotate(list[int], []) + max_sizes = torch.jit.annotate(list[int], []) + for d in range(num_spatial_dims): + dim_size = ( + (input.size(d + num_non_spatial_dims) - 1) * stride[d] + - 2 * padding[d] + + (dilation[d] if dilation is not None else 1) + * (kernel_size[d] - 1) + + 1 + ) + min_sizes.append(dim_size) + max_sizes.append(min_sizes[d] + stride[d] - 1) + + for i in range(len(output_size)): + size = output_size[i] + min_size = min_sizes[i] + max_size = max_sizes[i] + if size < min_size or size > max_size: + raise ValueError( + f"requested an output size of {output_size}, but valid sizes range " + f"from {min_sizes} to {max_sizes} (for an input of {input.size()[2:]})" + ) + + res = torch.jit.annotate(list[int], []) + for d in range(num_spatial_dims): + res.append(output_size[d] - min_sizes[d]) + + ret = res + return ret + + +class ConvTranspose1d(_ConvTransposeNd): + __doc__ = ( + r"""Applies a 1D transposed convolution operator over an input image + composed of several input planes. + + This module can be seen as the gradient of Conv1d with respect to its input. 
+ It is also known as a fractionally-strided convolution or + a deconvolution (although it is not an actual deconvolution operation as it does + not compute a true inverse of convolution). For more information, see the visualizations + `here`_ and the `Deconvolutional Networks`_ paper. + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + * :attr:`stride` controls the stride for the cross-correlation. + + * :attr:`padding` controls the amount of implicit zero padding on both + sides for ``dilation * (kernel_size - 1) - padding`` number of points. See note + below for details. + + * :attr:`output_padding` controls the additional size added to one side + of the output shape. See note below for details. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm. + It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does. +""" + r""" + {groups_note} + + Note: + The :attr:`padding` argument effectively adds ``dilation * (kernel_size - 1) - padding`` + amount of zero padding to both sizes of the input. This is set so that + when a :class:`~torch.nn.Conv1d` and a :class:`~torch.nn.ConvTranspose1d` + are initialized with same parameters, they are inverses of each other in + regard to the input and output shapes. However, when ``stride > 1``, + :class:`~torch.nn.Conv1d` maps multiple input shapes to the same output + shape. :attr:`output_padding` is provided to resolve this ambiguity by + effectively increasing the calculated output shape on one side. Note + that :attr:`output_padding` is only used to find output shape, but does + not actually add zero-padding to output. + + Note: + In some circumstances when using the CUDA backend with CuDNN, this operator + may select a nondeterministic algorithm to increase performance. 
If this is + undesirable, you can try to make the operation deterministic (potentially at + a performance cost) by setting ``torch.backends.cudnn.deterministic = + True``. + Please see the notes on :doc:`/notes/randomness` for background. + + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding + will be added to both sides of the input. Default: 0 + output_padding (int or tuple, optional): Additional size added to one side + of the output shape. Default: 0 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + """.format(**reproducibility_notes, **convolution_notes) + + r""" + + Shape: + - Input: :math:`(N, C_{in}, L_{in})` or :math:`(C_{in}, L_{in})` + - Output: :math:`(N, C_{out}, L_{out})` or :math:`(C_{out}, L_{out})`, where + + .. math:: + L_{out} = (L_{in} - 1) \times \text{stride} - 2 \times \text{padding} + \text{dilation} + \times (\text{kernel\_size} - 1) + \text{output\_padding} + 1 + + Attributes: + weight (Tensor): the learnable weights of the module of shape + :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},` + :math:`\text{kernel\_size})`. + The values of these weights are sampled from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{out} * \text{kernel\_size}}` + bias (Tensor): the learnable bias of the module of shape (out_channels). 
+ If :attr:`bias` is ``True``, then the values of these weights are + sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{out} * \text{kernel\_size}}` + + Examples:: + + >>> # With square kernels and equal stride + >>> m = nn.ConvTranspose1d(16, 33, 3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12) + >>> downsample = nn.Conv1d(16, 16, 3, stride=2, padding=1) + >>> upsample = nn.ConvTranspose1d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(input) + >>> h.size() + torch.Size([1, 16, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12]) + + .. _`here`: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + + .. _`Deconvolutional Networks`: + https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf + """ + ) + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + output_padding: _size_1_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_1_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _single(kernel_size) + stride = _single(stride) + padding = _single(padding) + dilation = _single(dilation) + output_padding = _single(output_padding) + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor: + if self.padding_mode != "zeros": + raise ValueError( + "Only `zeros` padding mode is supported for ConvTranspose1d" + ) + + assert isinstance(self.padding, tuple) + # One 
cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + num_spatial_dims = 1 + output_padding = self._output_padding( + input, + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, + self.dilation, # type: ignore[arg-type] + ) + return F.conv_transpose1d( + input, + self.weight, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + + +class ConvTranspose2d(_ConvTransposeNd): + __doc__ = ( + r"""Applies a 2D transposed convolution operator over an input image + composed of several input planes. + + This module can be seen as the gradient of Conv2d with respect to its input. + It is also known as a fractionally-strided convolution or + a deconvolution (although it is not an actual deconvolution operation as it does + not compute a true inverse of convolution). For more information, see the visualizations + `here`_ and the `Deconvolutional Networks`_ paper. + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + * :attr:`stride` controls the stride for the cross-correlation. When stride > 1, ConvTranspose2d inserts zeros between input + elements along the spatial dimensions before applying the convolution kernel. This zero-insertion operation is the standard + behavior of transposed convolutions, which can increase the spatial resolution and is equivalent to a learnable + upsampling operation. + + * :attr:`padding` controls the amount of implicit zero padding on both + sides for ``dilation * (kernel_size - 1) - padding`` number of points. See note + below for details. + + * :attr:`output_padding` controls the additional size added to one side + of the output shape. See note below for details. 
+""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm. + It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does. +""" + r""" + {groups_note} + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` + can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimensions + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + Note: + The :attr:`padding` argument effectively adds ``dilation * (kernel_size - 1) - padding`` + amount of zero padding to both sizes of the input. This is set so that + when a :class:`~torch.nn.Conv2d` and a :class:`~torch.nn.ConvTranspose2d` + are initialized with same parameters, they are inverses of each other in + regard to the input and output shapes. However, when ``stride > 1``, + :class:`~torch.nn.Conv2d` maps multiple input shapes to the same output + shape. :attr:`output_padding` is provided to resolve this ambiguity by + effectively increasing the calculated output shape on one side. Note + that :attr:`output_padding` is only used to find output shape, but does + not actually add zero-padding to output. + + Note: + {cudnn_reproducibility_note} + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding + will be added to both sides of each dimension in the input. Default: 0 + output_padding (int or tuple, optional): Additional size added to one side + of each dimension in the output shape. 
Default: 0 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + """.format(**reproducibility_notes, **convolution_notes) + + r""" + + Shape: + - Input: :math:`(N, C_{in}, H_{in}, W_{in})` or :math:`(C_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C_{out}, H_{out}, W_{out})` or :math:`(C_{out}, H_{out}, W_{out})`, where + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{dilation}[0] + \times (\text{kernel\_size}[0] - 1) + \text{output\_padding}[0] + 1 + .. math:: + W_{out} = (W_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{dilation}[1] + \times (\text{kernel\_size}[1] - 1) + \text{output\_padding}[1] + 1 + + Attributes: + weight (Tensor): the learnable weights of the module of shape + :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},` + :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`. 
+ The values of these weights are sampled from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + bias (Tensor): the learnable bias of the module of shape (out_channels) + If :attr:`bias` is ``True``, then the values of these weights are + sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + + Examples:: + + >>> # With square kernels and equal stride + >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + >>> input = torch.randn(20, 16, 50, 100) + >>> output = m(input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12, 12) + >>> downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1) + >>> upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(input) + >>> h.size() + torch.Size([1, 16, 6, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12, 12]) + + .. _`here`: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + + .. 
_`Deconvolutional Networks`: + https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf + """ + ) + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_2_t, + stride: _size_2_t = 1, + padding: _size_2_t = 0, + output_padding: _size_2_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_2_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + output_padding = _pair(output_padding) + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor: + """ + Performs the forward pass. + + Attributes: + input (Tensor): The input tensor. + output_size (list[int], optional): A list of integers representing + the size of the output tensor. Default is None. + """ + if self.padding_mode != "zeros": + raise ValueError( + "Only `zeros` padding mode is supported for ConvTranspose2d" + ) + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
+ num_spatial_dims = 2 + output_padding = self._output_padding( + input, + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, + self.dilation, # type: ignore[arg-type] + ) + + return F.conv_transpose2d( + input, + self.weight, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + + +class ConvTranspose3d(_ConvTransposeNd): + __doc__ = ( + r"""Applies a 3D transposed convolution operator over an input image composed of several input + planes. + The transposed convolution operator multiplies each input value element-wise by a learnable kernel, + and sums over the outputs from all input feature planes. + + This module can be seen as the gradient of Conv3d with respect to its input. + It is also known as a fractionally-strided convolution or + a deconvolution (although it is not an actual deconvolution operation as it does + not compute a true inverse of convolution). For more information, see the visualizations + `here`_ and the `Deconvolutional Networks`_ paper. + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + * :attr:`stride` controls the stride for the cross-correlation. + + * :attr:`padding` controls the amount of implicit zero padding on both + sides for ``dilation * (kernel_size - 1) - padding`` number of points. See note + below for details. + + * :attr:`output_padding` controls the additional size added to one side + of the output shape. See note below for details. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm. + It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does. 
+""" + r""" + {groups_note} + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` + can either be: + + - a single ``int`` -- in which case the same value is used for the depth, height and width dimensions + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + Note: + The :attr:`padding` argument effectively adds ``dilation * (kernel_size - 1) - padding`` + amount of zero padding to both sizes of the input. This is set so that + when a :class:`~torch.nn.Conv3d` and a :class:`~torch.nn.ConvTranspose3d` + are initialized with same parameters, they are inverses of each other in + regard to the input and output shapes. However, when ``stride > 1``, + :class:`~torch.nn.Conv3d` maps multiple input shapes to the same output + shape. :attr:`output_padding` is provided to resolve this ambiguity by + effectively increasing the calculated output shape on one side. Note + that :attr:`output_padding` is only used to find output shape, but does + not actually add zero-padding to output. + + Note: + {cudnn_reproducibility_note} + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding + will be added to both sides of each dimension in the input. Default: 0 + output_padding (int or tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. 
Default: ``True`` + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + """.format(**reproducibility_notes, **convolution_notes) + + r""" + + Shape: + - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` or + :math:`(C_{out}, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{dilation}[0] + \times (\text{kernel\_size}[0] - 1) + \text{output\_padding}[0] + 1 + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{dilation}[1] + \times (\text{kernel\_size}[1] - 1) + \text{output\_padding}[1] + 1 + .. math:: + W_{out} = (W_{in} - 1) \times \text{stride}[2] - 2 \times \text{padding}[2] + \text{dilation}[2] + \times (\text{kernel\_size}[2] - 1) + \text{output\_padding}[2] + 1 + + + Attributes: + weight (Tensor): the learnable weights of the module of shape + :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},` + :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`. + The values of these weights are sampled from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + bias (Tensor): the learnable bias of the module of shape (out_channels) + If :attr:`bias` is ``True``, then the values of these weights are + sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + + Examples:: + + >>> # With square kernels and equal stride + >>> m = nn.ConvTranspose3d(16, 33, 3, stride=2) + >>> # non-square kernels and unequal stride and with padding + >>> m = nn.ConvTranspose3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(0, 4, 2)) + >>> input = torch.randn(20, 16, 10, 50, 100) + >>> output = m(input) + + .. 
_`here`: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + + .. _`Deconvolutional Networks`: + https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf + """ + ) + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: _size_3_t = 0, + output_padding: _size_3_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_3_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + output_padding = _triple(output_padding) + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor: + if self.padding_mode != "zeros": + raise ValueError( + "Only `zeros` padding mode is supported for ConvTranspose3d" + ) + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + num_spatial_dims = 3 + output_padding = self._output_padding( + input, + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, + self.dilation, # type: ignore[arg-type] + ) + + return F.conv_transpose3d( + input, + self.weight, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + + +# TODO: Deprecate and remove the following alias `_ConvTransposeMixin`. +# +# `_ConvTransposeMixin` was a mixin that was removed. 
It is meant to be used +# with `_ConvNd` to construct actual module classes that implements conv +# transpose ops: +# +# class MyConvTranspose(_ConvNd, _ConvTransposeMixin): +# ... +# +# In PyTorch, it has been replaced by `_ConvTransposeNd`, which is a proper +# subclass of `_ConvNd`. However, some user code in the wild still (incorrectly) +# use the internal class `_ConvTransposeMixin`. Hence, we provide this alias +# for BC, because it is cheap and easy for us to do so, even though that +# `_ConvTransposeNd` is really not a mixin anymore (but multiple inheritance as +# above would still work). +class _ConvTransposeMixin(_ConvTransposeNd): + @deprecated( + "`_ConvTransposeMixin` is a deprecated internal class. " + "Please consider using public APIs.", + category=FutureWarning, + ) + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + +# TODO: Conv2dLocal +# TODO: Conv2dMap +# TODO: ConvTranspose2dMap + + +class _LazyConvXdMixin(LazyModuleMixin): + groups: int + transposed: bool + in_channels: int + out_channels: int + kernel_size: tuple[int, ...] + weight: UninitializedParameter + bias: UninitializedParameter + + def reset_parameters(self) -> None: + # has_uninitialized_params is defined in parent class and it is using a protocol on self + if not self.has_uninitialized_params() and self.in_channels != 0: # type: ignore[misc] + # "type:ignore[..]" is required because mypy thinks that "reset_parameters" is undefined + # in super class. 
Turns out that it is defined in _ConvND which is inherited by any class + # that also inherits _LazyConvXdMixin + super().reset_parameters() # type: ignore[misc] + + # Signature of "initialize_parameters" is incompatible with the definition in supertype LazyModuleMixin + def initialize_parameters(self, input: Tensor, *args, **kwargs) -> None: # type: ignore[override] + # defined by parent class but using a protocol + if self.has_uninitialized_params(): # type: ignore[misc] + self.in_channels = self._get_in_channels(input) + if self.in_channels % self.groups != 0: + raise ValueError("in_channels must be divisible by groups") + assert isinstance(self.weight, UninitializedParameter) + if self.transposed: + self.weight.materialize( + ( + self.in_channels, + self.out_channels // self.groups, + *self.kernel_size, + ) + ) + else: + self.weight.materialize( + ( + self.out_channels, + self.in_channels // self.groups, + *self.kernel_size, + ) + ) + if self.bias is not None: + assert isinstance(self.bias, UninitializedParameter) + self.bias.materialize((self.out_channels,)) + self.reset_parameters() + + # Function to extract in_channels from first input. + def _get_in_channels(self, input: Tensor) -> int: + num_spatial_dims = self._get_num_spatial_dims() + num_dims_no_batch = num_spatial_dims + 1 # +1 for channels dim + num_dims_batch = num_dims_no_batch + 1 + if input.dim() not in (num_dims_no_batch, num_dims_batch): + raise RuntimeError( + f"Expected {num_dims_no_batch}D (unbatched) or {num_dims_batch}D (batched) input " + f"to {self.__class__.__name__}, but " + f"got input of size: {input.shape}" + ) + return input.shape[1] if input.dim() == num_dims_batch else input.shape[0] + + # Function to return the number of spatial dims expected for inputs to the module. + # This is expected to be implemented by subclasses. 
+ def _get_num_spatial_dims(self) -> int: + raise NotImplementedError + + +# LazyConv1d defines weight as a Tensor but derived class defines it as UninitializeParameter +class LazyConv1d(_LazyConvXdMixin, Conv1d): # type: ignore[misc] + r"""A :class:`torch.nn.Conv1d` module with lazy initialization of the ``in_channels`` argument. + + The ``in_channels`` argument of the :class:`Conv1d` is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight` and `bias`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel + elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + + .. seealso:: :class:`torch.nn.Conv1d` and :class:`torch.nn.modules.lazy.LazyModuleMixin` + """ + + # super class define this variable as None. "type: ignore[..] is required + # since we are redefining the variable. 
+ cls_to_become = Conv1d # type: ignore[assignment] + + def __init__( + self, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + 0, + 0, + kernel_size, + stride, + padding, + dilation, + groups, + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. + False, + padding_mode, + **factory_kwargs, + ) + # pyrefly: ignore [bad-override, bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_channels = out_channels + if bias: + # pyrefly: ignore [bad-override, bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def _get_num_spatial_dims(self) -> int: + return 1 + + +# LazyConv2d defines weight as a Tensor but derived class defines it as UninitializeParameter +class LazyConv2d(_LazyConvXdMixin, Conv2d): # type: ignore[misc] + r"""A :class:`torch.nn.Conv2d` module with lazy initialization of the ``in_channels`` argument. + + The ``in_channels`` argument of the :class:`Conv2d` that is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight` and `bias`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel + elements. 
Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + + .. seealso:: :class:`torch.nn.Conv2d` and :class:`torch.nn.modules.lazy.LazyModuleMixin` + """ + + # super class define this variable as None. "type: ignore[..] is required + # since we are redefining the variable. + cls_to_become = Conv2d # type: ignore[assignment] + + def __init__( + self, + out_channels: int, + kernel_size: _size_2_t, + stride: _size_2_t = 1, + padding: _size_2_t = 0, + dilation: _size_2_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + 0, + 0, + kernel_size, + stride, + padding, + dilation, + groups, + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. + False, + padding_mode, + **factory_kwargs, + ) + # pyrefly: ignore [bad-override, bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_channels = out_channels + if bias: + # pyrefly: ignore [bad-override, bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def _get_num_spatial_dims(self) -> int: + return 2 + + +# LazyConv3d defines weight as a Tensor but derived class defines it as UninitializeParameter +class LazyConv3d(_LazyConvXdMixin, Conv3d): # type: ignore[misc] + r"""A :class:`torch.nn.Conv3d` module with lazy initialization of the ``in_channels`` argument. + + The ``in_channels`` argument of the :class:`Conv3d` that is inferred from + the ``input.size(1)``. + The attributes that will be lazily initialized are `weight` and `bias`. 
+ + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel + elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + + .. seealso:: :class:`torch.nn.Conv3d` and :class:`torch.nn.modules.lazy.LazyModuleMixin` + """ + + # super class define this variable as None. "type: ignore[..] is required + # since we are redefining the variable. + cls_to_become = Conv3d # type: ignore[assignment] + + def __init__( + self, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: _size_3_t = 0, + dilation: _size_3_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + 0, + 0, + kernel_size, + stride, + padding, + dilation, + groups, + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. 
+ False, + padding_mode, + **factory_kwargs, + ) + # pyrefly: ignore [bad-override, bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_channels = out_channels + if bias: + # pyrefly: ignore [bad-override, bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def _get_num_spatial_dims(self) -> int: + return 3 + + +# LazyConvTranspose1d defines weight as a Tensor but derived class defines it as UninitializeParameter +class LazyConvTranspose1d(_LazyConvXdMixin, ConvTranspose1d): # type: ignore[misc] + r"""A :class:`torch.nn.ConvTranspose1d` module with lazy initialization of the ``in_channels`` argument. + + The ``in_channels`` argument of the :class:`ConvTranspose1d` that is inferred from + the ``input.size(1)``. + The attributes that will be lazily initialized are `weight` and `bias`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding + will be added to both sides of the input. Default: 0 + output_padding (int or tuple, optional): Additional size added to one side + of the output shape. Default: 0 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + + .. seealso:: :class:`torch.nn.ConvTranspose1d` and :class:`torch.nn.modules.lazy.LazyModuleMixin` + """ + + # super class define this variable as None. "type: ignore[..] is required + # since we are redefining the variable. 
+ cls_to_become = ConvTranspose1d # type: ignore[assignment] + + def __init__( + self, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + output_padding: _size_1_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_1_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + 0, + 0, + kernel_size, + stride, + padding, + output_padding, + groups, + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. + False, + dilation, + padding_mode, + **factory_kwargs, + ) + # pyrefly: ignore [bad-override, bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_channels = out_channels + if bias: + # pyrefly: ignore [bad-override, bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def _get_num_spatial_dims(self) -> int: + return 1 + + +# LazyConvTranspose2d defines weight as a Tensor but derived class defines it as UninitializeParameter +class LazyConvTranspose2d(_LazyConvXdMixin, ConvTranspose2d): # type: ignore[misc] + r"""A :class:`torch.nn.ConvTranspose2d` module with lazy initialization of the ``in_channels`` argument. + + The ``in_channels`` argument of the :class:`ConvTranspose2d` is inferred from + the ``input.size(1)``. + The attributes that will be lazily initialized are `weight` and `bias`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. 
Default: 1 + padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding + will be added to both sides of each dimension in the input. Default: 0 + output_padding (int or tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + + .. seealso:: :class:`torch.nn.ConvTranspose2d` and :class:`torch.nn.modules.lazy.LazyModuleMixin` + """ + + # super class define this variable as None. "type: ignore[..] is required + # since we are redefining the variable. + cls_to_become = ConvTranspose2d # type: ignore[assignment] + + def __init__( + self, + out_channels: int, + kernel_size: _size_2_t, + stride: _size_2_t = 1, + padding: _size_2_t = 0, + output_padding: _size_2_t = 0, + groups: int = 1, + bias: bool = True, + dilation: int = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + 0, + 0, + kernel_size, + stride, + padding, + output_padding, + groups, + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. 
+ False, + dilation, + padding_mode, + **factory_kwargs, + ) + # pyrefly: ignore [bad-override, bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_channels = out_channels + if bias: + # pyrefly: ignore [bad-override, bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def _get_num_spatial_dims(self) -> int: + return 2 + + +# LazyConvTranspose3d defines weight as a Tensor but derived class defines it as UninitializeParameter +class LazyConvTranspose3d(_LazyConvXdMixin, ConvTranspose3d): # type: ignore[misc] + r"""A :class:`torch.nn.ConvTranspose3d` module with lazy initialization of the ``in_channels`` argument. + + The ``in_channels`` argument of the :class:`ConvTranspose3d` is inferred from + the ``input.size(1)``. + The attributes that will be lazily initialized are `weight` and `bias`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding + will be added to both sides of each dimension in the input. Default: 0 + output_padding (int or tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0 + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + + .. seealso:: :class:`torch.nn.ConvTranspose3d` and :class:`torch.nn.modules.lazy.LazyModuleMixin` + """ + + # super class define this variable as None. "type: ignore[..] 
is required + # since we are redefining the variable. + cls_to_become = ConvTranspose3d # type: ignore[assignment] + + def __init__( + self, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: _size_3_t = 0, + output_padding: _size_3_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_3_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # pyrefly: ignore [bad-argument-type] + super().__init__( + 0, + 0, + kernel_size, + stride, + padding, + output_padding, + groups, + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. + False, + dilation, + padding_mode, + **factory_kwargs, + ) + # pyrefly: ignore [bad-override, bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_channels = out_channels + if bias: + # pyrefly: ignore [bad-override, bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def _get_num_spatial_dims(self) -> int: + return 3 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/distance.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/distance.py new file mode 100644 index 0000000000000000000000000000000000000000..27ab92fef5eb4a6da80d97d8559a204a6956ac4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/distance.py @@ -0,0 +1,100 @@ +import torch.nn.functional as F +from torch import Tensor + +from .module import Module + + +__all__ = ["PairwiseDistance", "CosineSimilarity"] + + +class PairwiseDistance(Module): + r""" + Computes the pairwise distance between input vectors, or between columns of input matrices. + + Distances are computed using ``p``-norm, with constant ``eps`` added to avoid division by zero + if ``p`` is negative, i.e.: + + .. 
math :: + \mathrm{dist}\left(x, y\right) = \left\Vert x-y + \epsilon e \right\Vert_p, + + where :math:`e` is the vector of ones and the ``p``-norm is given by. + + .. math :: + \Vert x \Vert _p = \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p}. + + Args: + p (real, optional): the norm degree. Can be negative. Default: 2 + eps (float, optional): Small value to avoid division by zero. + Default: 1e-6 + keepdim (bool, optional): Determines whether or not to keep the vector dimension. + Default: False + Shape: + - Input1: :math:`(N, D)` or :math:`(D)` where `N = batch dimension` and `D = vector dimension` + - Input2: :math:`(N, D)` or :math:`(D)`, same shape as the Input1 + - Output: :math:`(N)` or :math:`()` based on input dimension. + If :attr:`keepdim` is ``True``, then :math:`(N, 1)` or :math:`(1)` based on input dimension. + + Examples: + >>> pdist = nn.PairwiseDistance(p=2) + >>> input1 = torch.randn(100, 128) + >>> input2 = torch.randn(100, 128) + >>> output = pdist(input1, input2) + """ + + __constants__ = ["norm", "eps", "keepdim"] + norm: float + eps: float + keepdim: bool + + def __init__( + self, p: float = 2.0, eps: float = 1e-6, keepdim: bool = False + ) -> None: + super().__init__() + self.norm = p + self.eps = eps + self.keepdim = keepdim + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.pairwise_distance(x1, x2, self.norm, self.eps, self.keepdim) + + +class CosineSimilarity(Module): + r"""Returns cosine similarity between :math:`x_1` and :math:`x_2`, computed along `dim`. + + .. math :: + \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)}. + + Args: + dim (int, optional): Dimension where cosine similarity is computed. Default: 1 + eps (float, optional): Small value to avoid division by zero. 
+ Default: 1e-8 + Shape: + - Input1: :math:`(\ast_1, D, \ast_2)` where D is at position `dim` + - Input2: :math:`(\ast_1, D, \ast_2)`, same number of dimensions as x1, matching x1 size at dimension `dim`, + and broadcastable with x1 at other dimensions. + - Output: :math:`(\ast_1, \ast_2)` + + Examples: + >>> input1 = torch.randn(100, 128) + >>> input2 = torch.randn(100, 128) + >>> cos = nn.CosineSimilarity(dim=1, eps=1e-6) + >>> output = cos(input1, input2) + """ + + __constants__ = ["dim", "eps"] + dim: int + eps: float + + def __init__(self, dim: int = 1, eps: float = 1e-8) -> None: + super().__init__() + self.dim = dim + self.eps = eps + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.cosine_similarity(x1, x2, self.dim, self.eps) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/dropout.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/dropout.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3de5d61dc0b56d6f708a242611bfc5b2850288 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/dropout.py @@ -0,0 +1,323 @@ +import torch.nn.functional as F +from torch import Tensor + +from .module import Module + + +__all__ = [ + "Dropout", + "Dropout1d", + "Dropout2d", + "Dropout3d", + "AlphaDropout", + "FeatureAlphaDropout", +] + + +class _DropoutNd(Module): + __constants__ = ["p", "inplace"] + p: float + inplace: bool + + def __init__(self, p: float = 0.5, inplace: bool = False) -> None: + super().__init__() + if p < 0 or p > 1: + raise ValueError( + f"dropout probability has to be between 0 and 1, but got {p}" + ) + self.p = p + self.inplace = inplace + + def extra_repr(self) -> str: + return f"p={self.p}, inplace={self.inplace}" + + +class Dropout(_DropoutNd): + r"""During training, randomly zeroes some of the elements of the input tensor with probability :attr:`p`. 
+ + The zeroed elements are chosen independently for each forward call and are sampled from a Bernoulli distribution. + + Each channel will be zeroed out independently on every forward call. + + This has proven to be an effective technique for regularization and + preventing the co-adaptation of neurons as described in the paper + `Improving neural networks by preventing co-adaptation of feature + detectors`_ . + + Furthermore, the outputs are scaled by a factor of :math:`\frac{1}{1-p}` during + training. This means that during evaluation the module simply computes an + identity function. + + Args: + p: probability of an element to be zeroed. Default: 0.5 + inplace: If set to ``True``, will do this operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`. Input can be of any shape + - Output: :math:`(*)`. Output is of the same shape as input + + Examples:: + + >>> m = nn.Dropout(p=0.2) + >>> input = torch.randn(20, 16) + >>> output = m(input) + + .. _Improving neural networks by preventing co-adaptation of feature + detectors: https://arxiv.org/abs/1207.0580 + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.dropout(input, self.p, self.training, self.inplace) + + +class Dropout1d(_DropoutNd): + r"""Randomly zero out entire channels. + + A channel is a 1D feature map, + e.g., the :math:`j`-th channel of the :math:`i`-th sample in the + batched input is a 1D tensor :math:`\text{input}[i, j]`. + + Each channel will be zeroed out independently on every forward call with + probability :attr:`p` using samples from a Bernoulli distribution. + + Usually the input comes from :class:`nn.Conv1d` modules. + + As described in the paper + `Efficient Object Localization Using Convolutional Networks`_ , + if adjacent pixels within feature maps are strongly correlated + (as is normally the case in early convolution layers) then i.i.d. 
dropout + will not regularize the activations and will otherwise just result + in an effective learning rate decrease. + + In this case, :func:`nn.Dropout1d` will help promote independence between + feature maps and should be used instead. + + Args: + p (float, optional): probability of an element to be zero-ed. + inplace (bool, optional): If set to ``True``, will do this operation + in-place + + Shape: + - Input: :math:`(N, C, L)` or :math:`(C, L)`. + - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input). + + Examples:: + + >>> m = nn.Dropout1d(p=0.2) + >>> input = torch.randn(20, 16, 32) + >>> output = m(input) + + .. _Efficient Object Localization Using Convolutional Networks: + https://arxiv.org/abs/1411.4280 + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.dropout1d(input, self.p, self.training, self.inplace) + + +class Dropout2d(_DropoutNd): + r"""Randomly zero out entire channels. + + A channel is a 2D feature map, + e.g., the :math:`j`-th channel of the :math:`i`-th sample in the + batched input is a 2D tensor :math:`\text{input}[i, j]`. + + Each channel will be zeroed out independently on every forward call with + probability :attr:`p` using samples from a Bernoulli distribution. + + Usually the input comes from :class:`nn.Conv2d` modules. + + As described in the paper + `Efficient Object Localization Using Convolutional Networks`_ , + if adjacent pixels within feature maps are strongly correlated + (as is normally the case in early convolution layers) then i.i.d. dropout + will not regularize the activations and will otherwise just result + in an effective learning rate decrease. + + In this case, :func:`nn.Dropout2d` will help promote independence between + feature maps and should be used instead. + + Args: + p (float, optional): probability of an element to be zero-ed. + inplace (bool, optional): If set to ``True``, will do this operation + in-place + + .. 
warning :: + Due to historical reasons, this class will perform 1D channel-wise dropout + for 3D inputs (as done by :class:`nn.Dropout1d`). Thus, it currently does NOT + support inputs without a batch dimension of shape :math:`(C, H, W)`. This + behavior will change in a future release to interpret 3D inputs as no-batch-dim + inputs. To maintain the old behavior, switch to :class:`nn.Dropout1d`. + + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(N, C, L)`. + - Output: :math:`(N, C, H, W)` or :math:`(N, C, L)` (same shape as input). + + Examples:: + + >>> m = nn.Dropout2d(p=0.2) + >>> input = torch.randn(20, 16, 32, 32) + >>> output = m(input) + + .. _Efficient Object Localization Using Convolutional Networks: + https://arxiv.org/abs/1411.4280 + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.dropout2d(input, self.p, self.training, self.inplace) + + +class Dropout3d(_DropoutNd): + r"""Randomly zero out entire channels. + + A channel is a 3D feature map, + e.g., the :math:`j`-th channel of the :math:`i`-th sample in the + batched input is a 3D tensor :math:`\text{input}[i, j]`. + + Each channel will be zeroed out independently on every forward call with + probability :attr:`p` using samples from a Bernoulli distribution. + + Usually the input comes from :class:`nn.Conv3d` modules. + + As described in the paper + `Efficient Object Localization Using Convolutional Networks`_ , + if adjacent pixels within feature maps are strongly correlated + (as is normally the case in early convolution layers) then i.i.d. dropout + will not regularize the activations and will otherwise just result + in an effective learning rate decrease. + + In this case, :func:`nn.Dropout3d` will help promote independence between + feature maps and should be used instead. + + Args: + p (float, optional): probability of an element to be zeroed. 
+ inplace (bool, optional): If set to ``True``, will do this operation + in-place + + Shape: + - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`. + - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input). + + Examples:: + + >>> m = nn.Dropout3d(p=0.2) + >>> input = torch.randn(20, 16, 4, 32, 32) + >>> output = m(input) + + .. _Efficient Object Localization Using Convolutional Networks: + https://arxiv.org/abs/1411.4280 + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.dropout3d(input, self.p, self.training, self.inplace) + + +class AlphaDropout(_DropoutNd): + r"""Applies Alpha Dropout over the input. + + Alpha Dropout is a type of Dropout that maintains the self-normalizing + property. + For an input with zero mean and unit standard deviation, the output of + Alpha Dropout maintains the original mean and standard deviation of the + input. + Alpha Dropout goes hand-in-hand with SELU activation function, which ensures + that the outputs have zero mean and unit standard deviation. + + During training, it randomly masks some of the elements of the input + tensor with probability *p* using samples from a bernoulli distribution. + The elements to masked are randomized on every forward call, and scaled + and shifted to maintain zero mean and unit standard deviation. + + During evaluation the module simply computes an identity function. + + More details can be found in the paper `Self-Normalizing Neural Networks`_ . + + Args: + p (float): probability of an element to be dropped. Default: 0.5 + inplace (bool, optional): If set to ``True``, will do this operation + in-place + + Shape: + - Input: :math:`(*)`. Input can be of any shape + - Output: :math:`(*)`. Output is of the same shape as input + + Examples:: + + >>> m = nn.AlphaDropout(p=0.2) + >>> input = torch.randn(20, 16) + >>> output = m(input) + + .. 
_Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.alpha_dropout(input, self.p, self.training) + + +class FeatureAlphaDropout(_DropoutNd): + r"""Randomly masks out entire channels. + + A channel is a feature map, + e.g. the :math:`j`-th channel of the :math:`i`-th sample in the batch input + is a tensor :math:`\text{input}[i, j]` of the input tensor). Instead of + setting activations to zero, as in regular Dropout, the activations are set + to the negative saturation value of the SELU activation function. More details + can be found in the paper `Self-Normalizing Neural Networks`_ . + + Each element will be masked independently for each sample on every forward + call with probability :attr:`p` using samples from a Bernoulli distribution. + The elements to be masked are randomized on every forward call, and scaled + and shifted to maintain zero mean and unit variance. + + Usually the input comes from :class:`nn.AlphaDropout` modules. + + As described in the paper + `Efficient Object Localization Using Convolutional Networks`_ , + if adjacent pixels within feature maps are strongly correlated + (as is normally the case in early convolution layers) then i.i.d. dropout + will not regularize the activations and will otherwise just result + in an effective learning rate decrease. + + In this case, :func:`nn.AlphaDropout` will help promote independence between + feature maps and should be used instead. + + Args: + p (float, optional): probability of an element to be zeroed. Default: 0.5 + inplace (bool, optional): If set to ``True``, will do this operation + in-place + + Shape: + - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`. + - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input). + + Examples:: + + >>> m = nn.FeatureAlphaDropout(p=0.2) + >>> input = torch.randn(20, 16, 4, 32, 32) + >>> output = m(input) + + .. 
_Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 + .. _Efficient Object Localization Using Convolutional Networks: + https://arxiv.org/abs/1411.4280 + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.feature_alpha_dropout(input, self.p, self.training) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/flatten.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/flatten.py new file mode 100644 index 0000000000000000000000000000000000000000..146a1890d422475712c9d62d0ff841530282d30e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/flatten.py @@ -0,0 +1,167 @@ +# mypy: allow-untyped-defs + +from torch import Tensor +from torch.types import _size + +from .module import Module + + +__all__ = ["Flatten", "Unflatten"] + + +class Flatten(Module): + r""" + Flattens a contiguous range of dims into a tensor. + + For use with :class:`~nn.Sequential`, see :meth:`torch.flatten` for details. + + Shape: + - Input: :math:`(*, S_{\text{start}},..., S_{i}, ..., S_{\text{end}}, *)`,' + where :math:`S_{i}` is the size at dimension :math:`i` and :math:`*` means any + number of dimensions including none. + - Output: :math:`(*, \prod_{i=\text{start}}^{\text{end}} S_{i}, *)`. + + Args: + start_dim: first dim to flatten (default = 1). + end_dim: last dim to flatten (default = -1). 
+ + Examples:: + >>> input = torch.randn(32, 1, 5, 5) + >>> # With default parameters + >>> m = nn.Flatten() + >>> output = m(input) + >>> output.size() + torch.Size([32, 25]) + >>> # With non-default parameters + >>> m = nn.Flatten(0, 2) + >>> output = m(input) + >>> output.size() + torch.Size([160, 5]) + """ + + __constants__ = ["start_dim", "end_dim"] + start_dim: int + end_dim: int + + def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None: + super().__init__() + self.start_dim = start_dim + self.end_dim = end_dim + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return input.flatten(self.start_dim, self.end_dim) + + def extra_repr(self) -> str: + """ + Returns the extra representation of the module. + """ + return f"start_dim={self.start_dim}, end_dim={self.end_dim}" + + +class Unflatten(Module): + r""" + Unflattens a tensor dim expanding it to a desired shape. For use with :class:`~nn.Sequential`. + + * :attr:`dim` specifies the dimension of the input tensor to be unflattened, and it can + be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively. + + * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be + a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape` + (tuple of `(name, size)` tuples) for `NamedTensor` input. + + Shape: + - Input: :math:`(*, S_{\text{dim}}, *)`, where :math:`S_{\text{dim}}` is the size at + dimension :attr:`dim` and :math:`*` means any number of dimensions including none. + - Output: :math:`(*, U_1, ..., U_n, *)`, where :math:`U` = :attr:`unflattened_size` and + :math:`\prod_{i=1}^n U_i = S_{\text{dim}}`. 
+ + Args: + dim (Union[int, str]): Dimension to be unflattened + unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension + + Examples: + >>> input = torch.randn(2, 50) + >>> # With tuple of ints + >>> m = nn.Sequential( + >>> nn.Linear(50, 50), + >>> nn.Unflatten(1, (2, 5, 5)) + >>> ) + >>> output = m(input) + >>> output.size() + torch.Size([2, 2, 5, 5]) + >>> # With torch.Size + >>> m = nn.Sequential( + >>> nn.Linear(50, 50), + >>> nn.Unflatten(1, torch.Size([2, 5, 5])) + >>> ) + >>> output = m(input) + >>> output.size() + torch.Size([2, 2, 5, 5]) + >>> # With namedshape (tuple of tuples) + >>> input = torch.randn(2, 50, names=("N", "features")) + >>> unflatten = nn.Unflatten("features", (("C", 2), ("H", 5), ("W", 5))) + >>> output = unflatten(input) + >>> output.size() + torch.Size([2, 2, 5, 5]) + """ + + NamedShape = tuple[tuple[str, int]] + + __constants__ = ["dim", "unflattened_size"] + dim: int | str + unflattened_size: _size | NamedShape + + def __init__(self, dim: int | str, unflattened_size: _size | NamedShape) -> None: + super().__init__() + + if isinstance(dim, int): + self._require_tuple_int(unflattened_size) + elif isinstance(dim, str): + self._require_tuple_tuple(unflattened_size) + else: + raise TypeError("invalid argument type for dim parameter") + + self.dim = dim + self.unflattened_size = unflattened_size + + def _require_tuple_tuple(self, input) -> None: + if isinstance(input, tuple): + for idx, elem in enumerate(input): + if not isinstance(elem, tuple): + raise TypeError( + "unflattened_size must be tuple of tuples, " + + f"but found element of type {type(elem).__name__} at pos {idx}" + ) + return + raise TypeError( + "unflattened_size must be a tuple of tuples, " + + f"but found type {type(input).__name__}" + ) + + def _require_tuple_int(self, input) -> None: + if isinstance(input, (tuple, list)): + for idx, elem in enumerate(input): + if not isinstance(elem, int): + raise TypeError( + 
"unflattened_size must be tuple of ints, " + + f"but found element of type {type(elem).__name__} at pos {idx}" + ) + return + raise TypeError( + f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}" + ) + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return input.unflatten(self.dim, self.unflattened_size) + + def extra_repr(self) -> str: + """ + Returns the extra representation of the module. + """ + return f"dim={self.dim}, unflattened_size={self.unflattened_size}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/fold.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/fold.py new file mode 100644 index 0000000000000000000000000000000000000000..ab1a58882c852370141e1e1dd911278334b425d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/fold.py @@ -0,0 +1,335 @@ +import torch.nn.functional as F +from torch import Tensor +from torch.nn.common_types import _size_any_t + +from .module import Module + + +__all__ = ["Fold", "Unfold"] + + +class Fold(Module): + ( + r"""Combines an array of sliding local blocks into a large containing tensor. + + Consider a batched :attr:`input` tensor containing sliding local blocks, + e.g., patches of images, of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`, + where :math:`N` is batch dimension, :math:`C \times \prod(\text{kernel\_size})` + is the number of values within a block (a block has :math:`\prod(\text{kernel\_size})` + spatial locations each containing a :math:`C`-channeled vector), and + :math:`L` is the total number of blocks. (This is exactly the + same specification as the output shape of :class:`~torch.nn.Unfold`.) This + operation combines these local blocks into the large :attr:`output` tensor + of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)` + by summing the overlapping values. 
Similar to :class:`~torch.nn.Unfold`, the + arguments must satisfy + + .. math:: + L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] % + - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor, + + where :math:`d` is over all spatial dimensions. + + * :attr:`output_size` describes the spatial shape of the large containing + tensor of the sliding local blocks. It is useful to resolve the ambiguity + when multiple input shapes map to same number of sliding blocks, e.g., + with ``stride > 0``. + + The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify + how the sliding blocks are retrieved. + + * :attr:`stride` controls the stride for the sliding blocks. + + * :attr:`padding` controls the amount of implicit zero-paddings on both + sides for :attr:`padding` number of points for each dimension before + reshaping. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. +""" + r""" + Args: + output_size (int or tuple): the shape of the spatial dimensions of the + output (i.e., ``output.sizes()[2:]``) + kernel_size (int or tuple): the size of the sliding blocks + dilation (int or tuple, optional): a parameter that controls the + stride of elements within the + neighborhood. Default: 1 + padding (int or tuple, optional): implicit zero padding to be added on + both sides of input. Default: 0 + stride (int or tuple): the stride of the sliding blocks in the input + spatial dimensions. Default: 1 + + * If :attr:`output_size`, :attr:`kernel_size`, :attr:`dilation`, + :attr:`padding` or :attr:`stride` is an int or a tuple of length 1 then + their values will be replicated across all spatial dimensions. + + * For the case of two output spatial dimensions this operation is sometimes + called ``col2im``. + + .. 
note:: + :class:`~torch.nn.Fold` calculates each combined value in the resulting + large tensor by summing all values from all containing blocks. + :class:`~torch.nn.Unfold` extracts the values in the local blocks by + copying from the large tensor. So, if the blocks overlap, they are not + inverses of each other. + + In general, folding and unfolding operations are related as + follows. Consider :class:`~torch.nn.Fold` and + :class:`~torch.nn.Unfold` instances created with the same + parameters: + + >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...) + >>> fold = nn.Fold(output_size=..., **fold_params) + >>> unfold = nn.Unfold(**fold_params) + + Then for any (supported) ``input`` tensor the following + equality holds: + + :: + + fold(unfold(input)) == divisor * input + + where ``divisor`` is a tensor that depends only on the shape + and dtype of the ``input``: + + >>> # xdoctest: +SKIP + >>> input_ones = torch.ones(input.shape, dtype=input.dtype) + >>> divisor = fold(unfold(input_ones)) + + When the ``divisor`` tensor contains no zero elements, then + ``fold`` and ``unfold`` operations are inverses of each + other (up to constant divisor). + + .. warning:: + Currently, only unbatched (3D) or batched (4D) image-like output tensors are supported. + + Shape: + - Input: :math:`(N, C \times \prod(\text{kernel\_size}), L)` or :math:`(C \times \prod(\text{kernel\_size}), L)` + - Output: :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)` + or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above + + Examples:: + + >>> fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 2)) + >>> input = torch.randn(1, 3 * 2 * 2, 12) + >>> output = fold(input) + >>> output.size() + torch.Size([1, 3, 4, 5]) + + .. 
_link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + + """ + ) + + __constants__ = ["output_size", "kernel_size", "dilation", "padding", "stride"] + output_size: _size_any_t + kernel_size: _size_any_t + dilation: _size_any_t + padding: _size_any_t + stride: _size_any_t + + def __init__( + self, + output_size: _size_any_t, + kernel_size: _size_any_t, + dilation: _size_any_t = 1, + padding: _size_any_t = 0, + stride: _size_any_t = 1, + ) -> None: + super().__init__() + self.output_size = output_size + self.kernel_size = kernel_size + self.dilation = dilation + self.padding = padding + self.stride = stride + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.fold( + input, + self.output_size, + self.kernel_size, + self.dilation, + self.padding, + self.stride, + ) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return ( + "output_size={output_size}, kernel_size={kernel_size}, " + "dilation={dilation}, padding={padding}, stride={stride}".format( + **self.__dict__ + ) + ) + + +class Unfold(Module): + ( + r"""Extracts sliding local blocks from a batched input tensor. + + Consider a batched :attr:`input` tensor of shape :math:`(N, C, *)`, + where :math:`N` is the batch dimension, :math:`C` is the channel dimension, + and :math:`*` represent arbitrary spatial dimensions. This operation flattens + each sliding :attr:`kernel_size`-sized block within the spatial dimensions + of :attr:`input` into a column (i.e., last dimension) of a 3-D :attr:`output` + tensor of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`, where + :math:`C \times \prod(\text{kernel\_size})` is the total number of values + within each block (a block has :math:`\prod(\text{kernel\_size})` spatial + locations each containing a :math:`C`-channeled vector), and :math:`L` is + the total number of such blocks: + + .. 
math:: + L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] % + - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor, + + where :math:`\text{spatial\_size}` is formed by the spatial dimensions + of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial + dimensions. + + Therefore, indexing :attr:`output` at the last dimension (column dimension) + gives all values within a certain block. + + The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify + how the sliding blocks are retrieved. + + * :attr:`stride` controls the stride for the sliding blocks. + + * :attr:`padding` controls the amount of implicit zero-paddings on both + sides for :attr:`padding` number of points for each dimension before + reshaping. +""" + """ + * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. +""" + r""" + Args: + kernel_size (int or tuple): the size of the sliding blocks + dilation (int or tuple, optional): a parameter that controls the + stride of elements within the + neighborhood. Default: 1 + padding (int or tuple, optional): implicit zero padding to be added on + both sides of input. Default: 0 + stride (int or tuple, optional): the stride of the sliding blocks in the input + spatial dimensions. Default: 1 + + * If :attr:`kernel_size`, :attr:`dilation`, :attr:`padding` or + :attr:`stride` is an int or a tuple of length 1, their values will be + replicated across all spatial dimensions. + + * For the case of two input spatial dimensions this operation is sometimes + called ``im2col``. + + .. note:: + :class:`~torch.nn.Fold` calculates each combined value in the resulting + large tensor by summing all values from all containing blocks. 
+ :class:`~torch.nn.Unfold` extracts the values in the local blocks by + copying from the large tensor. So, if the blocks overlap, they are not + inverses of each other. + + In general, folding and unfolding operations are related as + follows. Consider :class:`~torch.nn.Fold` and + :class:`~torch.nn.Unfold` instances created with the same + parameters: + + >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...) + >>> fold = nn.Fold(output_size=..., **fold_params) + >>> unfold = nn.Unfold(**fold_params) + + Then for any (supported) ``input`` tensor the following + equality holds: + + :: + + fold(unfold(input)) == divisor * input + + where ``divisor`` is a tensor that depends only on the shape + and dtype of the ``input``: + + >>> # xdoctest: +SKIP + >>> input_ones = torch.ones(input.shape, dtype=input.dtype) + >>> divisor = fold(unfold(input_ones)) + + When the ``divisor`` tensor contains no zero elements, then + ``fold`` and ``unfold`` operations are inverses of each + other (up to constant divisor). + + .. warning:: + Currently, only 4-D input tensors (batched image-like tensors) are + supported. 
+ + Shape: + - Input: :math:`(N, C, *)` + - Output: :math:`(N, C \times \prod(\text{kernel\_size}), L)` as described above + + Examples:: + + >>> unfold = nn.Unfold(kernel_size=(2, 3)) + >>> input = torch.randn(2, 5, 3, 4) + >>> output = unfold(input) + >>> # each patch contains 30 values (2x3=6 vectors, each of 5 channels) + >>> # 4 blocks (2x3 kernels) in total in the 3x4 input + >>> output.size() + torch.Size([2, 30, 4]) + + >>> # xdoctest: +IGNORE_WANT + >>> # Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape) + >>> inp = torch.randn(1, 3, 10, 12) + >>> w = torch.randn(2, 3, 4, 5) + >>> inp_unf = torch.nn.functional.unfold(inp, (4, 5)) + >>> out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2) + >>> out = torch.nn.functional.fold(out_unf, (7, 8), (1, 1)) + >>> # or equivalently (and avoiding a copy), + >>> # out = out_unf.view(1, 2, 7, 8) + >>> (torch.nn.functional.conv2d(inp, w) - out).abs().max() + tensor(1.9073e-06) + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + + """ + ) + + __constants__ = ["kernel_size", "dilation", "padding", "stride"] + kernel_size: _size_any_t + dilation: _size_any_t + padding: _size_any_t + stride: _size_any_t + + def __init__( + self, + kernel_size: _size_any_t, + dilation: _size_any_t = 1, + padding: _size_any_t = 0, + stride: _size_any_t = 1, + ) -> None: + super().__init__() + self.kernel_size = kernel_size + self.dilation = dilation + self.padding = padding + self.stride = stride + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.unfold( + input, self.kernel_size, self.dilation, self.padding, self.stride + ) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
+ """ + return ( + "kernel_size={kernel_size}, dilation={dilation}, padding={padding}," + " stride={stride}".format(**self.__dict__) + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/instancenorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/instancenorm.py new file mode 100644 index 0000000000000000000000000000000000000000..058ffb3ed9aa9fa9bf496c709b4f3e6c48e72178 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/instancenorm.py @@ -0,0 +1,472 @@ +# mypy: allow-untyped-defs + +import warnings + +import torch.nn.functional as F +from torch import Tensor + +from .batchnorm import _LazyNormBase, _NormBase + + +__all__ = [ + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", + "LazyInstanceNorm1d", + "LazyInstanceNorm2d", + "LazyInstanceNorm3d", +] + + +class _InstanceNorm(_NormBase): + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float = 0.1, + affine: bool = False, + track_running_stats: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + + def _check_input_dim(self, input): + raise NotImplementedError + + def _get_no_batch_dim(self): + raise NotImplementedError + + def _handle_no_batch_input(self, input): + return self._apply_instance_norm(input.unsqueeze(0)).squeeze(0) + + def _apply_instance_norm(self, input): + return F.instance_norm( + input, + self.running_mean, + self.running_var, + self.weight, + self.bias, + self.training or not self.track_running_stats, + self.momentum if self.momentum is not None else 0.0, + self.eps, + ) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) -> None: + version = local_metadata.get("version", None) + # at version 1: 
removed running_mean and running_var when + # track_running_stats=False (default) + if version is None and not self.track_running_stats: + running_stats_keys = [] + for name in ("running_mean", "running_var"): + key = prefix + name + if key in state_dict: + running_stats_keys.append(key) + if len(running_stats_keys) > 0: + error_msgs.append( + "Unexpected running stats buffer(s) {names} for {klass} " + "with track_running_stats=False. If state_dict is a " + "checkpoint saved before 0.4.0, this may be expected " + "because {klass} does not track running stats by default " + "since 0.4.0. Please remove these keys from state_dict. If " + "the running stats are actually needed, instead set " + "track_running_stats=True in {klass} to enable them. See " + "the documentation of {klass} for details.".format( + names=" and ".join(f'"{k}"' for k in running_stats_keys), + klass=self.__class__.__name__, + ) + ) + for key in running_stats_keys: + state_dict.pop(key) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def forward(self, input: Tensor) -> Tensor: + self._check_input_dim(input) + + feature_dim = input.dim() - self._get_no_batch_dim() + if input.size(feature_dim) != self.num_features: + if self.affine: + raise ValueError( + f"expected input's size at dim={feature_dim} to match num_features" + f" ({self.num_features}), but got: {input.size(feature_dim)}." + ) + else: + warnings.warn( + f"input's size at dim={feature_dim} does not match num_features. " + "You can silence this warning by not passing in num_features, " + "which is not used because affine=False", + stacklevel=2, + ) + + if input.dim() == self._get_no_batch_dim(): + return self._handle_no_batch_input(input) + + return self._apply_instance_norm(input) + + +class InstanceNorm1d(_InstanceNorm): + r"""Applies Instance Normalization. 
+ + This operation applies Instance Normalization + over a 2D (unbatched) or 3D (batched) input as described in the paper + `Instance Normalization: The Missing Ingredient for Fast Stylization + `__. + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension separately + for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the number of features or channels of the input) if :attr:`affine` is ``True``. + The variance is calculated via the biased estimator, equivalent to + `torch.var(input, correction=0)`. + + By default, this layer uses instance statistics computed from input data in + both training and evaluation modes. + + If :attr:`track_running_stats` is set to ``True``, during training this + layer keeps running estimates of its computed mean and variance, which are + then used for normalization during evaluation. The running estimates are + kept with a default :attr:`momentum` of 0.1. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + .. note:: + :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm1d` is applied + on each channel of channeled data like multidimensional time series, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionally, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm1d` usually don't apply affine + transform. 
+ + Args: + num_features: number of features or channels :math:`C` of the input + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. Default: ``False`` + + Shape: + - Input: :math:`(N, C, L)` or :math:`(C, L)` + - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input) + + Examples:: + + >>> # Without Learnable Parameters + >>> m = nn.InstanceNorm1d(100) + >>> # With Learnable Parameters + >>> m = nn.InstanceNorm1d(100, affine=True) + >>> input = torch.randn(20, 100, 40) + >>> output = m(input) + """ + + def _get_no_batch_dim(self) -> int: + return 2 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (2, 3): + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +class LazyInstanceNorm1d(_LazyNormBase, _InstanceNorm): + r"""A :class:`torch.nn.InstanceNorm1d` module with lazy initialization of the ``num_features`` argument. + + The ``num_features`` argument of the :class:`InstanceNorm1d` is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, L)` or :math:`(C, L)` + eps: a value added to the denominator for numerical stability. 
Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. Default: ``False`` + + Shape: + - Input: :math:`(N, C, L)` or :math:`(C, L)` + - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input) + """ + + cls_to_become = InstanceNorm1d # type: ignore[assignment] + + def _get_no_batch_dim(self) -> int: + return 2 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (2, 3): + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +class InstanceNorm2d(_InstanceNorm): + r"""Applies Instance Normalization. + + This operation applies Instance Normalization + over a 4D input (a mini-batch of 2D inputs + with additional channel dimension) as described in the paper + `Instance Normalization: The Missing Ingredient for Fast Stylization + `__. + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension separately + for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the input size) if :attr:`affine` is ``True``. + The standard-deviation is calculated via the biased estimator, equivalent to + `torch.var(input, correction=0)`. + + By default, this layer uses instance statistics computed from input data in + both training and evaluation modes. 
+ + If :attr:`track_running_stats` is set to ``True``, during training this + layer keeps running estimates of its computed mean and variance, which are + then used for normalization during evaluation. The running estimates are + kept with a default :attr:`momentum` of 0.1. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + .. note:: + :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm2d` is applied + on each channel of channeled data like RGB images, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionally, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm2d` usually don't apply affine + transform. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, H, W)` or :math:`(C, H, W)` + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. 
Default: ``False`` + + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)` + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) + + Examples:: + + >>> # Without Learnable Parameters + >>> m = nn.InstanceNorm2d(100) + >>> # With Learnable Parameters + >>> m = nn.InstanceNorm2d(100, affine=True) + >>> input = torch.randn(20, 100, 35, 45) + >>> output = m(input) + """ + + def _get_no_batch_dim(self) -> int: + return 3 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (3, 4): + raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)") + + +class LazyInstanceNorm2d(_LazyNormBase, _InstanceNorm): + r"""A :class:`torch.nn.InstanceNorm2d` module with lazy initialization of the ``num_features`` argument. + + The ``num_features`` argument of the :class:`InstanceNorm2d` is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, H, W)` or :math:`(C, H, W)` + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. 
Default: ``False`` + + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)` + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) + """ + + cls_to_become = InstanceNorm2d # type: ignore[assignment] + + def _get_no_batch_dim(self) -> int: + return 3 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (3, 4): + raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)") + + +class InstanceNorm3d(_InstanceNorm): + r"""Applies Instance Normalization. + + This operation applies Instance Normalization + over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper + `Instance Normalization: The Missing Ingredient for Fast Stylization + `__. + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension separately + for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size C (where C is the input size) if :attr:`affine` is ``True``. + The standard-deviation is calculated via the biased estimator, equivalent to + `torch.var(input, correction=0)`. + + By default, this layer uses instance statistics computed from input data in + both training and evaluation modes. + + If :attr:`track_running_stats` is set to ``True``, during training this + layer keeps running estimates of its computed mean and variance, which are + then used for normalization during evaluation. The running estimates are + kept with a default :attr:`momentum` of 0.1. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. 
+ + .. note:: + :class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm3d` is applied + on each channel of channeled data like 3D models with RGB color, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionally, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm3d` usually don't apply affine + transform. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. Default: ``False`` + + Shape: + - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` + - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input) + + Examples:: + + >>> # Without Learnable Parameters + >>> m = nn.InstanceNorm3d(100) + >>> # With Learnable Parameters + >>> m = nn.InstanceNorm3d(100, affine=True) + >>> input = torch.randn(20, 100, 35, 45, 10) + >>> output = m(input) + """ + + def _get_no_batch_dim(self) -> int: + return 4 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (4, 5): + raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)") + + +class LazyInstanceNorm3d(_LazyNormBase, _InstanceNorm): + r"""A :class:`torch.nn.InstanceNorm3d` module with lazy initialization of the ``num_features`` argument. 
+ + The ``num_features`` argument of the :class:`InstanceNorm3d` is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. 
Default: ``False`` + + Shape: + - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` + - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input) + """ + + cls_to_become = InstanceNorm3d # type: ignore[assignment] + + def _get_no_batch_dim(self) -> int: + return 4 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (4, 5): + raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/lazy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/lazy.py new file mode 100644 index 0000000000000000000000000000000000000000..72d90d1c10364ea380b1f27069dd69dda6ec80cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/lazy.py @@ -0,0 +1,278 @@ +# mypy: allow-untyped-defs +import itertools +from typing import Any, Protocol + +import torch +from torch.nn.parameter import is_lazy + + +__all__ = ["LazyModuleMixin"] + + +class _LazyProtocol(Protocol): + """This class is used to avoid errors with mypy checks for the attributes in a mixin. + + https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes + """ + + def _register_load_state_dict_pre_hook(self, hook): ... + + def register_forward_pre_hook(self, hook, *, prepend=False, with_kwargs=False): ... + + def _lazy_load_hook( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): ... + + def _get_name(self): ... + + def _infer_parameters(self, module, input): ... + + @property + def _parameters(self): ... + + @property + def _buffers(self): ... + + @property + def _non_persistent_buffers_set(self): ... + + @property + def _load_hook(self): ... + + @property + def _initialize_hook(self): ... + + +class LazyModuleMixin: + r"""A mixin for modules that lazily initialize parameters, also known as "lazy modules". + + .. 
warning: + Lazy modules are an experimental new feature under active development, + and their API is likely to change. + + Modules that lazily initialize parameters, or "lazy modules", + derive the shapes of their parameters from the first input(s) + to their forward method. Until that first forward they contain + :class:`torch.nn.UninitializedParameter` s that should not be accessed + or used, and afterward they contain regular :class:`torch.nn.Parameter` s. + Lazy modules are convenient since they don't require computing some + module arguments, like the :attr:`in_features` argument of a + typical :class:`torch.nn.Linear`. + + After construction, networks with lazy modules should first + be converted to the desired dtype and placed on the expected device. + This is because lazy modules only perform shape inference so the usual dtype + and device placement behavior applies. + The lazy modules should then perform "dry runs" to initialize all the components in the module. + These "dry runs" send inputs of the correct size, dtype, and device through + the network and to each one of its lazy modules. After this the network can be used as usual. + + >>> # xdoctest: +SKIP + >>> class LazyMLP(torch.nn.Module): + ... def __init__(self) -> None: + ... super().__init__() + ... self.fc1 = torch.nn.LazyLinear(10) + ... self.relu1 = torch.nn.ReLU() + ... self.fc2 = torch.nn.LazyLinear(1) + ... self.relu2 = torch.nn.ReLU() + ... + ... def forward(self, input): + ... x = self.relu1(self.fc1(input)) + ... y = self.relu2(self.fc2(x)) + ... 
return y + >>> # constructs a network with lazy modules + >>> lazy_mlp = LazyMLP() + >>> # transforms the network's device and dtype + >>> # NOTE: these transforms can and should be applied after construction and before any 'dry runs' + >>> lazy_mlp = lazy_mlp.cuda() + >>> lazy_mlp + LazyMLP( (fc1): LazyLinear(in_features=0, out_features=10, bias=True) + (relu1): ReLU() + (fc2): LazyLinear(in_features=0, out_features=1, bias=True) + (relu2): ReLU() + ) + >>> # performs a dry run to initialize the network's lazy modules + >>> lazy_mlp(torch.ones(10, 10).cuda()) + >>> # after initialization, LazyLinear modules become regular Linear modules + >>> lazy_mlp + LazyMLP( + (fc1): Linear(in_features=10, out_features=10, bias=True) + (relu1): ReLU() + (fc2): Linear(in_features=10, out_features=1, bias=True) + (relu2): ReLU() + ) + >>> # attaches an optimizer, since parameters can now be used as usual + >>> optim = torch.optim.SGD(lazy_mlp.parameters(), lr=0.01) + + A final caveat when using lazy modules is that the order of initialization of a network's + parameters may change, since the lazy modules are always initialized after other modules. + For example, if the LazyMLP class defined above had a :class:`torch.nn.LazyLinear` module + first and then a regular :class:`torch.nn.Linear` second, the second module would be + initialized on construction and the first module would be initialized during the first dry run. + This can cause the parameters of a network using lazy modules to be initialized differently + than the parameters of a network without lazy modules as the order of parameter initializations, + which often depends on a stateful random number generator, is different. + Check :doc:`/notes/randomness` for more details. + + Lazy modules can be serialized with a state dict like other modules. 
For example: + + >>> lazy_mlp = LazyMLP() + >>> # The state dict shows the uninitialized parameters + >>> lazy_mlp.state_dict() + OrderedDict({'fc1.weight': , + 'fc1.bias': , + 'fc2.weight': , + 'fc2.bias': }) + + Lazy modules can load regular :class:`torch.nn.Parameter` s (i.e. you can serialize/deserialize + initialized LazyModules and they will remain initialized) + + + >>> full_mlp = LazyMLP() + >>> # Dry run to initialize another module + >>> full_mlp.forward(torch.ones(10, 1)) + >>> # Load an initialized state into a lazy module + >>> lazy_mlp.load_state_dict(full_mlp.state_dict()) + >>> # The state dict now holds valid values + >>> lazy_mlp.state_dict() + OrderedDict([('fc1.weight', + tensor([[-0.3837], + [ 0.0907], + [ 0.6708], + [-0.5223], + [-0.9028], + [ 0.2851], + [-0.4537], + [ 0.6813], + [ 0.5766], + [-0.8678]])), + ('fc1.bias', + tensor([-1.8832e+25, 4.5636e-41, -1.8832e+25, 4.5636e-41, -6.1598e-30, + 4.5637e-41, -1.8788e+22, 4.5636e-41, -2.0042e-31, 4.5637e-41])), + ('fc2.weight', + tensor([[ 0.1320, 0.2938, 0.0679, 0.2793, 0.1088, -0.1795, -0.2301, 0.2807, + 0.2479, 0.1091]])), + ('fc2.bias', tensor([0.0019]))]) + + Note, however, that the loaded parameters will not be replaced when doing a "dry run" if they are initialized + when the state is loaded. This prevents using initialized modules in different contexts. 
+ """ + + # modules inheriting from this will change their __class__ to the specified + # one after they are fully initialized + cls_to_become: type[Any] | None = None + + def __init__(self: _LazyProtocol, *args, **kwargs): + # Mypy doesn't like this super call in a mixin + super().__init__(*args, **kwargs) # type: ignore[misc] + # pyrefly: ignore [read-only] + self._load_hook = self._register_load_state_dict_pre_hook(self._lazy_load_hook) + # pyrefly: ignore [read-only] + self._initialize_hook = self.register_forward_pre_hook( + self._infer_parameters, with_kwargs=True + ) + + def _save_to_state_dict(self: _LazyProtocol, destination, prefix, keep_vars): + # This should be ideally implemented as a hook, + # but we should override `detach` in the UninitializedParameter to return itself + # which is not clean + for name, param in self._parameters.items(): + if param is not None: + if not (is_lazy(param) or keep_vars): + param = param.detach() + destination[prefix + name] = param + for name, buf in self._buffers.items(): + if buf is not None and name not in self._non_persistent_buffers_set: + if not (is_lazy(buf) or keep_vars): + buf = buf.detach() + destination[prefix + name] = buf + + def _lazy_load_hook( + self: _LazyProtocol, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + """load_state_dict pre-hook function for lazy buffers and parameters. + + The purpose of this hook is to adjust the current state and/or + ``state_dict`` being loaded so that a module instance serialized in + both un/initialized state can be deserialized onto both un/initialized + module instance. + See comment in ``torch.nn.Module._register_load_state_dict_pre_hook`` + for the details of the hook specification. 
+ """ + for name, param in itertools.chain( + self._parameters.items(), self._buffers.items() + ): + key = prefix + name + if key in state_dict and param is not None: + input_param = state_dict[key] + if is_lazy(param): + # The current parameter is not initialized but the one being loaded one is + # create a new parameter based on the uninitialized one + if not is_lazy(input_param): + with torch.no_grad(): + param.materialize(input_param.shape) + + def initialize_parameters(self: _LazyProtocol, *args, **kwargs): + r"""Initialize parameters according to the input batch properties. + + This adds an interface to isolate parameter initialization from the + forward pass when doing parameter shape inference. + """ + raise NotImplementedError( + f"initialize_parameters is not implemented for {self.__class__.__name__}" + ) + + def has_uninitialized_params(self: _LazyProtocol): + r"""Check if a module has parameters that are not initialized.""" + # This is to avoid the JIT to track this parameter and force + # custom modules __setstate__ to add it + params = self._parameters.values() + buffers = self._buffers.values() + for param in itertools.chain(params, buffers): + if is_lazy(param): + return True + return False + + # torchrec tests the code consistency with the following code + # fmt: off + def _infer_parameters(self: _LazyProtocol, module, args, kwargs=None): + r"""Infers the size and initializes the parameters according to the provided input batch. + + Given a module that contains parameters that were declared inferable + using :class:`torch.nn.parameter.ParameterMode.Infer`, runs a forward pass + in the complete module using the provided input to initialize all the parameters + as needed. 
+ The module is set into evaluation mode before running the forward pass in order + to avoid saving statistics or calculating gradients + """ + kwargs = kwargs if kwargs else {} + module.initialize_parameters(*args, **kwargs) + if module.has_uninitialized_params(): + raise RuntimeError(f'module {self._get_name()} has not been fully initialized') + module._initialize_hook.remove() + module._load_hook.remove() + delattr(module, '_initialize_hook') + delattr(module, '_load_hook') + if module.cls_to_become is not None: + module.__class__ = module.cls_to_become + # fmt: on + + def _replicate_for_data_parallel(self: _LazyProtocol): + raise RuntimeError( + "Modules with uninitialized parameters can't be used with `DataParallel`. " + "Run a dummy forward pass to correctly initialize the modules" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..c58bdcefd0e0a9212d44891d6ade694e55c5f529 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py @@ -0,0 +1,337 @@ +# mypy: allow-untyped-defs +import math +from typing import Any + +import torch +from torch import Tensor +from torch.nn import functional as F, init +from torch.nn.parameter import Parameter, UninitializedParameter + +from .lazy import LazyModuleMixin +from .module import Module + + +__all__ = [ + "Bilinear", + "Identity", + "LazyLinear", + "Linear", +] + + +class Identity(Module): + r"""A placeholder identity operator that is argument-insensitive. + + Args: + args: any argument (unused) + kwargs: any keyword argument (unused) + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. 
+ + Examples:: + + >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 20]) + + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__() + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return input + + +class Linear(Module): + r"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`. + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + in_features: size of each input sample + out_features: size of each output sample + bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` + + Shape: + - Input: :math:`(*, H_\text{in})` where :math:`*` means any number of + dimensions including none and :math:`H_\text{in} = \text{in\_features}`. + - Output: :math:`(*, H_\text{out})` where all but the last dimension + are the same shape as the input and :math:`H_\text{out} = \text{out\_features}`. + + Attributes: + weight: the learnable weights of the module of shape + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. 
+ If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{1}{\text{in\_features}}` + + Examples:: + + >>> m = nn.Linear(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + + __constants__ = ["in_features", "out_features"] + in_features: int + out_features: int + weight: Tensor + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = Parameter( + torch.empty((out_features, in_features), **factory_kwargs) + ) + if bias: + self.bias = Parameter(torch.empty(out_features, **factory_kwargs)) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with + # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see + # https://github.com/pytorch/pytorch/issues/57109 + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(self.bias, -bound, bound) + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.linear(input, self.weight, self.bias) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}" + + +# This class exists solely to avoid triggering an obscure error when scripting +# an improperly quantized attention layer. 
See this issue for details: +# https://github.com/pytorch/pytorch/issues/58969 +# TODO: fail fast on quantization API usage error, then remove this class +# and replace uses of it with plain Linear +class NonDynamicallyQuantizableLinear(Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + super().__init__( + in_features, out_features, bias=bias, device=device, dtype=dtype + ) + + +class Bilinear(Module): + r"""Applies a bilinear transformation to the incoming data: :math:`y = x_1^T A x_2 + b`. + + Args: + in1_features: size of each first input sample, must be > 0 + in2_features: size of each second input sample, must be > 0 + out_features: size of each output sample, must be > 0 + bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` + + Shape: + - Input1: :math:`(*, H_\text{in1})` where :math:`H_\text{in1}=\text{in1\_features}` and + :math:`*` means any number of additional dimensions including none. All but the last dimension + of the inputs should be the same. + - Input2: :math:`(*, H_\text{in2})` where :math:`H_\text{in2}=\text{in2\_features}`. + - Output: :math:`(*, H_\text{out})` where :math:`H_\text{out}=\text{out\_features}` + and all but the last dimension are the same shape as the input. + + Attributes: + weight: the learnable weights of the module of shape + :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`. + The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in1\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. 
+ If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in1\_features}}` + + Examples:: + + >>> m = nn.Bilinear(20, 30, 40) + >>> input1 = torch.randn(128, 20) + >>> input2 = torch.randn(128, 30) + >>> output = m(input1, input2) + >>> print(output.size()) + torch.Size([128, 40]) + """ + + __constants__ = ["in1_features", "in2_features", "out_features"] + in1_features: int + in2_features: int + out_features: int + weight: Tensor + + def __init__( + self, + in1_features: int, + in2_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in1_features = in1_features + self.in2_features = in2_features + self.out_features = out_features + self.weight = Parameter( + torch.empty((out_features, in1_features, in2_features), **factory_kwargs) + ) + + if bias: + self.bias = Parameter(torch.empty(out_features, **factory_kwargs)) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + if self.in1_features <= 0: + raise ValueError( + f"in1_features must be > 0, but got (in1_features={self.in1_features})" + ) + bound = 1 / math.sqrt(self.weight.size(1)) + init.uniform_(self.weight, -bound, bound) + if self.bias is not None: + init.uniform_(self.bias, -bound, bound) + + def forward(self, input1: Tensor, input2: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.bilinear(input1, input2, self.weight, self.bias) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
+ """ + return ( + f"in1_features={self.in1_features}, in2_features={self.in2_features}, " + f"out_features={self.out_features}, bias={self.bias is not None}" + ) + + +class LazyLinear(LazyModuleMixin, Linear): + r"""A :class:`torch.nn.Linear` module where `in_features` is inferred. + + In this module, the `weight` and `bias` are of :class:`torch.nn.UninitializedParameter` + class. They will be initialized after the first call to ``forward`` is done and the + module will become a regular :class:`torch.nn.Linear` module. The ``in_features`` argument + of the :class:`Linear` is inferred from the ``input.shape[-1]``. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_features: size of each output sample + bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` + + Attributes: + weight: the learnable weights of the module of shape + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. + If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{1}{\text{in\_features}}` + + + """ + + cls_to_become = Linear # type: ignore[assignment] + # pyrefly: ignore [bad-override] + weight: UninitializedParameter + bias: UninitializedParameter # type: ignore[assignment] + + def __init__( + self, out_features: int, bias: bool = True, device=None, dtype=None + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. 
+ # pyrefly: ignore [bad-argument-type] + super().__init__(0, 0, False) + # pyrefly: ignore [bad-argument-type] + self.weight = UninitializedParameter(**factory_kwargs) + self.out_features = out_features + if bias: + # pyrefly: ignore [bad-argument-type] + self.bias = UninitializedParameter(**factory_kwargs) + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + # pyrefly: ignore [bad-argument-type] + if not self.has_uninitialized_params() and self.in_features != 0: + super().reset_parameters() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + """ + Infers ``in_features`` based on ``input`` and initializes parameters. + """ + # pyrefly: ignore [bad-argument-type] + if self.has_uninitialized_params(): + with torch.no_grad(): + self.in_features = input.shape[-1] + self.weight.materialize((self.out_features, self.in_features)) + if self.bias is not None: + self.bias.materialize((self.out_features,)) + self.reset_parameters() + if self.in_features == 0: + assert input.shape[-1] == self.weight.shape[-1], ( + f"The in_features inferred from input: {input.shape[-1]} " + f"is not equal to in_features from self.weight: " + f"{self.weight.shape[-1]}" + ) + self.in_features = input.shape[-1] + + +# TODO: PartialLinear - maybe in sparse? 
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/loss.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..00ada62febded14af25c6a32ec8c1e5998349d74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/loss.py @@ -0,0 +1,2083 @@ +# mypy: allow-untyped-defs +from collections.abc import Callable +from typing_extensions import deprecated + +from torch import Tensor +from torch.nn import _reduction as _Reduction, functional as F + +from .distance import PairwiseDistance +from .module import Module + + +__all__ = [ + "L1Loss", + "NLLLoss", + "NLLLoss2d", + "PoissonNLLLoss", + "GaussianNLLLoss", + "KLDivLoss", + "MSELoss", + "BCELoss", + "BCEWithLogitsLoss", + "HingeEmbeddingLoss", + "MultiLabelMarginLoss", + "SmoothL1Loss", + "HuberLoss", + "SoftMarginLoss", + "CrossEntropyLoss", + "MultiLabelSoftMarginLoss", + "CosineEmbeddingLoss", + "MarginRankingLoss", + "MultiMarginLoss", + "TripletMarginLoss", + "TripletMarginWithDistanceLoss", + "CTCLoss", +] + + +class _Loss(Module): + reduction: str + + def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None: + super().__init__() + if size_average is not None or reduce is not None: + self.reduction: str = _Reduction.legacy_get_string(size_average, reduce) + else: + self.reduction = reduction + + +class _WeightedLoss(_Loss): + def __init__( + self, + weight: Tensor | None = None, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(size_average, reduce, reduction) + self.register_buffer("weight", weight) + self.weight: Tensor | None + + +class L1Loss(_Loss): + r"""Creates a criterion that measures the mean absolute error (MAE) between each element in + the input :math:`x` and target :math:`y`. + + The unreduced (i.e. 
with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = \left| x_n - y_n \right|, + + where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + :math:`x` and :math:`y` are tensors of arbitrary shapes with a total + of :math:`N` elements each. + + The sum operation still operates over all the elements, and divides by :math:`N`. + + The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``. + + Supports real-valued and complex-valued inputs. + + Args: + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. 
Default: ``'mean'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar. If :attr:`reduction` is ``'none'``, then + :math:`(*)`, same shape as the input. + + Examples: + + >>> loss = nn.L1Loss() + >>> input = torch.randn(3, 5, requires_grad=True) + >>> target = torch.randn(3, 5) + >>> output = loss(input, target) + >>> output.backward() + """ + + __constants__ = ["reduction"] + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.l1_loss(input, target, reduction=self.reduction) + + +class NLLLoss(_WeightedLoss): + r"""The negative log likelihood loss. It is useful to train a classification + problem with `C` classes. + + If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning + weight to each of the classes. This is particularly useful when you have an + unbalanced training set. + + The `input` given through a forward call is expected to contain + log-probabilities of each class. `input` has to be a Tensor of size either + :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` + with :math:`K \geq 1` for the `K`-dimensional case. The latter is useful for + higher dimension inputs, such as computing NLL loss per-pixel for 2D images. + + Obtaining log-probabilities in a neural network is easily achieved by + adding a `LogSoftmax` layer in the last layer of your network. + You may use `CrossEntropyLoss` instead, if you prefer not to add an extra + layer. + + The `target` that this loss expects should be a class index in the range :math:`[0, C-1]` + where `C = number of classes`; if `ignore_index` is specified, this loss also accepts + this class index (this index may not necessarily be in the class range). + + The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. 
math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \\ + l_n = - w_{y_n} x_{n,y_n}, \\ + w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, + + where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and + :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then + + .. math:: + \ell(x, y) = \begin{cases} + \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & + \text{if reduction} = \text{`mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{`sum'.} + \end{cases} + + Args: + weight (Tensor, optional): a manual rescaling weight given to each + class. If given, it has to be a Tensor of size `C`. Otherwise, it is + treated as if having all ones. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``None`` + ignore_index (int, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. When + :attr:`size_average` is ``True``, the loss is averaged over + non-ignored targets. + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``None`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will + be applied, ``'mean'``: the weighted mean of the output is taken, + ``'sum'``: the output will be summed. 
Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in + the meantime, specifying either of those two args will override + :attr:`reduction`. Default: ``'mean'`` + + Shape:: + - Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, `N = batch size`, or + :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` + in the case of `K`-dimensional loss. + - Target: :math:`(N)` or :math:`()`, where each value is + :math:`0 \leq \text{targets}[i] \leq C-1`, or + :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of + K-dimensional loss. + - Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or + :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss. + Otherwise, scalar. + + Examples: + + >>> log_softmax = nn.LogSoftmax(dim=1) + >>> loss_fn = nn.NLLLoss() + >>> # input to NLLLoss is of size N x C = 3 x 5 + >>> input = torch.randn(3, 5, requires_grad=True) + >>> # each element in target must have 0 <= value < C + >>> target = torch.tensor([1, 0, 4]) + >>> loss = loss_fn(log_softmax(input), target) + >>> loss.backward() + >>> + >>> + >>> # 2D loss example (used, for example, with image inputs) + >>> N, C = 5, 4 + >>> loss_fn = nn.NLLLoss() + >>> data = torch.randn(N, 16, 10, 10) + >>> conv = nn.Conv2d(16, C, (3, 3)) + >>> log_softmax = nn.LogSoftmax(dim=1) + >>> # output of conv forward is of shape [N, C, 8, 8] + >>> output = log_softmax(conv(data)) + >>> # each element in target must have 0 <= value < C + >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + >>> # input to NLLLoss is of size N x C x height (8) x width (8) + >>> loss = loss_fn(output, target) + >>> loss.backward() + """ + + __constants__ = ["ignore_index", "reduction"] + ignore_index: int + + def __init__( + self, + weight: Tensor | None = None, + size_average=None, + ignore_index: int = -100, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(weight, 
size_average, reduce, reduction) + self.ignore_index = ignore_index + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.nll_loss( + input, + target, + weight=self.weight, + ignore_index=self.ignore_index, + reduction=self.reduction, + ) + + +@deprecated( + "`NLLLoss2d` has been deprecated. " + "Please use `NLLLoss` instead as a drop-in replacement and see " + "https://pytorch.org/docs/main/nn.html#torch.nn.NLLLoss for more details.", + category=FutureWarning, +) +class NLLLoss2d(NLLLoss): + def __init__( + self, + weight: Tensor | None = None, + size_average=None, + ignore_index: int = -100, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(weight, size_average, ignore_index, reduce, reduction) + + +class PoissonNLLLoss(_Loss): + r"""Negative log likelihood loss with Poisson distribution of target. + + The loss can be described as: + + .. math:: + \text{target} \sim \mathrm{Poisson}(\text{input}) + + \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input}) + + \log(\text{target!}) + + The last term can be omitted or approximated with Stirling formula. The + approximation is used for target values more than 1. For targets less or + equal to 1 zeros are added to the loss. + + Args: + log_input (bool, optional): if ``True`` the loss is computed as + :math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is + :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`. + full (bool, optional): whether to compute full loss, i. e. to add the + Stirling approximation term + + .. math:: + \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}). + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. 
If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when + :attr:`log_input = False`. Default: 1e-8 + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Examples: + + >>> loss = nn.PoissonNLLLoss() + >>> log_input = torch.randn(5, 2, requires_grad=True) + >>> target = torch.randn(5, 2) + >>> output = loss(log_input, target) + >>> output.backward() + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(*)`, + the same shape as the input. 
+ """ + + __constants__ = ["log_input", "full", "eps", "reduction"] + log_input: bool + full: bool + eps: float + + def __init__( + self, + log_input: bool = True, + full: bool = False, + size_average=None, + eps: float = 1e-8, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(size_average, reduce, reduction) + self.log_input = log_input + self.full = full + self.eps = eps + + def forward(self, log_input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.poisson_nll_loss( + log_input, + target, + log_input=self.log_input, + full=self.full, + eps=self.eps, + reduction=self.reduction, + ) + + +class GaussianNLLLoss(_Loss): + r"""Gaussian negative log likelihood loss. + + The targets are treated as samples from Gaussian distributions with + expectations and variances predicted by the neural network. For a + ``target`` tensor modelled as having Gaussian distribution with a tensor + of expectations ``input`` and a tensor of positive variances ``var`` the loss is: + + .. math:: + \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var}, + \ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{target}\right)^2} + {\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.} + + where :attr:`eps` is used for stability. By default, the constant term of + the loss function is omitted unless :attr:`full` is ``True``. If ``var`` is not the same + size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension + of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting. + + Args: + full (bool, optional): include the constant term in the loss + calculation. Default: ``False``. + eps (float, optional): value used to clamp ``var`` (see note below), for + stability. Default: 1e-6. + reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. 
``'none'``: no reduction + will be applied, ``'mean'``: the output is the average of all batch + member losses, ``'sum'``: the output is the sum of all batch member + losses. Default: ``'mean'``. + + Shape: + - Input: :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional + dimensions + - Target: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input + but with one dimension equal to 1 (to allow for broadcasting) + - Var: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but + with one dimension equal to 1, or same shape as the input but with one fewer + dimension (to allow for broadcasting), or a scalar value + - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or + ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same + shape as the input + + Examples: + >>> loss = nn.GaussianNLLLoss() + >>> input = torch.randn(5, 2, requires_grad=True) + >>> target = torch.randn(5, 2) + >>> var = torch.ones(5, 2, requires_grad=True) # heteroscedastic + >>> output = loss(input, target, var) + >>> output.backward() + + >>> loss = nn.GaussianNLLLoss() + >>> input = torch.randn(5, 2, requires_grad=True) + >>> target = torch.randn(5, 2) + >>> var = torch.ones(5, 1, requires_grad=True) # homoscedastic + >>> output = loss(input, target, var) + >>> output.backward() + + Note: + The clamping of ``var`` is ignored with respect to autograd, and so the + gradients are unaffected by it. + + Reference: + Nix, D. A. and Weigend, A. S., "Estimating the mean and variance of the + target probability distribution", Proceedings of 1994 IEEE International + Conference on Neural Networks (ICNN'94), Orlando, FL, USA, 1994, pp. 55-60 + vol.1, doi: 10.1109/ICNN.1994.374138. 
+ """ + + __constants__ = ["full", "eps", "reduction"] + full: bool + eps: float + + def __init__( + self, *, full: bool = False, eps: float = 1e-6, reduction: str = "mean" + ) -> None: + super().__init__(None, None, reduction) + self.full = full + self.eps = eps + + def forward(self, input: Tensor, target: Tensor, var: Tensor | float) -> Tensor: + """ + Runs the forward pass. + """ + return F.gaussian_nll_loss( + input, target, var, full=self.full, eps=self.eps, reduction=self.reduction + ) + + +class KLDivLoss(_Loss): + r"""The Kullback-Leibler divergence loss. + + For tensors of the same shape :math:`y_{\text{pred}},\ y_{\text{true}}`, + where :math:`y_{\text{pred}}` is the :attr:`input` and :math:`y_{\text{true}}` is the + :attr:`target`, we define the **pointwise KL-divergence** as + + .. math:: + + L(y_{\text{pred}},\ y_{\text{true}}) + = y_{\text{true}} \cdot \log \frac{y_{\text{true}}}{y_{\text{pred}}} + = y_{\text{true}} \cdot (\log y_{\text{true}} - \log y_{\text{pred}}) + + To avoid underflow issues when computing this quantity, this loss expects the argument + :attr:`input` in the log-space. The argument :attr:`target` may also be provided in the + log-space if :attr:`log_target`\ `= True`. + + To summarise, this function is roughly equivalent to computing + + .. code-block:: python + + if not log_target: # default + loss_pointwise = target * (target.log() - input) + else: + loss_pointwise = target.exp() * (target - input) + + and then reducing this result depending on the argument :attr:`reduction` as + + .. code-block:: python + + if reduction == "mean": # default + loss = loss_pointwise.mean() + elif reduction == "batchmean": # mathematically correct + loss = loss_pointwise.sum() / input.size(0) + elif reduction == "sum": + loss = loss_pointwise.sum() + else: # reduction == "none" + loss = loss_pointwise + + .. note:: + As all the other losses in PyTorch, this function expects the first argument, + :attr:`input`, to be the output of the model (e.g. 
the neural network) + and the second, :attr:`target`, to be the observations in the dataset. + This differs from the standard mathematical notation :math:`KL(P\ ||\ Q)` where + :math:`P` denotes the distribution of the observations and :math:`Q` denotes the model. + + .. warning:: + :attr:`reduction`\ `= "mean"` doesn't return the true KL divergence value, please use + :attr:`reduction`\ `= "batchmean"` which aligns with the mathematical definition. + + Args: + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to `False`, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is `False`. Default: `True` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is `False`, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: `True` + reduction (str, optional): Specifies the reduction to apply to the output. Default: `"mean"` + log_target (bool, optional): Specifies whether `target` is the log space. Default: `False` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar by default. If :attr:`reduction` is `'none'`, then :math:`(*)`, + same shape as the input. + + Examples: + >>> kl_loss = nn.KLDivLoss(reduction="batchmean") + >>> # input should be a distribution in the log space + >>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1) + >>> # Sample a batch of distributions. 
Usually this would come from the dataset + >>> target = F.softmax(torch.rand(3, 5), dim=1) + >>> output = kl_loss(input, target) + >>> + >>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True) + >>> log_target = F.log_softmax(torch.rand(3, 5), dim=1) + >>> output = kl_loss(input, log_target) + """ + + __constants__ = ["reduction"] + + def __init__( + self, + size_average=None, + reduce=None, + reduction: str = "mean", + log_target: bool = False, + ) -> None: + super().__init__(size_average, reduce, reduction) + self.log_target = log_target + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.kl_div( + input, target, reduction=self.reduction, log_target=self.log_target + ) + + +class MSELoss(_Loss): + r"""Creates a criterion that measures the mean squared error (squared L2 norm) between + each element in the input :math:`x` and target :math:`y`. + + The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = \left( x_n - y_n \right)^2, + + where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + :math:`x` and :math:`y` are tensors of arbitrary shapes with a total + of :math:`N` elements each. + + The mean operation still operates over all the elements, and divides by :math:`N`. + + The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``. + + Args: + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. 
If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + + Examples: + + >>> loss = nn.MSELoss() + >>> input = torch.randn(3, 5, requires_grad=True) + >>> target = torch.randn(3, 5) + >>> output = loss(input, target) + >>> output.backward() + """ + + __constants__ = ["reduction"] + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.mse_loss(input, target, reduction=self.reduction) + + +class BCELoss(_WeightedLoss): + r"""Creates a criterion that measures the Binary Cross Entropy between the target and + the input probabilities: + + The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right], + + where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then + + .. 
math:: + \ell(x, y) = \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + This is used for measuring the error of a reconstruction in for example + an auto-encoder. Note that the targets :math:`y` should be numbers + between 0 and 1. + + Notice that if :math:`x_n` is either 0 or 1, one of the log terms would be + mathematically undefined in the above loss equation. PyTorch chooses to set + :math:`\log (0) = -\infty`, since :math:`\lim_{x\to 0} \log (x) = -\infty`. + However, an infinite term in the loss equation is not desirable for several reasons. + + For one, if either :math:`y_n = 0` or :math:`(1 - y_n) = 0`, then we would be + multiplying 0 with infinity. Secondly, if we have an infinite loss value, then + we would also have an infinite term in our gradient, since + :math:`\lim_{x\to 0} \frac{d}{dx} \log (x) = \infty`. + This would make BCELoss's backward method nonlinear with respect to :math:`x_n`, + and using it for things like linear regression would not be straight-forward. + + Our solution is that BCELoss clamps its log function outputs to be greater than + or equal to -100. This way, we can always have a finite loss value and a linear + backward method. + + + Args: + weight (Tensor, optional): a manual rescaling weight given to the loss + of each batch element. If given, has to be a Tensor of size `nbatch`. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). 
By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same + shape as input. + + Examples: + + >>> m = nn.Sigmoid() + >>> loss = nn.BCELoss() + >>> input = torch.randn(3, 2, requires_grad=True) + >>> target = torch.rand(3, 2, requires_grad=False) + >>> output = loss(m(input), target) + >>> output.backward() + """ + + __constants__ = ["reduction"] + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.binary_cross_entropy( + input, target, weight=self.weight, reduction=self.reduction + ) + + +class BCEWithLogitsLoss(_Loss): + r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single + class. This version is more numerically stable than using a plain `Sigmoid` + followed by a `BCELoss` as, by combining the operations into one layer, + we take advantage of the log-sum-exp trick for numerical stability. + + The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. 
math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_n \left[ y_n \cdot \log \sigma(x_n) + + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right], + + where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then + + .. math:: + \ell(x, y) = \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + This is used for measuring the error of a reconstruction in for example + an auto-encoder. Note that the targets `t[i]` should be numbers + between 0 and 1. + + It's possible to trade off recall and precision by adding weights to positive examples. + In the case of multi-label classification the loss can be described as: + + .. math:: + \ell_c(x, y) = L_c = \{l_{1,c},\dots,l_{N,c}\}^\top, \quad + l_{n,c} = - w_{n,c} \left[ p_c y_{n,c} \cdot \log \sigma(x_{n,c}) + + (1 - y_{n,c}) \cdot \log (1 - \sigma(x_{n,c})) \right], + + where :math:`c` is the class number (:math:`c > 1` for multi-label binary classification, + :math:`c = 1` for single-label binary classification), + :math:`n` is the number of the sample in the batch and + :math:`p_c` is the weight of the positive answer for the class :math:`c`. + + :math:`p_c > 1` increases the recall, :math:`p_c < 1` increases the precision. + + For example, if a dataset contains 100 positive and 300 negative examples of a single class, + then ``pos_weight`` for the class should be equal to :math:`\frac{300}{100}=3`. + The loss would act as if the dataset contains :math:`3\times 100=300` positive examples. + + Examples: + + >>> target = torch.ones([10, 64], dtype=torch.float32) # 64 classes, batch size = 10 + >>> output = torch.full([10, 64], 1.5) # A prediction (logit) + >>> pos_weight = torch.ones([64]) # All weights are equal to 1 + >>> criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) + >>> criterion(output, target) # -log(sigmoid(1.5)) + tensor(0.20...) 
+ + In the above example, the ``pos_weight`` tensor's elements correspond to the 64 distinct classes + in a multi-label binary classification scenario. Each element in ``pos_weight`` is designed to adjust the + loss function based on the imbalance between negative and positive samples for the respective class. + This approach is useful in datasets with varying levels of class imbalance, ensuring that the loss + calculation accurately accounts for the distribution in each class. + + Args: + weight (Tensor, optional): a manual rescaling weight given to the loss + of each batch element. If given, has to be a Tensor of size `nbatch`. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + pos_weight (Tensor, optional): a weight of positive examples to be broadcasted with target. 
+ Must be a tensor with equal size along the class dimension to the number of classes. + Pay close attention to PyTorch's broadcasting semantics in order to achieve the desired + operations. For a target of size [B, C, H, W] (where B is batch size) pos_weight of + size [B, C, H, W] will apply different pos_weights to each element of the batch or + [C, H, W] the same pos_weights across the batch. To apply the same positive weight + along all spatial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1]. + Default: ``None`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same + shape as input. + + Examples: + + >>> loss = nn.BCEWithLogitsLoss() + >>> input = torch.randn(3, requires_grad=True) + >>> target = torch.empty(3).random_(2) + >>> output = loss(input, target) + >>> output.backward() + """ + + def __init__( + self, + weight: Tensor | None = None, + size_average=None, + reduce=None, + reduction: str = "mean", + pos_weight: Tensor | None = None, + ) -> None: + super().__init__(size_average, reduce, reduction) + self.register_buffer("weight", weight) + self.register_buffer("pos_weight", pos_weight) + self.weight: Tensor | None + self.pos_weight: Tensor | None + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.binary_cross_entropy_with_logits( + input, + target, + self.weight, + pos_weight=self.pos_weight, + reduction=self.reduction, + ) + + +class HingeEmbeddingLoss(_Loss): + r"""Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y` + (containing 1 or -1). + This is usually used for measuring whether two inputs are similar or + dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically + used for learning nonlinear embeddings or semi-supervised learning. 
+ + The loss function for :math:`n`-th sample in the mini-batch is + + .. math:: + l_n = \begin{cases} + x_n, & \text{if}\; y_n = 1,\\ + \max \{0, margin - x_n\}, & \text{if}\; y_n = -1, + \end{cases} + + and the total loss functions is + + .. math:: + \ell(x, y) = \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + where :math:`L = \{l_1,\dots,l_N\}^\top`. + + Args: + margin (float, optional): Has a default value of `1`. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(*)` where :math:`*` means, any number of dimensions. The sum operation + operates over all the elements. + - Target: :math:`(*)`, same shape as the input + - Output: scalar. 
If :attr:`reduction` is ``'none'``, then same shape as the input + """ + + __constants__ = ["margin", "reduction"] + margin: float + + def __init__( + self, + margin: float = 1.0, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(size_average, reduce, reduction) + self.margin = margin + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.hinge_embedding_loss( + input, target, margin=self.margin, reduction=self.reduction + ) + + +class MultiLabelMarginLoss(_Loss): + r"""Creates a criterion that optimizes a multi-class multi-classification + hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) + and output :math:`y` (which is a 2D `Tensor` of target class indices). + For each sample in the mini-batch: + + .. math:: + \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)} + + where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \ + :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \ + :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \ + and :math:`i \neq y[j]` for all :math:`i` and :math:`j`. + + :math:`y` and :math:`x` must have the same size. + + The criterion only considers a contiguous block of non-negative targets that + starts at the front. + + This allows for different samples to have variable amounts of target classes. + + Args: + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). 
By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C` + is the number of classes. + - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input. + - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. + + Examples: + + >>> loss = nn.MultiLabelMarginLoss() + >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]]) + >>> # for target y, only consider labels 3 and 0, not after label -1 + >>> y = torch.LongTensor([[3, 0, -1, 1]]) + >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4))) + >>> loss(x, y) + tensor(0.85...) + + """ + + __constants__ = ["reduction"] + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.multilabel_margin_loss(input, target, reduction=self.reduction) + + +class SmoothL1Loss(_Loss): + r"""Creates a criterion that uses a squared term if the absolute + element-wise error falls below beta and an L1 term otherwise. + It is less sensitive to outliers than :class:`torch.nn.MSELoss` and in some cases + prevents exploding gradients (e.g. see the paper `Fast R-CNN`_ by Ross Girshick). + + For a batch of size :math:`N`, the unreduced loss can be described as: + + .. 
math:: + \ell(x, y) = L = \{l_1, ..., l_N\}^T + + with + + .. math:: + l_n = \begin{cases} + 0.5 (x_n - y_n)^2 / beta, & \text{if } |x_n - y_n| < beta \\ + |x_n - y_n| - 0.5 * beta, & \text{otherwise } + \end{cases} + + If `reduction` is not `none`, then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + .. note:: + Smooth L1 loss can be seen as exactly :class:`L1Loss`, but with the :math:`|x - y| < beta` + portion replaced with a quadratic function such that its slope is 1 at :math:`|x - y| = beta`. + The quadratic segment smooths the L1 loss near :math:`|x - y| = 0`. + + .. note:: + Smooth L1 loss is closely related to :class:`HuberLoss`, being + equivalent to :math:`huber(x, y) / beta` (note that Smooth L1's beta hyper-parameter is + also known as delta for Huber). This leads to the following differences: + + * As beta -> 0, Smooth L1 loss converges to :class:`L1Loss`, while :class:`HuberLoss` + converges to a constant 0 loss. When beta is 0, Smooth L1 loss is equivalent to L1 loss. + * As beta -> :math:`+\infty`, Smooth L1 loss converges to a constant 0 loss, while + :class:`HuberLoss` converges to :class:`MSELoss`. + * For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant slope of 1. + For :class:`HuberLoss`, the slope of the L1 segment is beta. + + .. _`Fast R-CNN`: https://arxiv.org/abs/1504.08083 + + Args: + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). 
By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + beta (float, optional): Specifies the threshold at which to change between L1 and L2 loss. + The value must be non-negative. Default: 1.0 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. + """ + + __constants__ = ["reduction"] + + def __init__( + self, size_average=None, reduce=None, reduction: str = "mean", beta: float = 1.0 + ) -> None: + super().__init__(size_average, reduce, reduction) + self.beta = beta + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta) + + +class HuberLoss(_Loss): + r"""Creates a criterion that uses a squared term if the absolute + element-wise error falls below delta and a delta-scaled L1 term otherwise. + This loss combines advantages of both :class:`L1Loss` and :class:`MSELoss`; the + delta-scaled L1 region makes the loss less sensitive to outliers than :class:`MSELoss`, + while the L2 region provides smoothness over :class:`L1Loss` near 0. See + `Huber loss `_ for more information. 
+ + For a batch of size :math:`N`, the unreduced loss can be described as: + + .. math:: + \ell(x, y) = L = \{l_1, ..., l_N\}^T + + with + + .. math:: + l_n = \begin{cases} + 0.5 (x_n - y_n)^2, & \text{if } |x_n - y_n| < delta \\ + delta * (|x_n - y_n| - 0.5 * delta), & \text{otherwise } + \end{cases} + + If `reduction` is not `none`, then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + .. note:: + When delta is set to 1, this loss is equivalent to :class:`SmoothL1Loss`. + In general, this loss differs from :class:`SmoothL1Loss` by a factor of delta (AKA beta + in Smooth L1). + See :class:`SmoothL1Loss` for additional discussion on the differences in behavior + between the two losses. + + Args: + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'`` + delta (float, optional): Specifies the threshold at which to change between delta-scaled L1 and L2 loss. + The value must be positive. Default: 1.0 + + Shape: + - Input: :math:`(*)` where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. + - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. 
+ """ + + __constants__ = ["reduction", "delta"] + + def __init__(self, reduction: str = "mean", delta: float = 1.0) -> None: + super().__init__(reduction=reduction) + self.delta = delta + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.huber_loss(input, target, reduction=self.reduction, delta=self.delta) + + +class SoftMarginLoss(_Loss): + r"""Creates a criterion that optimizes a two-class classification + logistic loss between input tensor :math:`x` and target tensor :math:`y` + (containing 1 or -1). + + .. math:: + \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()} + + Args: + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Target: :math:`(*)`, same shape as the input. 
+ - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same + shape as input. + + """ + + __constants__ = ["reduction"] + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.soft_margin_loss(input, target, reduction=self.reduction) + + +class CrossEntropyLoss(_WeightedLoss): + r"""This criterion computes the cross entropy loss between input logits + and target. + + It is useful when training a classification problem with `C` classes. + If provided, the optional argument :attr:`weight` should be a 1D `Tensor` + assigning weight to each of the classes. + This is particularly useful when you have an unbalanced training set. + + The `input` is expected to contain the unnormalized logits for each class (which do `not` need + to be positive or sum to 1, in general). + `input` has to be a Tensor of size :math:`(C)` for unbatched input, + :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` for the + `K`-dimensional case. The last being useful for higher dimension inputs, such + as computing cross entropy loss per-pixel for 2D images. + + The `target` that this criterion expects should contain either: + + - Class indices in the range :math:`[0, C)` where :math:`C` is the number of classes; if + `ignore_index` is specified, this loss also accepts this class index (this index + may not necessarily be in the class range). The unreduced (i.e. with :attr:`reduction` + set to ``'none'``) loss for this case can be described as: + + .. math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})} + \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\} + + where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, + :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as + :math:`d_1, ..., d_k` for the `K`-dimensional case. 
If + :attr:`reduction` is not ``'none'`` (default ``'mean'``), then + + .. math:: + \ell(x, y) = \begin{cases} + \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}} l_n, & + \text{if reduction} = \text{`mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{`sum'.} + \end{cases} + + Note that this case is equivalent to applying :class:`~torch.nn.LogSoftmax` + on an input, followed by :class:`~torch.nn.NLLLoss`. + + - Probabilities for each class; useful when labels beyond a single class per minibatch item + are required, such as for blended labels, label smoothing, etc. The unreduced (i.e. with + :attr:`reduction` set to ``'none'``) loss for this case can be described as: + + .. math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c} + + where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, + :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as + :math:`d_1, ..., d_k` for the `K`-dimensional case. If + :attr:`reduction` is not ``'none'`` (default ``'mean'``), then + + .. math:: + \ell(x, y) = \begin{cases} + \frac{\sum_{n=1}^N l_n}{N}, & + \text{if reduction} = \text{`mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{`sum'.} + \end{cases} + + .. note:: + The performance of this criterion is generally better when `target` contains class + indices, as this allows for optimized computation. Consider providing `target` as + class probabilities only when a single class label per minibatch item is too restrictive. + + Args: + weight (Tensor, optional): a manual rescaling weight given to each class. + If given, has to be a Tensor of size `C`. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. 
If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + ignore_index (int, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. When :attr:`size_average` is + ``True``, the loss is averaged over non-ignored targets. Note that + :attr:`ignore_index` is only applicable when the target contains class indices. + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will + be applied, ``'mean'``: the weighted mean of the output is taken, + ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in + the meantime, specifying either of those two args will override + :attr:`reduction`. Default: ``'mean'`` + label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount + of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. + + Shape: + - Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` + in the case of `K`-dimensional loss. + - Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with + :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. 
The + target data type is required to be long when using class indices. If containing class probabilities, the + target must be the same shape input, and each value should be between :math:`[0, 1]`. This means the target + data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce + probability constraints on the class probabilities and that it is the user's responsibility to ensure + ``target`` contains valid probability distributions (see below examples section for more details). + - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` + in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar. + + + where: + + .. math:: + \begin{aligned} + C ={} & \text{number of classes} \\ + N ={} & \text{batch size} \\ + \end{aligned} + + Examples: + + >>> # Example of target with class indices + >>> loss = nn.CrossEntropyLoss() + >>> input = torch.randn(3, 5, requires_grad=True) + >>> target = torch.empty(3, dtype=torch.long).random_(5) + >>> output = loss(input, target) + >>> output.backward() + >>> + >>> # Example of target with class probabilities + >>> input = torch.randn(3, 5, requires_grad=True) + >>> target = torch.randn(3, 5).softmax(dim=1) + >>> output = loss(input, target) + >>> output.backward() + + .. note:: + When ``target`` contains class probabilities, it should consist of soft labels—that is, + each ``target`` entry should represent a probability distribution over the possible classes for a given data sample, + with individual probabilities between ``[0,1]`` and the total distribution summing to 1. + This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above. + + PyTorch does not validate whether the values provided in ``target`` lie in the range ``[0,1]`` + or whether the distribution of each data sample sums to ``1``. 
+ No warning will be raised and it is the user's responsibility + to ensure that ``target`` contains valid probability distributions. + Providing arbitrary values may yield misleading loss values and unstable gradients during training. + + Examples: + >>> # xdoctest: +SKIP + >>> # Example of target with incorrectly specified class probabilities + >>> loss = nn.CrossEntropyLoss() + >>> torch.manual_seed(283) + >>> input = torch.randn(3, 5, requires_grad=True) + >>> target = torch.randn(3, 5) + >>> # Provided target class probabilities are not in range [0,1] + >>> target + tensor([[ 0.7105, 0.4446, 2.0297, 0.2671, -0.6075], + [-1.0496, -0.2753, -0.3586, 0.9270, 1.0027], + [ 0.7551, 0.1003, 1.3468, -0.3581, -0.9569]]) + >>> # Provided target class probabilities do not sum to 1 + >>> target.sum(axis=1) + tensor([2.8444, 0.2462, 0.8873]) + >>> # No error message and possible misleading loss value + >>> loss(input, target).item() + 4.6379876136779785 + >>> + >>> # Example of target with correctly specified class probabilities + >>> # Use .softmax() to ensure true probability distribution + >>> target_new = target.softmax(dim=1) + >>> # New target class probabilities all in range [0,1] + >>> target_new + tensor([[0.1559, 0.1195, 0.5830, 0.1000, 0.0417], + [0.0496, 0.1075, 0.0990, 0.3579, 0.3860], + [0.2607, 0.1355, 0.4711, 0.0856, 0.0471]]) + >>> # New target class probabilities sum to 1 + >>> target_new.sum(axis=1) + tensor([1.0000, 1.0000, 1.0000]) + >>> loss(input, target_new).item() + 2.55349063873291 + """ + + __constants__ = ["ignore_index", "reduction", "label_smoothing"] + ignore_index: int + label_smoothing: float + + def __init__( + self, + weight: Tensor | None = None, + size_average=None, + ignore_index: int = -100, + reduce=None, + reduction: str = "mean", + label_smoothing: float = 0.0, + ) -> None: + super().__init__(weight, size_average, reduce, reduction) + self.ignore_index = ignore_index + self.label_smoothing = label_smoothing + + def forward(self, 
input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.cross_entropy( + input, + target, + weight=self.weight, + ignore_index=self.ignore_index, + reduction=self.reduction, + label_smoothing=self.label_smoothing, + ) + + +class MultiLabelSoftMarginLoss(_WeightedLoss): + r"""Creates a criterion that optimizes a multi-label one-versus-all + loss based on max-entropy, between input :math:`x` and target :math:`y` of size + :math:`(N, C)`. + For each sample in the minibatch: + + .. math:: + loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1}) + + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right) + + where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`, + :math:`y[i] \in \left\{0, \; 1\right\}`. + + Args: + weight (Tensor, optional): a manual rescaling weight given to each + class. If given, it has to be a Tensor of size `C`. Otherwise, it is + treated as if having all ones. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. 
class CosineEmbeddingLoss(_Loss):
    r"""Criterion built on the cosine similarity of two inputs and a ``1``/``-1`` label.

    Takes input tensors :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y`
    whose values are 1 or -1. Use :math:`y=1` to maximize the cosine similarity of
    the two inputs and :math:`y=-1` otherwise. Typical applications are learning
    nonlinear embeddings and semi-supervised learning.

    For each sample the loss is:

    .. math::
        \text{loss}(x, y) =
        \begin{cases}
        1 - \cos(x_1, x_2), & \text{if } y = 1 \\
        \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
        \end{cases}

    Args:
        margin (float, optional): A number between :math:`-1` and :math:`1`;
            :math:`0` to :math:`0.5` is suggested. Defaults to :math:`0` when omitted.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). Losses are
            averaged over every loss element in the batch by default (some losses have
            several elements per sample); with ``size_average=False`` they are summed
            per minibatch instead. Ignored when :attr:`reduce` is ``False``.
            Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default the
            losses are averaged or summed over the observations of each minibatch,
            depending on :attr:`size_average`. With ``reduce=False`` a loss per batch
            element is returned and :attr:`size_average` is ignored. Default: ``True``
        reduction (str, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction,
            ``'mean'``: the sum of the output divided by the number of output elements,
            ``'sum'``: the output is summed. Note: :attr:`size_average` and
            :attr:`reduce` are being deprecated; until then, specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input1: :math:`(N, D)` or :math:`(D)`, with `N` the batch size and `D` the
          embedding dimension.
        - Input2: :math:`(N, D)` or :math:`(D)`, same shape as Input1.
        - Target: :math:`(N)` or :math:`()`.
        - Output: :math:`(N)` if :attr:`reduction` is ``'none'``, otherwise scalar.

    Examples:

        >>> loss = nn.CosineEmbeddingLoss()
        >>> input1 = torch.randn(3, 5, requires_grad=True)
        >>> input2 = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.ones(3)
        >>> output = loss(input1, input2, target)
        >>> output.backward()
    """

    __constants__ = ["margin", "reduction"]
    margin: float

    def __init__(
        self,
        margin: float = 0.0,
        size_average=None,
        reduce=None,
        reduction: str = "mean",
    ) -> None:
        # The base class resolves the legacy size_average/reduce flags into
        # ``self.reduction``; only the margin is specific to this criterion.
        super().__init__(size_average=size_average, reduce=reduce, reduction=reduction)
        self.margin = margin

    def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
        """Return the cosine embedding loss of ``input1`` and ``input2`` under ``target``."""
        settings = {"margin": self.margin, "reduction": self.reduction}
        return F.cosine_embedding_loss(input1, input2, target, **settings)
When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input1: :math:`(N)` or :math:`()` where `N` is the batch size. + - Input2: :math:`(N)` or :math:`()`, same shape as the Input1. + - Target: :math:`(N)` or :math:`()`, same shape as the inputs. + - Output: scalar. If :attr:`reduction` is ``'none'`` and Input size is not :math:`()`, then :math:`(N)`. + + Examples: + + >>> loss = nn.MarginRankingLoss() + >>> input1 = torch.randn(3, requires_grad=True) + >>> input2 = torch.randn(3, requires_grad=True) + >>> target = torch.randn(3).sign() + >>> output = loss(input1, input2, target) + >>> output.backward() + """ + + __constants__ = ["margin", "reduction"] + margin: float + + def __init__( + self, + margin: float = 0.0, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(size_average, reduce, reduction) + self.margin = margin + + def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.margin_ranking_loss( + input1, input2, target, margin=self.margin, reduction=self.reduction + ) + + +class MultiMarginLoss(_WeightedLoss): + r"""Creates a criterion that optimizes a multi-class classification hinge + loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and + output :math:`y` (which is a 1D tensor of target class indices, + :math:`0 \leq y \leq \text{x.size}(1)-1`): + + For 
each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar + output :math:`y` is: + + .. math:: + \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)} + + where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}` + and :math:`i \neq y`. + + Optionally, you can give non-equal weighting on the classes by passing + a 1D :attr:`weight` tensor into the constructor. + + The loss function then becomes: + + .. math:: + \text{loss}(x, y) = \frac{\sum_i w[y] * \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)} + + Args: + p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2` + are the only supported values. + margin (float, optional): Has a default value of :math:`1`. + weight (Tensor, optional): a manual rescaling weight given to each + class. If given, it has to be a Tensor of size `C`. Otherwise, it is + treated as if having all ones. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. 
Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes. + - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`. + - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target. + + Examples: + + >>> loss = nn.MultiMarginLoss() + >>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]]) + >>> y = torch.tensor([3]) + >>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4))) + >>> loss(x, y) + tensor(0.32...) + """ + + __constants__ = ["p", "margin", "reduction"] + margin: float + p: int + + def __init__( + self, + p: int = 1, + margin: float = 1.0, + weight: Tensor | None = None, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(weight, size_average, reduce, reduction) + if p != 1 and p != 2: + raise ValueError("only p == 1 and p == 2 supported") + if weight is not None and weight.dim() != 1: + raise ValueError( + f"MultiMarginLoss: expected weight to be None or 1D tensor, got {weight.dim()}D instead" + ) + self.p = p + self.margin = margin + + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.multi_margin_loss( + input, + target, + p=self.p, + margin=self.margin, + weight=self.weight, + reduction=self.reduction, + ) + + +class TripletMarginLoss(_Loss): + r"""Creates a criterion that measures the triplet loss given an input + tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`. + This is used for measuring a relative similarity between samples. A triplet + is composed by `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative + examples` respectively). 
The shapes of all input tensors should be + :math:`(N, D)`. + + The distance swap is described in detail in the paper `Learning shallow + convolutional feature descriptors with triplet losses`_ by + V. Balntas, E. Riba et al. + + The loss function for each sample in the mini-batch is: + + .. math:: + L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} + + + where + + .. math:: + d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p + + The norm is calculated using the specified p value and a small constant :math:`\varepsilon` is + added for numerical stability. + + See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the + triplet margin loss for input tensors using a custom distance function. + + Args: + margin (float, optional): Default: :math:`1`. + p (int, optional): The norm degree for pairwise distance. Default: :math:`2`. + eps (float, optional): Small constant for numerical stability. Default: :math:`1e-6`. + swap (bool, optional): The distance swap is described in detail in the paper + `Learning shallow convolutional feature descriptors with triplet losses` by + V. Balntas, E. Riba et al. Default: ``False``. + size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, + the losses are averaged over each loss element in the batch. Note that for + some losses, there are multiple elements per sample. If the field :attr:`size_average` + is set to ``False``, the losses are instead summed for each minibatch. Ignored + when :attr:`reduce` is ``False``. Default: ``True`` + reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the + losses are averaged or summed over observations for each minibatch depending + on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per + batch element instead and ignores :attr:`size_average`. Default: ``True`` + reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. 
``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` + and :attr:`reduce` are in the process of being deprecated, and in the meantime, + specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(N, D)` or :math:`(D)` where :math:`D` is the vector dimension. + - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and + input shape is :math:`(N, D)`; a scalar otherwise. + + Examples: + + >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7) + >>> anchor = torch.randn(100, 128, requires_grad=True) + >>> positive = torch.randn(100, 128, requires_grad=True) + >>> negative = torch.randn(100, 128, requires_grad=True) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + + .. _Learning shallow convolutional feature descriptors with triplet losses: + https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html + """ + + __constants__ = ["margin", "p", "eps", "swap", "reduction"] + margin: float + p: float + eps: float + swap: bool + + def __init__( + self, + margin: float = 1.0, + p: float = 2.0, + eps: float = 1e-6, + swap: bool = False, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(size_average, reduce, reduction) + if margin <= 0: + raise ValueError( + f"TripletMarginLoss: expected margin to be greater than 0, got {margin} instead" + ) + self.margin = margin + self.p = p + self.eps = eps + self.swap = swap + + def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.triplet_margin_loss( + anchor, + positive, + negative, + margin=self.margin, + p=self.p, + eps=self.eps, + swap=self.swap, + reduction=self.reduction, + ) + + +class TripletMarginWithDistanceLoss(_Loss): + r"""Creates a 
criterion that measures the triplet loss given input + tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor, + positive, and negative examples, respectively), and a nonnegative, + real-valued function ("distance function") used to compute the relationship + between the anchor and positive example ("positive distance") and the + anchor and negative example ("negative distance"). + + The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``) + can be described as: + + .. math:: + \ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad + l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} + + where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function + quantifying the closeness of two tensors, referred to as the :attr:`distance_function`; + and :math:`margin` is a nonnegative margin representing the minimum difference + between the positive and negative distances that is required for the loss to + be 0. The input tensors have :math:`N` elements each and can be of any shape + that the distance function can handle. + + If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet + loss for input tensors using the :math:`l_p` distance as the distance function. + + Args: + distance_function (Callable, optional): A nonnegative, real-valued function that + quantifies the closeness of two tensors. If not specified, + `nn.PairwiseDistance` will be used. Default: ``None`` + margin (float, optional): A nonnegative margin representing the minimum difference + between the positive and negative distances required for the loss to be 0. Larger + margins penalize cases where the negative examples are not distant enough from the + anchors, relative to the positives. 
Default: :math:`1`. + swap (bool, optional): Whether to use the distance swap described in the paper + `Learning shallow convolutional feature descriptors with triplet losses` by + V. Balntas, E. Riba et al. If True, and if the positive example is closer to the + negative example than the anchor is, swaps the positive example and the anchor in + the loss computation. Default: ``False``. + reduction (str, optional): Specifies the (optional) reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'`` + + + Shape: + - Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions + as supported by the distance function. + - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar + otherwise. + + Examples: + + >>> # Initialize embeddings + >>> embedding = nn.Embedding(1000, 128) + >>> anchor_ids = torch.randint(0, 1000, (1,)) + >>> positive_ids = torch.randint(0, 1000, (1,)) + >>> negative_ids = torch.randint(0, 1000, (1,)) + >>> anchor = embedding(anchor_ids) + >>> positive = embedding(positive_ids) + >>> negative = embedding(negative_ids) + >>> + >>> # Built-in Distance Function + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance()) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + >>> + >>> # Custom Distance Function + >>> def l_infinity(x1, x2): + >>> return torch.max(torch.abs(x1 - x2), dim=1).values + >>> + >>> # xdoctest: +SKIP("FIXME: Would call backwards a second time") + >>> triplet_loss = ( + >>> nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5)) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + >>> + >>> # Custom Distance Function (Lambda) + >>> triplet_loss = ( + >>> 
nn.TripletMarginWithDistanceLoss( + >>> distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + + Reference: + V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses: + https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html + """ + + __constants__ = ["margin", "swap", "reduction"] + margin: float + swap: bool + + def __init__( + self, + *, + distance_function: Callable[[Tensor, Tensor], Tensor] | None = None, + margin: float = 1.0, + swap: bool = False, + reduction: str = "mean", + ) -> None: + super().__init__(size_average=None, reduce=None, reduction=reduction) + if margin <= 0: + raise ValueError( + f"TripletMarginWithDistanceLoss: expected margin to be greater than 0, got {margin} instead" + ) + self.distance_function: Callable[[Tensor, Tensor], Tensor] | None = ( + distance_function if distance_function is not None else PairwiseDistance() + ) + self.margin = margin + self.swap = swap + + def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.triplet_margin_with_distance_loss( + anchor, + positive, + negative, + distance_function=self.distance_function, + margin=self.margin, + swap=self.swap, + reduction=self.reduction, + ) + + +class CTCLoss(_Loss): + r"""The Connectionist Temporal Classification loss. + + Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the + probability of possible alignments of input to target, producing a loss value which is differentiable + with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which + limits the length of the target sequence such that it must be :math:`\leq` the input length. + + Args: + blank (int, optional): blank label. Default :math:`0`. 
+ reduction (str, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the output losses will be divided by the target lengths and + then the mean over the batch is taken, ``'sum'``: the output losses will be summed. + Default: ``'mean'`` + zero_infinity (bool, optional): + Whether to zero infinite losses and the associated gradients. + Default: ``False`` + Infinite losses mainly occur when the inputs are too short + to be aligned to the targets. + + Shape: + - Log_probs: Tensor of size :math:`(T, N, C)` or :math:`(T, C)`, + where :math:`T = \text{input length}`, + :math:`N = \text{batch size}`, and + :math:`C = \text{number of classes (including blank)}`. + The logarithmized probabilities of the outputs (e.g. obtained with + :func:`torch.nn.functional.log_softmax`). + - Targets: Tensor of size :math:`(N, S)` or + :math:`(\operatorname{sum}(\text{target\_lengths}))`, + where :math:`N = \text{batch size}` and + :math:`S = \text{max target length, if shape is } (N, S)`. + It represents the target sequences. Each element in the target + sequence is a class index. And the target index cannot be blank (default=0). + In the :math:`(N, S)` form, targets are padded to the + length of the longest sequence, and stacked. + In the :math:`(\operatorname{sum}(\text{target\_lengths}))` form, + the targets are assumed to be un-padded and + concatenated within 1 dimension. + - Input_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`, + where :math:`N = \text{batch size}`. It represents the lengths of the + inputs (must each be :math:`\leq T`). And the lengths are specified + for each sequence to achieve masking under the assumption that sequences + are padded to equal lengths. + - Target_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`, + where :math:`N = \text{batch size}`. It represents lengths of the targets. 
+ Lengths are specified for each sequence to achieve masking under the + assumption that sequences are padded to equal lengths. If target shape is + :math:`(N,S)`, target_lengths are effectively the stop index + :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for + each target in a batch. Lengths must each be :math:`\leq S` + If the targets are given as a 1d tensor that is the concatenation of individual + targets, the target_lengths must add up to the total length of the tensor. + - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or + ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N)` if input is batched or + :math:`()` if input is unbatched, where :math:`N = \text{batch size}`. + + Examples: + + >>> # Target are to be padded + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> N = 16 # Batch size + >>> S = 30 # Target sequence length of longest target in batch (padding length) + >>> S_min = 10 # Minimum target length, for demonstration purposes + >>> + >>> # Initialize random batch of input vectors, for *size = (T,N,C) + >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() + >>> + >>> # Initialize random batch of targets (0 = blank, 1:C = classes) + >>> target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long) + >>> + >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) + >>> target_lengths = torch.randint( + ... low=S_min, + ... high=S, + ... size=(N,), + ... dtype=torch.long, + ... 
) + >>> ctc_loss = nn.CTCLoss() + >>> loss = ctc_loss(input, target, input_lengths, target_lengths) + >>> loss.backward() + >>> + >>> + >>> # Target are to be un-padded + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> N = 16 # Batch size + >>> + >>> # Initialize random batch of input vectors, for *size = (T,N,C) + >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() + >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) + >>> + >>> # Initialize random batch of targets (0 = blank, 1:C = classes) + >>> target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) + >>> target = torch.randint( + ... low=1, + ... high=C, + ... size=(sum(target_lengths),), + ... dtype=torch.long, + ... ) + >>> ctc_loss = nn.CTCLoss() + >>> loss = ctc_loss(input, target, input_lengths, target_lengths) + >>> loss.backward() + >>> + >>> + >>> # Target are to be un-padded and unbatched (effectively N=1) + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> + >>> # Initialize random batch of input vectors, for *size = (T,C) + >>> # xdoctest: +SKIP("FIXME: error in doctest") + >>> input = torch.randn(T, C).log_softmax(1).detach().requires_grad_() + >>> input_lengths = torch.tensor(T, dtype=torch.long) + >>> + >>> # Initialize random batch of targets (0 = blank, 1:C = classes) + >>> target_lengths = torch.randint(low=1, high=T, size=(), dtype=torch.long) + >>> target = torch.randint( + ... low=1, + ... high=C, + ... size=(target_lengths,), + ... dtype=torch.long, + ... ) + >>> ctc_loss = nn.CTCLoss() + >>> loss = ctc_loss(input, target, input_lengths, target_lengths) + >>> loss.backward() + + Reference: + A. 
Graves et al.: Connectionist Temporal Classification: + Labelling Unsegmented Sequence Data with Recurrent Neural Networks: + https://www.cs.toronto.edu/~graves/icml_2006.pdf + + Note: + In order to use CuDNN, the following must be satisfied: the :attr:`targets` must be + in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`, + :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of + dtype :attr:`torch.int32`, and the :attr:`log_probs` itself must be of + dtype :attr:`torch.float32`. + + The regular implementation uses the (more common in PyTorch) `torch.long` dtype. + + + Note: + In some circumstances when using the CUDA backend with CuDNN, this operator + may select a nondeterministic algorithm to increase performance. If this is + undesirable, you can try to make the operation deterministic (potentially at + a performance cost) by setting ``torch.backends.cudnn.deterministic = + True``. + Please see the notes on :doc:`/notes/randomness` for background. 
+ """ + + __constants__ = ["blank", "reduction"] + blank: int + zero_infinity: bool + + def __init__( + self, blank: int = 0, reduction: str = "mean", zero_infinity: bool = False + ) -> None: + super().__init__(reduction=reduction) + self.blank = blank + self.zero_infinity = zero_infinity + + def forward( + self, + log_probs: Tensor, + targets: Tensor, + input_lengths: Tensor, + target_lengths: Tensor, + ) -> Tensor: + """Runs the forward pass.""" + return F.ctc_loss( + log_probs, + targets, + input_lengths, + target_lengths, + self.blank, + self.reduction, + self.zero_infinity, + ) + + +# TODO: L1HingeEmbeddingCriterion +# TODO: MSECriterion weight +# TODO: ClassSimplexCriterion diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py new file mode 100644 index 0000000000000000000000000000000000000000..e9123f76b75c31d71c2e863c2cdb3c87f862291f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py @@ -0,0 +1,3046 @@ +# mypy: allow-untyped-defs + +import functools +import inspect +import itertools +import warnings +import weakref +from collections import namedtuple, OrderedDict +from collections.abc import Callable, Iterator, Mapping +from typing import Any, Optional, overload, TypeVar, Union +from typing_extensions import Self + +import torch +from torch import device, dtype, Tensor +from torch._prims_common import DeviceLikeType +from torch.nn.parameter import Buffer, Parameter +from torch.utils._python_dispatch import is_traceable_wrapper_subclass +from torch.utils.hooks import BackwardHook, RemovableHandle + + +__all__ = [ + "register_module_forward_pre_hook", + "register_module_forward_hook", + "register_module_full_backward_pre_hook", + "register_module_backward_hook", + "register_module_full_backward_hook", + "register_module_buffer_registration_hook", + 
"register_module_module_registration_hook", + "register_module_parameter_registration_hook", + "Module", +] + +_grad_t = Union[tuple[Tensor, ...], Tensor] +# See https://mypy.readthedocs.io/en/latest/generics.html#generic-methods-and-generic-self for the use +# of `T` to annotate `self`. Many methods of `Module` return `self` and we want those return values to be +# the type of the subclass, not the looser type of `Module`. +T = TypeVar("T", bound="Module") + + +class _IncompatibleKeys( + # pyrefly: ignore [invalid-inheritance] + namedtuple("IncompatibleKeys", ["missing_keys", "unexpected_keys"]), +): + __slots__ = () + + def __repr__(self) -> str: + # pyrefly: ignore [missing-attribute] + if not self.missing_keys and not self.unexpected_keys: + return "" + return super().__repr__() + + __str__ = __repr__ + + +def _addindent(s_, numSpaces): + s = s_.split("\n") + # don't do anything for single-line stuff + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(numSpaces * " ") + line for line in s] + s = "\n".join(s) + s = first + "\n" + s + return s + + +r"""This tracks hooks common to all modules that are executed immediately before +.registering the buffer/module/parameter""" +_global_buffer_registration_hooks: dict[int, Callable] = OrderedDict() +_global_module_registration_hooks: dict[int, Callable] = OrderedDict() +_global_parameter_registration_hooks: dict[int, Callable] = OrderedDict() + + +class _WrappedHook: + def __init__(self, hook: Callable, module: Optional["Module"] = None) -> None: + self.hook: Callable = hook + functools.update_wrapper(self, hook) + + self.with_module: bool = False + + if module is not None: + self.module: weakref.ReferenceType[Module] = weakref.ref(module) + self.with_module = True + + def __call__(self, *args: Any, **kwargs: Any) -> Any: + if self.with_module: + module = self.module() + if module is None: + raise RuntimeError("You are trying to call the hook of a dead Module!") + return self.hook(module, *args, **kwargs) + return 
self.hook(*args, **kwargs) + + def __getstate__(self) -> dict: + result = {"hook": self.hook, "with_module": self.with_module} + if self.with_module: + # pyrefly: ignore [unsupported-operation] + result["module"] = self.module() + + return result + + def __setstate__(self, state: dict): + self.hook = state["hook"] + self.with_module = state["with_module"] + + if self.with_module: + if state["module"] is None: + raise RuntimeError( + "You are trying to revive the hook of a dead Module!" + ) + self.module = weakref.ref(state["module"]) + + +r"""This tracks hooks common to all modules that are executed before/after +calling forward and backward. This is global state used for debugging/profiling +purposes""" +_global_backward_pre_hooks: dict[int, Callable] = OrderedDict() +_global_backward_hooks: dict[int, Callable] = OrderedDict() +_global_is_full_backward_hook: bool | None = None +_global_forward_pre_hooks: dict[int, Callable] = OrderedDict() +_global_forward_hooks: dict[int, Callable] = OrderedDict() +_global_forward_hooks_always_called: dict[int, bool] = OrderedDict() +_global_forward_hooks_with_kwargs: dict[int, bool] = OrderedDict() + + +def _has_any_global_hook(): + return ( + _global_backward_pre_hooks + or _global_backward_hooks + or _global_forward_pre_hooks + or _global_forward_hooks + or _global_forward_hooks_always_called + or _global_forward_hooks_with_kwargs + ) + + +_EXTRA_STATE_KEY_SUFFIX = "_extra_state" + + +def register_module_buffer_registration_hook( + hook: Callable[..., None], +) -> RemovableHandle: + r"""Register a buffer registration hook common to all modules. + + .. warning :: + + This adds global state to the `nn.Module` module + + The hook will be called every time :func:`register_buffer` is invoked. + It should have the following signature:: + + hook(module, name, buffer) -> None or new buffer + + The hook can modify the input or return a single modified value in the hook. 
+ + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle(_global_buffer_registration_hooks) + _global_buffer_registration_hooks[handle.id] = hook + return handle + + +def register_module_module_registration_hook( + hook: Callable[..., None], +) -> RemovableHandle: + r"""Register a module registration hook common to all modules. + + .. warning :: + + This adds global state to the `nn.Module` module + + The hook will be called every time :func:`register_module` is invoked. + It should have the following signature:: + + hook(module, name, submodule) -> None or new submodule + + The hook can modify the input or return a single modified value in the hook. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle(_global_module_registration_hooks) + _global_module_registration_hooks[handle.id] = hook + return handle + + +def register_module_parameter_registration_hook( + hook: Callable[..., None], +) -> RemovableHandle: + r"""Register a parameter registration hook common to all modules. + + .. warning :: + + This adds global state to the `nn.Module` module + + The hook will be called every time :func:`register_parameter` is invoked. + It should have the following signature:: + + hook(module, name, param) -> None or new parameter + + The hook can modify the input or return a single modified value in the hook. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle(_global_parameter_registration_hooks) + _global_parameter_registration_hooks[handle.id] = hook + return handle + + +def register_module_forward_pre_hook(hook: Callable[..., None]) -> RemovableHandle: + r"""Register a forward pre-hook common to all modules. 
+ + .. warning :: + + This adds global state to the `nn.module` module + and it is only intended for debugging/profiling purposes. + + The hook will be called every time before :func:`forward` is invoked. + It should have the following signature:: + + hook(module, input) -> None or modified input + + The input contains only the positional arguments given to the module. + Keyword arguments won't be passed to the hooks and only to the ``forward``. + The hook can modify the input. User can either return a tuple or a + single modified value in the hook. We will wrap the value into a tuple + if a single value is returned(unless that value is already a tuple). + + This hook has precedence over the specific module hooks registered with + ``register_forward_pre_hook``. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle(_global_forward_pre_hooks) + _global_forward_pre_hooks[handle.id] = hook + return handle + + +def register_module_forward_hook( + hook: Callable[..., None], + *, + with_kwargs: bool = False, + always_call: bool = False, +) -> RemovableHandle: + r"""Register a global forward hook for all the modules. + + .. warning :: + + This adds global state to the `nn.module` module + and it is only intended for debugging/profiling purposes. + + The hook will be called every time after :func:`forward` has computed an output. + It should have the following signature:: + + hook(module, input, output) -> None or modified output + + The input contains only the positional arguments given to the module. + Keyword arguments won't be passed to the hooks and only to the ``forward``. + You can optionally modify the output of the module by returning a new value + that will replace the output from the :func:`forward` function. + + Parameters: + hook (Callable): The user defined hook to be registered. 
+ always_call (bool): If ``True`` the ``hook`` will be run regardless of + whether an exception is raised while calling the Module. + Default: ``False`` + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + This hook will be executed before specific module hooks registered with + ``register_forward_hook``. + """ + handle = RemovableHandle( + _global_forward_hooks, extra_dict=_global_forward_hooks_always_called + ) + _global_forward_hooks[handle.id] = hook + if with_kwargs: + _global_forward_hooks_with_kwargs[handle.id] = True + if always_call: + _global_forward_hooks_always_called[handle.id] = True + return handle + + +def register_module_backward_hook( + hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t], +) -> RemovableHandle: + r"""Register a backward hook common to all the modules. + + This function is deprecated in favor of + :func:`torch.nn.modules.module.register_module_full_backward_hook` + and the behavior of this function will change in future versions. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + """ + global _global_is_full_backward_hook + if _global_is_full_backward_hook is True: + raise RuntimeError( + "Cannot use both regular backward hooks and full backward hooks as a " + "global Module hook. Please use only one of them." + ) + + _global_is_full_backward_hook = False + + handle = RemovableHandle(_global_backward_hooks) + _global_backward_hooks[handle.id] = hook + return handle + + +def register_module_full_backward_pre_hook( + hook: Callable[["Module", _grad_t], None | _grad_t], +) -> RemovableHandle: + r"""Register a backward pre-hook common to all the modules. + + .. warning :: + This adds global state to the `nn.module` module + and it is only intended for debugging/profiling purposes. 
+ + Hooks registered using this function behave in the same way as those + registered by :meth:`torch.nn.Module.register_full_backward_pre_hook`. + Refer to its documentation for more details. + + Hooks registered using this function will be called before hooks registered + using :meth:`torch.nn.Module.register_full_backward_pre_hook`. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + """ + handle = RemovableHandle(_global_backward_pre_hooks) + _global_backward_pre_hooks[handle.id] = hook + return handle + + +def register_module_full_backward_hook( + hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t], +) -> RemovableHandle: + r"""Register a backward hook common to all the modules. + + .. warning :: + This adds global state to the `nn.module` module + and it is only intended for debugging/profiling purposes. + + Hooks registered using this function behave in the same way as those + registered by :meth:`torch.nn.Module.register_full_backward_hook`. + Refer to its documentation for more details. + + Hooks registered using this function will be called before hooks registered + using :meth:`torch.nn.Module.register_full_backward_hook`. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + """ + global _global_is_full_backward_hook + if _global_is_full_backward_hook is False: + raise RuntimeError( + "Cannot use both regular backward hooks and full backward hooks as a " + "global Module hook. Please use only one of them." + ) + + _global_is_full_backward_hook = True + + handle = RemovableHandle(_global_backward_hooks) + _global_backward_hooks[handle.id] = hook + return handle + + +# Trick mypy into not applying contravariance rules to inputs by defining +# forward as a value, rather than a function. 
See also +# https://github.com/python/mypy/issues/8795 +def _forward_unimplemented(self, *input: Any) -> None: + r"""Define the computation performed at every call. + + Should be overridden by all subclasses. + + .. note:: + Although the recipe for forward pass needs to be defined within + this function, one should call the :class:`Module` instance afterwards + instead of this since the former takes care of running the + registered hooks while the latter silently ignores them. + """ + raise NotImplementedError( + f'Module [{type(self).__name__}] is missing the required "forward" function' + ) + + +class Module: + r"""Base class for all neural network modules. + + Your models should also subclass this class. + + Modules can also contain other Modules, allowing them to be nested in + a tree structure. You can assign the submodules as regular attributes:: + + import torch.nn as nn + import torch.nn.functional as F + + + class Model(nn.Module): + def __init__(self) -> None: + super().__init__() + self.conv1 = nn.Conv2d(1, 20, 5) + self.conv2 = nn.Conv2d(20, 20, 5) + + def forward(self, x): + x = F.relu(self.conv1(x)) + return F.relu(self.conv2(x)) + + Submodules assigned in this way will be registered, and will also have their + parameters converted when you call :meth:`to`, etc. + + .. note:: + As per the example above, an ``__init__()`` call to the parent class + must be made before assignment on the child. + + :ivar training: Boolean represents whether this module is in training or + evaluation mode. + :vartype training: bool + """ + + dump_patches: bool = False + + _version: int = 1 + r"""This allows better BC support for :meth:`load_state_dict`. In + :meth:`state_dict`, the version number will be saved as in the attribute + `_metadata` of the returned state dict, and thus pickled. `_metadata` is a + dictionary with keys that follow the naming convention of state dict. See + ``_load_from_state_dict`` on how to use this information in loading. 
+ + If new parameters/buffers are added/removed from a module, this number shall + be bumped, and the module's `_load_from_state_dict` method can compare the + version number and do appropriate changes if the state dict is from before + the change.""" + + training: bool + _parameters: dict[str, Parameter | None] + _buffers: dict[str, Tensor | None] + _non_persistent_buffers_set: set[str] + _backward_pre_hooks: dict[int, Callable] + _backward_hooks: dict[int, Callable] + _is_full_backward_hook: bool | None + _forward_hooks: dict[int, Callable] + # Marks whether the corresponding _forward_hooks accept kwargs or not. + # As JIT does not support set[int], this dict is used as a set, where all + # hooks represented in this dict accept kwargs. + _forward_hooks_with_kwargs: dict[int, bool] + # forward hooks that should always be called even if an exception is raised + _forward_hooks_always_called: dict[int, bool] + _forward_pre_hooks: dict[int, Callable] + # Marks whether the corresponding _forward_hooks accept kwargs or not. + # As JIT does not support set[int], this dict is used as a set, where all + # hooks represented in this dict accept kwargs. 
+ _forward_pre_hooks_with_kwargs: dict[int, bool] + _state_dict_hooks: dict[int, Callable] + _load_state_dict_pre_hooks: dict[int, Callable] + _state_dict_pre_hooks: dict[int, Callable] + _load_state_dict_post_hooks: dict[int, Callable] + _modules: dict[str, Optional["Module"]] + call_super_init: bool = False + _compiled_call_impl: Callable | None = None + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize internal Module state, shared by both nn.Module and ScriptModule.""" + torch._C._log_api_usage_once("python.nn_module") + + # Backward compatibility: no args used to be allowed when call_super_init=False + if self.call_super_init is False and bool(kwargs): + raise TypeError( + f"{type(self).__name__}.__init__() got an unexpected keyword argument '{next(iter(kwargs))}'" + "" + ) + + if self.call_super_init is False and bool(args): + raise TypeError( + f"{type(self).__name__}.__init__() takes 1 positional argument but {len(args) + 1} were" + " given" + ) + + """ + Calls super().__setattr__('a', a) instead of the typical self.a = a + to avoid Module.__setattr__ overhead. Module's __setattr__ has special + handling for parameters, submodules, and buffers but simply calls into + super().__setattr__ for all other attributes. 
+ """ + super().__setattr__("training", True) + super().__setattr__("_parameters", {}) + super().__setattr__("_buffers", {}) + super().__setattr__("_non_persistent_buffers_set", set()) + super().__setattr__("_backward_pre_hooks", OrderedDict()) + super().__setattr__("_backward_hooks", OrderedDict()) + super().__setattr__("_is_full_backward_hook", None) + super().__setattr__("_forward_hooks", OrderedDict()) + super().__setattr__("_forward_hooks_with_kwargs", OrderedDict()) + super().__setattr__("_forward_hooks_always_called", OrderedDict()) + super().__setattr__("_forward_pre_hooks", OrderedDict()) + super().__setattr__("_forward_pre_hooks_with_kwargs", OrderedDict()) + super().__setattr__("_state_dict_hooks", OrderedDict()) + super().__setattr__("_state_dict_pre_hooks", OrderedDict()) + super().__setattr__("_load_state_dict_pre_hooks", OrderedDict()) + super().__setattr__("_load_state_dict_post_hooks", OrderedDict()) + super().__setattr__("_modules", {}) + + if self.call_super_init: + super().__init__(*args, **kwargs) + + forward: Callable[..., Any] = _forward_unimplemented + + def register_buffer( + self, name: str, tensor: Tensor | None, persistent: bool = True + ) -> None: + r"""Add a buffer to the module. + + This is typically used to register a buffer that should not be + considered a model parameter. For example, BatchNorm's ``running_mean`` + is not a parameter, but is part of the module's state. Buffers, by + default, are persistent and will be saved alongside parameters. This + behavior can be changed by setting :attr:`persistent` to ``False``. The + only difference between a persistent buffer and a non-persistent buffer + is that the latter will not be a part of this module's + :attr:`state_dict`. + + Buffers can be accessed as attributes using given names. + + Args: + name (str): name of the buffer. The buffer can be accessed + from this module using the given name + tensor (Tensor or None): buffer to be registered. 
If ``None``, then operations + that run on buffers, such as :attr:`cuda`, are ignored. If ``None``, + the buffer is **not** included in the module's :attr:`state_dict`. + persistent (bool): whether the buffer is part of this module's + :attr:`state_dict`. + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> self.register_buffer('running_mean', torch.zeros(num_features)) + + """ + if persistent is False and isinstance(self, torch.jit.ScriptModule): + raise RuntimeError("ScriptModule does not support non-persistent buffers") + + if "_buffers" not in self.__dict__: + raise AttributeError("cannot assign buffer before Module.__init__() call") + elif not isinstance(name, str): + raise TypeError( + f"buffer name should be a string. Got {torch.typename(name)}" + ) + elif "." in name: + raise KeyError('buffer name can\'t contain "."') + elif name == "": + raise KeyError('buffer name can\'t be empty string ""') + elif hasattr(self, name) and name not in self._buffers: + raise KeyError(f"attribute '{name}' already exists") + elif tensor is not None and not ( + isinstance(tensor, torch.Tensor) or hasattr(tensor, "__torch_function__") + ): + raise TypeError( + f"cannot assign '{torch.typename(tensor)}' object to buffer '{name}' " + "(torch Tensor or None required)" + ) + else: + for hook in _global_buffer_registration_hooks.values(): + output = hook(self, name, tensor) + if output is not None: + tensor = output + self._buffers[name] = tensor + if persistent: + self._non_persistent_buffers_set.discard(name) + else: + self._non_persistent_buffers_set.add(name) + + def register_parameter(self, name: str, param: Parameter | None) -> None: + r"""Add a parameter to the module. + + The parameter can be accessed as an attribute using given name. + + Args: + name (str): name of the parameter. The parameter can be accessed + from this module using the given name + param (Parameter or None): parameter to be added to the module. 
If + ``None``, then operations that run on parameters, such as :attr:`cuda`, + are ignored. If ``None``, the parameter is **not** included in the + module's :attr:`state_dict`. + """ + if "_parameters" not in self.__dict__: + raise AttributeError( + "cannot assign parameter before Module.__init__() call" + ) + + elif not isinstance(name, str): + raise TypeError( + f"parameter name should be a string. Got {torch.typename(name)}" + ) + elif "." in name: + raise KeyError('parameter name can\'t contain "."') + elif name == "": + raise KeyError('parameter name can\'t be empty string ""') + elif hasattr(self, name) and name not in self._parameters: + raise KeyError(f"attribute '{name}' already exists") + + if param is None: + self._parameters[name] = None + elif not isinstance(param, Parameter): + raise TypeError( + f"cannot assign '{torch.typename(param)}' object to parameter '{name}' " + "(torch.nn.Parameter or None required)" + ) + elif param.grad_fn: + raise ValueError( + f"Cannot assign non-leaf Tensor to parameter '{name}'. Model " + f"parameters must be created explicitly. To express '{name}' " + "as a function of another Tensor, compute the value in " + "the forward() method." + ) + else: + for hook in _global_parameter_registration_hooks.values(): + output = hook(self, name, param) + if output is not None: + param = output + self._parameters[name] = param + + def add_module(self, name: str, module: Optional["Module"]) -> None: + r"""Add a child module to the current module. + + The module can be accessed as an attribute using the given name. + + Args: + name (str): name of the child module. The child module can be + accessed from this module using the given name + module (Module): child module to be added to the module. + """ + if not isinstance(module, Module) and module is not None: + raise TypeError(f"{torch.typename(module)} is not a Module subclass") + elif not isinstance(name, str): + raise TypeError( + f"module name should be a string. 
Got {torch.typename(name)}" + ) + elif hasattr(self, name) and name not in self._modules: + raise KeyError(f"attribute '{name}' already exists") + elif "." in name: + raise KeyError(f'module name can\'t contain ".", got: {name}') + elif name == "": + raise KeyError('module name can\'t be empty string ""') + for hook in _global_module_registration_hooks.values(): + output = hook(self, name, module) + if output is not None: + module = output + self._modules[name] = module + + def register_module(self, name: str, module: Optional["Module"]) -> None: + r"""Alias for :func:`add_module`.""" + self.add_module(name, module) + + def get_submodule(self, target: str) -> "Module": + """Return the submodule given by ``target`` if it exists, otherwise throw an error. + + For example, let's say you have an ``nn.Module`` ``A`` that + looks like this: + + .. code-block:: text + + A( + (net_b): Module( + (net_c): Module( + (conv): Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2)) + ) + (linear): Linear(in_features=100, out_features=200, bias=True) + ) + ) + + (The diagram shows an ``nn.Module`` ``A``. ``A`` which has a nested + submodule ``net_b``, which itself has two submodules ``net_c`` + and ``linear``. ``net_c`` then has a submodule ``conv``.) + + To check whether or not we have the ``linear`` submodule, we + would call ``get_submodule("net_b.linear")``. To check whether + we have the ``conv`` submodule, we would call + ``get_submodule("net_b.net_c.conv")``. + + The runtime of ``get_submodule`` is bounded by the degree + of module nesting in ``target``. A query against + ``named_modules`` achieves the same result, but it is O(N) in + the number of transitive modules. So, for a simple check to see + if some submodule exists, ``get_submodule`` should always be + used. + + Args: + target: The fully-qualified string name of the submodule + to look for. (See above example for how to specify a + fully-qualified string.) 
+ + Returns: + torch.nn.Module: The submodule referenced by ``target`` + + Raises: + AttributeError: If at any point along the path resulting from + the target string the (sub)path resolves to a non-existent + attribute name or an object that is not an instance of ``nn.Module``. + """ + if target == "": + return self + + atoms: list[str] = target.split(".") + mod: torch.nn.Module = self + + for item in atoms: + if not hasattr(mod, item): + raise AttributeError( + mod._get_name() + " has no attribute `" + item + "`" + ) + + mod = getattr(mod, item) + + if not isinstance(mod, torch.nn.Module): + raise AttributeError("`" + item + "` is not an nn.Module") + + return mod + + def set_submodule( + self, target: str, module: "Module", strict: bool = False + ) -> None: + """ + Set the submodule given by ``target`` if it exists, otherwise throw an error. + + .. note:: + If ``strict`` is set to ``False`` (default), the method will replace an existing submodule + or create a new submodule if the parent module exists. If ``strict`` is set to ``True``, + the method will only attempt to replace an existing submodule and throw an error if + the submodule does not exist. + + For example, let's say you have an ``nn.Module`` ``A`` that + looks like this: + + .. code-block:: text + + A( + (net_b): Module( + (net_c): Module( + (conv): Conv2d(3, 3, 3) + ) + (linear): Linear(3, 3) + ) + ) + + (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested + submodule ``net_b``, which itself has two submodules ``net_c`` + and ``linear``. ``net_c`` then has a submodule ``conv``.) + + To override the ``Conv2d`` with a new submodule ``Linear``, you + could call ``set_submodule("net_b.net_c.conv", nn.Linear(1, 1))`` + where ``strict`` could be ``True`` or ``False`` + + To add a new submodule ``Conv2d`` to the existing ``net_b`` module, + you would call ``set_submodule("net_b.conv", nn.Conv2d(1, 1, 1))``. 
+ + In the above if you set ``strict=True`` and call + ``set_submodule("net_b.conv", nn.Conv2d(1, 1, 1), strict=True)``, an AttributeError + will be raised because ``net_b`` does not have a submodule named ``conv``. + + Args: + target: The fully-qualified string name of the submodule + to look for. (See above example for how to specify a + fully-qualified string.) + module: The module to set the submodule to. + strict: If ``False``, the method will replace an existing submodule + or create a new submodule if the parent module exists. If ``True``, + the method will only attempt to replace an existing submodule and throw an error + if the submodule doesn't already exist. + + Raises: + ValueError: If the ``target`` string is empty or if ``module`` is not an instance of ``nn.Module``. + AttributeError: If at any point along the path resulting from + the ``target`` string the (sub)path resolves to a non-existent + attribute name or an object that is not an instance of ``nn.Module``. + """ + if target == "": + raise ValueError("Cannot set the submodule without a target name!") + + atoms: list[str] = target.split(".") + if not isinstance(module, torch.nn.Module): + raise ValueError( + "`" + "module" + f"` is not an nn.Module, found {type(module)}" + ) + if len(atoms) == 1: + parent: torch.nn.Module = self + else: + parent_key = ".".join(atoms[:-1]) + parent = self.get_submodule(parent_key) + + if strict and not hasattr(parent, atoms[-1]): + raise AttributeError( + parent._get_name() + " has no attribute `" + atoms[-1] + "`" + ) + if hasattr(parent, atoms[-1]): + mod = getattr(parent, atoms[-1]) + if not isinstance(mod, torch.nn.Module): + raise AttributeError("`" + atoms[-1] + "` is not an nn.Module") + setattr(parent, atoms[-1], module) + + def get_parameter(self, target: str) -> "Parameter": + """Return the parameter given by ``target`` if it exists, otherwise throw an error. 
+ + See the docstring for ``get_submodule`` for a more detailed + explanation of this method's functionality as well as how to + correctly specify ``target``. + + Args: + target: The fully-qualified string name of the Parameter + to look for. (See ``get_submodule`` for how to specify a + fully-qualified string.) + + Returns: + torch.nn.Parameter: The Parameter referenced by ``target`` + + Raises: + AttributeError: If the target string references an invalid + path or resolves to something that is not an + ``nn.Parameter`` + """ + module_path, _, param_name = target.rpartition(".") + + mod: torch.nn.Module = self.get_submodule(module_path) + + if not hasattr(mod, param_name): + raise AttributeError( + mod._get_name() + " has no attribute `" + param_name + "`" + ) + + param: torch.nn.Parameter = getattr(mod, param_name) + + if not isinstance(param, torch.nn.Parameter): + raise AttributeError("`" + param_name + "` is not an nn.Parameter") + + return param + + def get_buffer(self, target: str) -> "Tensor": + """Return the buffer given by ``target`` if it exists, otherwise throw an error. + + See the docstring for ``get_submodule`` for a more detailed + explanation of this method's functionality as well as how to + correctly specify ``target``. + + Args: + target: The fully-qualified string name of the buffer + to look for. (See ``get_submodule`` for how to specify a + fully-qualified string.) 
+ + Returns: + torch.Tensor: The buffer referenced by ``target`` + + Raises: + AttributeError: If the target string references an invalid + path or resolves to something that is not a + buffer + """ + module_path, _, buffer_name = target.rpartition(".") + + mod: torch.nn.Module = self.get_submodule(module_path) + + if not hasattr(mod, buffer_name): + raise AttributeError( + mod._get_name() + " has no attribute `" + buffer_name + "`" + ) + + buffer: torch.Tensor = getattr(mod, buffer_name) + + if buffer_name not in mod._buffers: + raise AttributeError("`" + buffer_name + "` is not a buffer") + + return buffer + + def get_extra_state(self) -> Any: + """Return any extra state to include in the module's state_dict. + + Implement this and a corresponding :func:`set_extra_state` for your module + if you need to store extra state. This function is called when building the + module's `state_dict()`. + + Note that extra state should be picklable to ensure working serialization + of the state_dict. We only provide backwards compatibility guarantees + for serializing Tensors; other objects may break backwards compatibility if + their serialized pickled form changes. + + Returns: + object: Any extra state to store in the module's state_dict + """ + raise RuntimeError( + "Reached a code path in Module.get_extra_state() that should never be called. " + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml " + "to report this bug." + ) + + def set_extra_state(self, state: Any) -> None: + """Set extra state contained in the loaded `state_dict`. + + This function is called from :func:`load_state_dict` to handle any extra state + found within the `state_dict`. Implement this function and a corresponding + :func:`get_extra_state` for your module if you need to store extra state within its + `state_dict`. 
+ + Args: + state (dict): Extra state from the `state_dict` + """ + raise RuntimeError( + "Reached a code path in Module.set_extra_state() that should never be called. " + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml " + "to report this bug." + ) + + def _apply(self, fn, recurse=True): + if recurse: + for module in self.children(): + module._apply(fn) + + from torch._subclasses.fake_tensor import FakeTensor + + def compute_should_use_set_data(tensor, tensor_applied) -> bool: + if torch._has_compatible_shallow_copy_type( + tensor, tensor_applied + ) and not isinstance(tensor_applied, FakeTensor): + # If the new tensor has compatible tensor type as the existing tensor, + # the current behavior is to change the tensor in-place using `.data =`, + # and the future behavior is to overwrite the existing tensor. However, + # changing the current behavior is a BC-breaking change, and we want it + # to happen in future releases. So for now we introduce the + # `torch.__future__.get_overwrite_module_params_on_conversion()` + # global flag to let the user control whether they want the future + # behavior of overwriting the existing tensor or not. 
+ return not torch.__future__.get_overwrite_module_params_on_conversion() + else: + return False + + should_use_swap_tensors = ( + torch.__future__.get_swap_module_params_on_conversion() + ) + + for key, param in self._parameters.items(): + if param is None: + continue + # Tensors stored in modules are graph leaves, and we don't want to + # track autograd history of `param_applied`, so we have to use + # `with torch.no_grad():` + with torch.no_grad(): + param_applied = fn(param) + p_should_use_set_data = compute_should_use_set_data(param, param_applied) + + # subclasses may have multiple child tensors so we need to use swap_tensors + p_should_use_swap_tensors = ( + should_use_swap_tensors + or is_traceable_wrapper_subclass(param_applied) + or isinstance(param, FakeTensor) + ) + + param_grad = param.grad + if p_should_use_swap_tensors: + try: + if param_grad is not None: + # Accessing param.grad makes its at::Tensor's use_count 2, which will prevent swapping. + # Decrement use count of the gradient by setting to None + param.grad = None + param_applied = torch.nn.Parameter( + # pyrefly: ignore [bad-argument-type] + param_applied, + requires_grad=param.requires_grad, + ) + torch.utils.swap_tensors(param, param_applied) + except Exception as e: + if param_grad is not None: + param.grad = param_grad + raise RuntimeError( + f"_apply(): Couldn't swap {self._get_name()}.{key}" + ) from e + out_param = param + elif p_should_use_set_data: + # pyrefly: ignore [bad-assignment] + param.data = param_applied + out_param = param + else: + assert isinstance(param, Parameter) + assert param.is_leaf + # pyrefly: ignore [bad-argument-type] + out_param = Parameter(param_applied, param.requires_grad) + self._parameters[key] = out_param + + if param_grad is not None: + with torch.no_grad(): + grad_applied = fn(param_grad) + g_should_use_set_data = compute_should_use_set_data( + param_grad, grad_applied + ) + if p_should_use_swap_tensors: + 
grad_applied.requires_grad_(param_grad.requires_grad) + try: + torch.utils.swap_tensors(param_grad, grad_applied) + except Exception as e: + raise RuntimeError( + f"_apply(): Couldn't swap {self._get_name()}.{key}.grad" + ) from e + out_param.grad = param_grad + elif g_should_use_set_data: + assert out_param.grad is not None + out_param.grad.data = grad_applied + else: + assert param_grad.is_leaf + out_param.grad = grad_applied.requires_grad_( + param_grad.requires_grad + ) + + for key, buf in self._buffers.items(): + if buf is not None: + self._buffers[key] = fn(buf) + + return self + + def apply(self, fn: Callable[["Module"], None]) -> Self: + r"""Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. + + Typical use includes initializing the parameters of a model + (see also :ref:`nn-init-doc`). + + Args: + fn (:class:`Module` -> None): function to be applied to each submodule + + Returns: + Module: self + + Example:: + + >>> @torch.no_grad() + >>> def init_weights(m): + >>> print(m) + >>> if type(m) is nn.Linear: + >>> m.weight.fill_(1.0) + >>> print(m.weight) + >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) + >>> net.apply(init_weights) + Linear(in_features=2, out_features=2, bias=True) + Parameter containing: + tensor([[1., 1.], + [1., 1.]], requires_grad=True) + Linear(in_features=2, out_features=2, bias=True) + Parameter containing: + tensor([[1., 1.], + [1., 1.]], requires_grad=True) + Sequential( + (0): Linear(in_features=2, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=2, bias=True) + ) + + """ + for module in self.children(): + module.apply(fn) + fn(self) + return self + + def cuda(self, device: int | device | None = None) -> Self: + r"""Move all model parameters and buffers to the GPU. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing the optimizer if the module will + live on GPU while being optimized. + + .. 
note:: + This method modifies the module in-place. + + Args: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + return self._apply(lambda t: t.cuda(device)) + + def ipu(self, device: int | device | None = None) -> Self: + r"""Move all model parameters and buffers to the IPU. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing the optimizer if the module will + live on IPU while being optimized. + + .. note:: + This method modifies the module in-place. + + Arguments: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + return self._apply(lambda t: t.ipu(device)) + + def xpu(self, device: int | device | None = None) -> Self: + r"""Move all model parameters and buffers to the XPU. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing optimizer if the module will + live on XPU while being optimized. + + .. note:: + This method modifies the module in-place. + + Arguments: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + return self._apply(lambda t: t.xpu(device)) + + def mtia(self, device: int | device | None = None) -> Self: + r"""Move all model parameters and buffers to the MTIA. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing the optimizer if the module will + live on MTIA while being optimized. + + .. note:: + This method modifies the module in-place. + + Arguments: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + return self._apply(lambda t: t.mtia(device)) + + def cpu(self) -> Self: + r"""Move all model parameters and buffers to the CPU. + + .. 
note:: + This method modifies the module in-place. + + Returns: + Module: self + """ + return self._apply(lambda t: t.cpu()) + + def type(self, dst_type: dtype | str) -> Self: + r"""Casts all parameters and buffers to :attr:`dst_type`. + + .. note:: + This method modifies the module in-place. + + Args: + dst_type (type or string): the desired type + + Returns: + Module: self + """ + return self._apply(lambda t: t.type(dst_type)) + + def float(self) -> Self: + r"""Casts all floating point parameters and buffers to ``float`` datatype. + + .. note:: + This method modifies the module in-place. + + Returns: + Module: self + """ + return self._apply(lambda t: t.float() if t.is_floating_point() else t) + + def double(self) -> Self: + r"""Casts all floating point parameters and buffers to ``double`` datatype. + + .. note:: + This method modifies the module in-place. + + Returns: + Module: self + """ + return self._apply(lambda t: t.double() if t.is_floating_point() else t) + + def half(self) -> Self: + r"""Casts all floating point parameters and buffers to ``half`` datatype. + + .. note:: + This method modifies the module in-place. + + Returns: + Module: self + """ + return self._apply(lambda t: t.half() if t.is_floating_point() else t) + + def bfloat16(self) -> Self: + r"""Casts all floating point parameters and buffers to ``bfloat16`` datatype. + + .. note:: + This method modifies the module in-place. + + Returns: + Module: self + """ + return self._apply(lambda t: t.bfloat16() if t.is_floating_point() else t) + + def to_empty(self, *, device: DeviceLikeType | None, recurse: bool = True) -> Self: + r"""Move the parameters and buffers to the specified device without copying storage. + + Args: + device (:class:`torch.device`): The desired device of the parameters + and buffers in this module. + recurse (bool): Whether parameters and buffers of submodules should + be recursively moved to the specified device. 
+ + Returns: + Module: self + """ + return self._apply( + lambda t: torch.empty_like(t, device=device), recurse=recurse + ) + + @overload + def to( + self, + device: DeviceLikeType | None = ..., + dtype: dtype | None = ..., + non_blocking: bool = ..., + ) -> Self: ... + + @overload + def to(self, dtype: dtype, non_blocking: bool = ...) -> Self: ... + + @overload + def to(self, tensor: Tensor, non_blocking: bool = ...) -> Self: ... + + def to(self, *args, **kwargs): + r"""Move and/or cast the parameters and buffers. + + This can be called as + + .. function:: to(device=None, dtype=None, non_blocking=False) + :noindex: + + .. function:: to(dtype, non_blocking=False) + :noindex: + + .. function:: to(tensor, non_blocking=False) + :noindex: + + .. function:: to(memory_format=torch.channels_last) + :noindex: + + Its signature is similar to :meth:`torch.Tensor.to`, but only accepts + floating point or complex :attr:`dtype`\ s. In addition, this method will + only cast the floating point or complex parameters and buffers to :attr:`dtype` + (if given). The integral parameters and buffers will be moved + :attr:`device`, if that is given, but with dtypes unchanged. When + :attr:`non_blocking` is set, it tries to convert/move asynchronously + with respect to the host if possible, e.g., moving CPU Tensors with + pinned memory to CUDA devices. + + See below for examples. + + .. note:: + This method modifies the module in-place. 
+ + Args: + device (:class:`torch.device`): the desired device of the parameters + and buffers in this module + dtype (:class:`torch.dtype`): the desired floating point or complex dtype of + the parameters and buffers in this module + tensor (torch.Tensor): Tensor whose dtype and device are the desired + dtype and device for all parameters and buffers in this module + memory_format (:class:`torch.memory_format`): the desired memory + format for 4D parameters and buffers in this module (keyword + only argument) + + Returns: + Module: self + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> linear = nn.Linear(2, 2) + >>> linear.weight + Parameter containing: + tensor([[ 0.1913, -0.3420], + [-0.5113, -0.2325]]) + >>> linear.to(torch.double) + Linear(in_features=2, out_features=2, bias=True) + >>> linear.weight + Parameter containing: + tensor([[ 0.1913, -0.3420], + [-0.5113, -0.2325]], dtype=torch.float64) + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1) + >>> gpu1 = torch.device("cuda:1") + >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) + Linear(in_features=2, out_features=2, bias=True) + >>> linear.weight + Parameter containing: + tensor([[ 0.1914, -0.3420], + [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1') + >>> cpu = torch.device("cpu") + >>> linear.to(cpu) + Linear(in_features=2, out_features=2, bias=True) + >>> linear.weight + Parameter containing: + tensor([[ 0.1914, -0.3420], + [-0.5112, -0.2324]], dtype=torch.float16) + + >>> linear = nn.Linear(2, 2, bias=None).to(torch.cdouble) + >>> linear.weight + Parameter containing: + tensor([[ 0.3741+0.j, 0.2382+0.j], + [ 0.5593+0.j, -0.4443+0.j]], dtype=torch.complex128) + >>> linear(torch.ones(3, 2, dtype=torch.cdouble)) + tensor([[0.6122+0.j, 0.1150+0.j], + [0.6122+0.j, 0.1150+0.j], + [0.6122+0.j, 0.1150+0.j]], dtype=torch.complex128) + + """ + device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to( + # pyrefly: ignore [not-iterable] + *args, + 
**kwargs, + ) + + if dtype is not None: + if not (dtype.is_floating_point or dtype.is_complex): + raise TypeError( + "nn.Module.to only accepts floating point or complex " + f"dtypes, but got desired dtype={dtype}" + ) + if dtype.is_complex: + warnings.warn( + "Complex modules are a new feature under active development whose design may change, " + "and some modules might not work as expected when using complex tensors as parameters or buffers. " + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml " + "if a complex module does not work as expected.", + stacklevel=2, + ) + + def convert(t): + try: + if convert_to_format is not None and t.dim() in (4, 5): + return t.to( + device, + dtype if t.is_floating_point() or t.is_complex() else None, + non_blocking, + memory_format=convert_to_format, + ) + return t.to( + device, + dtype if t.is_floating_point() or t.is_complex() else None, + non_blocking, + ) + except NotImplementedError as e: + if str(e) == "Cannot copy out of meta tensor; no data!": + raise NotImplementedError( + f"{e} Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() " + f"when moving module from meta to a different device." + ) from None + else: + raise + + return self._apply(convert) + + def register_full_backward_pre_hook( + self, + hook: Callable[["Module", _grad_t], None | _grad_t], + prepend: bool = False, + ) -> RemovableHandle: + r"""Register a backward pre-hook on the module. + + The hook will be called every time the gradients for the module are computed. + The hook should have the following signature:: + + hook(module, grad_output) -> tuple[Tensor, ...], Tensor or None + + The :attr:`grad_output` is a tuple. The hook should + not modify its arguments, but it can optionally return a new gradient with + respect to the output that will be used in place of :attr:`grad_output` in + subsequent computations. Entries in :attr:`grad_output` will be ``None`` for + all non-Tensor arguments. 
+ + For technical reasons, when this hook is applied to a Module, its forward function will + receive a view of each Tensor passed to the Module. Similarly the caller will receive a view + of each Tensor returned by the Module's forward function. + + .. warning :: + Modifying inputs inplace is not allowed when using backward hooks and + will raise an error. + + Args: + hook (Callable): The user-defined hook to be registered. + prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``backward_pre`` hooks on this + :class:`torch.nn.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``backward_pre`` hooks + on this :class:`torch.nn.Module`. Note that global + ``backward_pre`` hooks registered with + :func:`register_module_full_backward_pre_hook` will fire before + all hooks registered by this method. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + """ + handle = RemovableHandle(self._backward_pre_hooks) + self._backward_pre_hooks[handle.id] = hook + if prepend: + self._backward_pre_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] + return handle + + def register_backward_hook( + self, hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t] + ) -> RemovableHandle: + r"""Register a backward hook on the module. + + This function is deprecated in favor of :meth:`~torch.nn.Module.register_full_backward_hook` and + the behavior of this function will change in future versions. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + """ + if self._is_full_backward_hook is True: + raise RuntimeError( + "Cannot use both regular backward hooks and full backward hooks on a " + "single Module. Please use only one of them." 
+ ) + + self._is_full_backward_hook = False + + handle = RemovableHandle(self._backward_hooks) + self._backward_hooks[handle.id] = hook + return handle + + def register_full_backward_hook( + self, + hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t], + prepend: bool = False, + ) -> RemovableHandle: + r"""Register a backward hook on the module. + + The hook will be called every time the gradients with respect to a module are computed, and its firing rules are as follows: + + 1. Ordinarily, the hook fires when the gradients are computed with respect to the module inputs. + 2. If none of the module inputs require gradients, the hook will fire when the gradients are computed + with respect to module outputs. + 3. If none of the module outputs require gradients, then the hooks will not fire. + + The hook should have the following signature:: + + hook(module, grad_input, grad_output) -> tuple(Tensor) or None + + The :attr:`grad_input` and :attr:`grad_output` are tuples that contain the gradients + with respect to the inputs and outputs respectively. The hook should + not modify its arguments, but it can optionally return a new gradient with + respect to the input that will be used in place of :attr:`grad_input` in + subsequent computations. :attr:`grad_input` will only correspond to the inputs given + as positional arguments and all kwarg arguments are ignored. Entries + in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor + arguments. + + For technical reasons, when this hook is applied to a Module, its forward function will + receive a view of each Tensor passed to the Module. Similarly the caller will receive a view + of each Tensor returned by the Module's forward function. + + .. warning :: + Modifying inputs or outputs inplace is not allowed when using backward hooks and + will raise an error. + + Args: + hook (Callable): The user-defined hook to be registered. 
+ prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``backward`` hooks on this + :class:`torch.nn.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``backward`` hooks on + this :class:`torch.nn.Module`. Note that global + ``backward`` hooks registered with + :func:`register_module_full_backward_hook` will fire before + all hooks registered by this method. + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + + """ + if self._is_full_backward_hook is False: + raise RuntimeError( + "Cannot use both regular backward hooks and full backward hooks on a " + "single Module. Please use only one of them." + ) + + self._is_full_backward_hook = True + + handle = RemovableHandle(self._backward_hooks) + self._backward_hooks[handle.id] = hook + if prepend: + self._backward_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] + return handle + + def _get_backward_hooks(self): + r"""Return the backward hooks for use in the call function. + + It returns two lists, one with the full backward hooks and one with the non-full + backward hooks. 
+ """ + full_backward_hooks: list[Callable] = [] + if _global_is_full_backward_hook is True: + full_backward_hooks += _global_backward_hooks.values() + if self._is_full_backward_hook is True: + full_backward_hooks += self._backward_hooks.values() + + non_full_backward_hooks: list[Callable] = [] + if _global_is_full_backward_hook is False: + non_full_backward_hooks += _global_backward_hooks.values() + if self._is_full_backward_hook is False: + non_full_backward_hooks += self._backward_hooks.values() + + return full_backward_hooks, non_full_backward_hooks + + def _get_backward_pre_hooks(self): + backward_pre_hooks: list[Callable] = [] + backward_pre_hooks += _global_backward_pre_hooks.values() + backward_pre_hooks += self._backward_pre_hooks.values() + + return backward_pre_hooks + + def _maybe_warn_non_full_backward_hook(self, inputs, result, grad_fn) -> None: + if not isinstance(result, torch.Tensor): + if not ( + isinstance(result, tuple) + and all(isinstance(r, torch.Tensor) for r in result) + ): + warnings.warn( + "Using non-full backward hooks on a Module that does not return a " + "single Tensor or a tuple of Tensors is deprecated and will be removed " + "in future versions. This hook will be missing some of the grad_output. " + "Please use register_full_backward_hook to get the documented behavior.", + FutureWarning, + stacklevel=2, + ) + return + else: + result = (result,) + + if not isinstance(inputs, torch.Tensor): + if not ( + isinstance(inputs, tuple) + and all(isinstance(i, torch.Tensor) for i in inputs) + ): + warnings.warn( + "Using non-full backward hooks on a Module that does not take as input a " + "single Tensor or a tuple of Tensors is deprecated and will be removed " + "in future versions. This hook will be missing some of the grad_input. 
" + "Please use register_full_backward_hook to get the documented behavior.", + FutureWarning, + stacklevel=2, + ) + return + else: + inputs = (inputs,) + + # At this point we are sure that inputs and result are tuple of Tensors + out_grad_fn = {r.grad_fn for r in result if r.grad_fn is not None} + if len(out_grad_fn) == 0 or ( + len(out_grad_fn) == 1 and grad_fn not in out_grad_fn + ): + warnings.warn( + "Using a non-full backward hook when outputs are nested in python data structure " + "is deprecated and will be removed in future versions. This hook will be missing " + "some grad_output.", + FutureWarning, + stacklevel=2, + ) + elif len(out_grad_fn) > 1: + warnings.warn( + "Using a non-full backward hook when outputs are generated by different autograd Nodes " + "is deprecated and will be removed in future versions. This hook will be missing " + "some grad_output. Please use register_full_backward_hook to get the documented behavior.", + FutureWarning, + stacklevel=2, + ) + else: + # At this point the grad_output part of the hook will most likely be correct + inputs_grad_fn = {i.grad_fn for i in inputs if i.grad_fn is not None} + + next_functions = {n[0] for n in grad_fn.next_functions} + + if inputs_grad_fn != next_functions: + warnings.warn( + "Using a non-full backward hook when the forward contains multiple autograd Nodes " + "is deprecated and will be removed in future versions. This hook will be missing " + "some grad_input. Please use register_full_backward_hook to get the documented " + "behavior.", + FutureWarning, + stacklevel=2, + ) + + def register_forward_pre_hook( + self, + hook: Callable[[T, tuple[Any, ...]], Any | None] + | Callable[ + [T, tuple[Any, ...], dict[str, Any]], tuple[Any, dict[str, Any]] | None + ], + *, + prepend: bool = False, + with_kwargs: bool = False, + ) -> RemovableHandle: + r"""Register a forward pre-hook on the module. + + The hook will be called every time before :func:`forward` is invoked. 
+ + + If ``with_kwargs`` is false or not specified, the input contains only + the positional arguments given to the module. Keyword arguments won't be + passed to the hooks and only to the ``forward``. The hook can modify the + input. User can either return a tuple or a single modified value in the + hook. We will wrap the value into a tuple if a single value is returned + (unless that value is already a tuple). The hook should have the + following signature:: + + hook(module, args) -> None or modified input + + If ``with_kwargs`` is true, the forward pre-hook will be passed the + kwargs given to the forward function. And if the hook modifies the + input, both the args and kwargs should be returned. The hook should have + the following signature:: + + hook(module, args, kwargs) -> None or a tuple of modified input and kwargs + + Args: + hook (Callable): The user defined hook to be registered. + prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``forward_pre`` hooks on this + :class:`torch.nn.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``forward_pre`` hooks + on this :class:`torch.nn.Module`. Note that global + ``forward_pre`` hooks registered with + :func:`register_module_forward_pre_hook` will fire before all + hooks registered by this method. + Default: ``False`` + with_kwargs (bool): If true, the ``hook`` will be passed the kwargs + given to the forward function. 
+ Default: ``False`` + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle( + self._forward_pre_hooks, extra_dict=self._forward_pre_hooks_with_kwargs + ) + self._forward_pre_hooks[handle.id] = hook + if with_kwargs: + self._forward_pre_hooks_with_kwargs[handle.id] = True + + if prepend: + self._forward_pre_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] + return handle + + def register_forward_hook( + self, + hook: Callable[[T, tuple[Any, ...], Any], Any | None] + | Callable[[T, tuple[Any, ...], dict[str, Any], Any], Any | None], + *, + prepend: bool = False, + with_kwargs: bool = False, + always_call: bool = False, + ) -> RemovableHandle: + r"""Register a forward hook on the module. + + The hook will be called every time after :func:`forward` has computed an output. + + If ``with_kwargs`` is ``False`` or not specified, the input contains only + the positional arguments given to the module. Keyword arguments won't be + passed to the hooks and only to the ``forward``. The hook can modify the + output. It can modify the input inplace but it will not have effect on + forward since this is called after :func:`forward` is called. The hook + should have the following signature:: + + hook(module, args, output) -> None or modified output + + If ``with_kwargs`` is ``True``, the forward hook will be passed the + ``kwargs`` given to the forward function and be expected to return the + output possibly modified. The hook should have the following signature:: + + hook(module, args, kwargs, output) -> None or modified output + + Args: + hook (Callable): The user defined hook to be registered. + prepend (bool): If ``True``, the provided ``hook`` will be fired + before all existing ``forward`` hooks on this + :class:`torch.nn.Module`. 
Otherwise, the provided + ``hook`` will be fired after all existing ``forward`` hooks on + this :class:`torch.nn.Module`. Note that global + ``forward`` hooks registered with + :func:`register_module_forward_hook` will fire before all hooks + registered by this method. + Default: ``False`` + with_kwargs (bool): If ``True``, the ``hook`` will be passed the + kwargs given to the forward function. + Default: ``False`` + always_call (bool): If ``True`` the ``hook`` will be run regardless of + whether an exception is raised while calling the Module. + Default: ``False`` + + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle( + self._forward_hooks, + extra_dict=[ + self._forward_hooks_with_kwargs, + self._forward_hooks_always_called, + ], + ) + self._forward_hooks[handle.id] = hook + if with_kwargs: + self._forward_hooks_with_kwargs[handle.id] = True + if always_call: + self._forward_hooks_always_called[handle.id] = True + if prepend: + self._forward_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] + return handle + + def _slow_forward(self, *input, **kwargs): + tracing_state = torch._C._get_tracing_state() + if not tracing_state or isinstance(self.forward, torch._C.ScriptMethod): + return self.forward(*input, **kwargs) + recording_scopes = torch.jit._trace._trace_module_map is not None + if recording_scopes: + # type ignore was added because at this point one knows that + # torch.jit._trace._trace_module_map is not Optional and has type Dict[Any, Any] + name = torch.jit._trace._trace_module_map.get(self, None) # type: ignore[operator, union-attr] + if name: + tracing_state.push_scope(name) + else: + recording_scopes = False + try: + result = self.forward(*input, **kwargs) + finally: + if recording_scopes: + tracing_state.pop_scope() + return result + + def _wrapped_call_impl(self, *args, **kwargs): + if self._compiled_call_impl 
is not None: + return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] + else: + return self._call_impl(*args, **kwargs) + + # torchrec tests the code consistency with the following code + # fmt: off + def _call_impl(self, *args, **kwargs): + forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward) + # If we don't have any hooks, we want to skip the rest of the logic in + # this function, and just call forward. + if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks + or _global_backward_pre_hooks or _global_backward_hooks + or _global_forward_hooks or _global_forward_pre_hooks): + return forward_call(*args, **kwargs) + + result = None + called_always_called_hooks = set() + + def inner(): + nonlocal result, args, kwargs + + full_backward_hooks, non_full_backward_hooks = [], [] + backward_pre_hooks = [] + if self._backward_pre_hooks or _global_backward_pre_hooks: + backward_pre_hooks = self._get_backward_pre_hooks() + + if self._backward_hooks or _global_backward_hooks: + full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks() + + if _global_forward_pre_hooks or self._forward_pre_hooks: + for hook_id, hook in ( + *_global_forward_pre_hooks.items(), + *self._forward_pre_hooks.items(), + ): + if hook_id in self._forward_pre_hooks_with_kwargs: + args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc] + if args_kwargs_result is not None: + if isinstance(args_kwargs_result, tuple) and len(args_kwargs_result) == 2: + args, kwargs = args_kwargs_result + else: + raise RuntimeError( + "forward pre-hook must return None or a tuple " + f"of (new_args, new_kwargs), but got {args_kwargs_result}." 
+ ) + else: + args_result = hook(self, args) + if args_result is not None: + if not isinstance(args_result, tuple): + args_result = (args_result,) + args = args_result + + bw_hook = None + if full_backward_hooks or backward_pre_hooks: + bw_hook = BackwardHook(self, full_backward_hooks, backward_pre_hooks) + args = bw_hook.setup_input_hook(args) + + result = forward_call(*args, **kwargs) + if _global_forward_hooks or self._forward_hooks: + for hook_id, hook in ( + *_global_forward_hooks.items(), + *self._forward_hooks.items(), + ): + # mark that always called hook is run + if hook_id in self._forward_hooks_always_called or hook_id in _global_forward_hooks_always_called: + called_always_called_hooks.add(hook_id) + + if hook_id in self._forward_hooks_with_kwargs or hook_id in _global_forward_hooks_with_kwargs: + hook_result = hook(self, args, kwargs, result) + else: + hook_result = hook(self, args, result) + + if hook_result is not None: + result = hook_result + + if bw_hook: + if not isinstance(result, (torch.Tensor, tuple)): + warnings.warn("For backward hooks to be called," + " module output should be a Tensor or a tuple of Tensors" + f" but received {type(result)}", stacklevel=2) + result = bw_hook.setup_output_hook(result) + + # Handle the non-full backward hooks + if non_full_backward_hooks: + var = result + while not isinstance(var, torch.Tensor): + if isinstance(var, dict): + var = next(v for v in var.values() if isinstance(v, torch.Tensor)) + else: + var = var[0] + grad_fn = var.grad_fn + if grad_fn is not None: + for hook in non_full_backward_hooks: + grad_fn.register_hook(_WrappedHook(hook, self)) + self._maybe_warn_non_full_backward_hook(args, result, grad_fn) + + return result + + # This is technically not behavior equivalent when compiling, but it's + # incredibly unlikely we will ever support throwing an exception in NN + # module, and then catching it here, and then reraising it, and then + # catching it again, and expecting the resulting frame to be 
compiled. + # The reraise here just gunks up our exception handling for no good + # reason. Don't try to run the always called hooks in event of + # exception. + if torch.compiler.is_compiling(): + return inner() + + try: + return inner() + except Exception: + # run always called hooks if they have not already been run + # For now only forward hooks have the always_call option but perhaps + # this functionality should be added to full backward hooks as well. + for hook_id, hook in _global_forward_hooks.items(): + if hook_id in _global_forward_hooks_always_called and hook_id not in called_always_called_hooks: # type: ignore[possibly-undefined] + try: + hook_result = hook(self, args, result) # type: ignore[possibly-undefined] + if hook_result is not None: + result = hook_result + except Exception as e: + warnings.warn("global module forward hook with ``always_call=True`` raised an exception " + f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2) + continue + + for hook_id, hook in self._forward_hooks.items(): + if hook_id in self._forward_hooks_always_called and hook_id not in called_always_called_hooks: # type: ignore[possibly-undefined] + try: + if hook_id in self._forward_hooks_with_kwargs: + hook_result = hook(self, args, kwargs, result) # type: ignore[possibly-undefined] + else: + hook_result = hook(self, args, result) # type: ignore[possibly-undefined] + if hook_result is not None: + result = hook_result + except Exception as e: + warnings.warn("module forward hook with ``always_call=True`` raised an exception " + f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2) + continue + # raise exception raised in try block + raise + # fmt: on + + __call__: Callable[..., Any] = _wrapped_call_impl + + def __getstate__(self): + state = self.__dict__.copy() + state.pop("_compiled_call_impl", None) + return state + + def __setstate__(self, state): + self.__dict__.update(state) + + # Support loading old 
checkpoints that don't have the following attrs: + if "_forward_pre_hooks" not in self.__dict__: + self._forward_pre_hooks = OrderedDict() + if "_forward_pre_hooks_with_kwargs" not in self.__dict__: + self._forward_pre_hooks_with_kwargs = OrderedDict() + if "_forward_hooks_with_kwargs" not in self.__dict__: + self._forward_hooks_with_kwargs = OrderedDict() + if "_forward_hooks_always_called" not in self.__dict__: + self._forward_hooks_always_called = OrderedDict() + if "_state_dict_hooks" not in self.__dict__: + self._state_dict_hooks = OrderedDict() + if "_state_dict_pre_hooks" not in self.__dict__: + self._state_dict_pre_hooks = OrderedDict() + if "_load_state_dict_pre_hooks" not in self.__dict__: + self._load_state_dict_pre_hooks = OrderedDict() + if "_load_state_dict_post_hooks" not in self.__dict__: + self._load_state_dict_post_hooks = OrderedDict() + if "_non_persistent_buffers_set" not in self.__dict__: + self._non_persistent_buffers_set = set() + if "_is_full_backward_hook" not in self.__dict__: + self._is_full_backward_hook = None + if "_backward_pre_hooks" not in self.__dict__: + self._backward_pre_hooks = OrderedDict() + + # It is crucial that the return type is not annotated as `Any`, otherwise type checking + # on `torch.nn.Module` and all its subclasses is largely disabled as a result. 
See: + # https://github.com/pytorch/pytorch/pull/115074 + def __getattr__(self, name: str) -> Union[Tensor, "Module"]: + if "_parameters" in self.__dict__: + _parameters = self.__dict__["_parameters"] + if name in _parameters: + return _parameters[name] + if "_buffers" in self.__dict__: + _buffers = self.__dict__["_buffers"] + if name in _buffers: + return _buffers[name] + if "_modules" in self.__dict__: + modules = self.__dict__["_modules"] + if name in modules: + return modules[name] + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}'" + ) + + def __setattr__(self, name: str, value: Union[Tensor, "Module"]) -> None: + def remove_from(*dicts_or_sets) -> None: + for d in dicts_or_sets: + if name in d: + if isinstance(d, dict): + del d[name] + else: + d.discard(name) + + params = self.__dict__.get("_parameters") + if isinstance(value, Parameter): + if params is None: + raise AttributeError( + "cannot assign parameters before Module.__init__() call" + ) + remove_from( + self.__dict__, + self._buffers, + self._modules, + self._non_persistent_buffers_set, + ) + self.register_parameter(name, value) + elif params is not None and name in params: + if value is not None: + raise TypeError( + f"cannot assign '{torch.typename(value)}' as parameter '{name}' " + "(torch.nn.Parameter or None expected)" + ) + self.register_parameter(name, value) + else: + modules = self.__dict__.get("_modules") + if isinstance(value, Module): + if modules is None: + raise AttributeError( + "cannot assign module before Module.__init__() call" + ) + remove_from( + self.__dict__, + self._parameters, + self._buffers, + self._non_persistent_buffers_set, + ) + for hook in _global_module_registration_hooks.values(): + output = hook(self, name, value) + if output is not None: + value = output + modules[name] = value + elif modules is not None and name in modules: + if value is not None: + raise TypeError( + f"cannot assign '{torch.typename(value)}' as child module 
'{name}' " + "(torch.nn.Module or None expected)" + ) + for hook in _global_module_registration_hooks.values(): + output = hook(self, name, value) + if output is not None: + value = output + modules[name] = value + else: + buffers = self.__dict__.get("_buffers") + if isinstance(value, Buffer) or buffers is not None and name in buffers: + if value is not None and not ( + isinstance(value, torch.Tensor) + or hasattr(value, "__torch_function__") + ): + raise TypeError( + f"cannot assign '{torch.typename(value)}' as buffer '{name}' " + "(torch.nn.Buffer, torch.Tensor or None expected)" + ) + if isinstance(value, Buffer): + persistent = value.persistent + else: + persistent = name not in self._non_persistent_buffers_set + # === HACK === + # This whole block below should just be: + # self.register_buffer(name, value, persistent) + + # But to support subclasses of nn.Module that (wrongfully) implement a + # register_buffer() method that doesn't have the "persistent" + # argument. Only pass it in if it is accepted otherwise assume + # it is always true + if ( + getattr(self.register_buffer, "__func__", None) + is torch.nn.Module.register_buffer + ): + self.register_buffer(name, value, persistent) + else: + sign = inspect.signature(self.register_buffer) + if "persistent" in sign.parameters: + self.register_buffer(name, value, persistent) + else: + if not persistent: + raise RuntimeError( + "Registering a non-persistent buffer " + "on a Module subclass that implements " + "register_buffer() without the persistent " + "argument is not allowed." 
+ ) + # Assume that the implementation without the argument has the + # behavior from before the argument was added: persistent=True + self.register_buffer(name, value) + # === HACK END === + else: + super().__setattr__(name, value) + + def __delattr__(self, name) -> None: + if name in self._parameters: + del self._parameters[name] + elif name in self._buffers: + del self._buffers[name] + self._non_persistent_buffers_set.discard(name) + elif name in self._modules: + del self._modules[name] + else: + super().__delattr__(name) + + def _register_state_dict_hook(self, hook): + r"""Register a post-hook for the :meth:`~torch.nn.Module.state_dict` method. + + It should have the following signature:: + hook(module, state_dict, prefix, local_metadata) -> None or state_dict + + The registered hooks can modify the ``state_dict`` inplace or return a new one. + If a new ``state_dict`` is returned, it will only be respected if it is the root + module that :meth:`~nn.Module.state_dict` is called from. + """ + if getattr(hook, "_from_public_api", False): + raise RuntimeError( + "Cannot register the same function as the state dict post hook that was " + "previously registered via register_state_dict_post_hook" + ) + handle = RemovableHandle(self._state_dict_hooks) + self._state_dict_hooks[handle.id] = hook + return handle + + def register_state_dict_post_hook(self, hook): + r"""Register a post-hook for the :meth:`~torch.nn.Module.state_dict` method. + + It should have the following signature:: + hook(module, state_dict, prefix, local_metadata) -> None + + The registered hooks can modify the ``state_dict`` inplace. + """ + # In _register_state_dict_hook there was a bug described in + # https://github.com/pytorch/pytorch/issues/117437 where the return value + # was only respected for the root module but not child submodules. + # We fix this in this public version by only allowing inplace modifications on + # the state_dict by the hook. 
However, since hooks registered via both these + # APIs will be added to `_state_dict_hooks` and the type of `_state_dict_hooks` + # cannot be changed due to many dependencies on it, we mark a hook + # as being registered via the public API by setting `_from_public_api` on it. + # In the implementation of `state_dict`, if the callable does not have this + # flag, the old behavior of respecting the return value will be preserved + # for the root module, otherwise, we ensure that the hook returns None. + hook._from_public_api = True + handle = RemovableHandle(self._state_dict_hooks) + self._state_dict_hooks[handle.id] = hook + return handle + + def register_state_dict_pre_hook(self, hook): + r"""Register a pre-hook for the :meth:`~torch.nn.Module.state_dict` method. + + It should have the following signature:: + hook(module, prefix, keep_vars) -> None + + The registered hooks can be used to perform pre-processing before the ``state_dict`` + call is made. + """ + handle = RemovableHandle(self._state_dict_pre_hooks) + self._state_dict_pre_hooks[handle.id] = hook + return handle + + def _save_to_state_dict(self, destination, prefix, keep_vars) -> None: + r"""Save module state to the `destination` dictionary. + + The `destination` dictionary will contain the state + of the module, but not its descendants. This is called on every + submodule in :meth:`~torch.nn.Module.state_dict`. + + In rare cases, subclasses can achieve class-specific behavior by + overriding this method with custom logic. 
+ + Args: + destination (dict): a dict where state will be stored + prefix (str): the prefix for parameters and buffers used in this + module + """ + for name, param in self._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in self._buffers.items(): + if buf is not None and name not in self._non_persistent_buffers_set: + destination[prefix + name] = buf if keep_vars else buf.detach() + extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX + if ( + getattr(self.__class__, "get_extra_state", Module.get_extra_state) + is not Module.get_extra_state + ): + destination[extra_state_key] = self.get_extra_state() + + # The user can pass an optional arbitrary mappable object to `state_dict`, in which case `state_dict` returns + # back that same object. But if they pass nothing, an `OrderedDict` is created and returned. + T_destination = TypeVar("T_destination", bound=dict[str, Any]) + + @overload + def state_dict( + self, + *, + destination: T_destination, + prefix: str = ..., + keep_vars: bool = ..., + ) -> T_destination: ... + + @overload + def state_dict( + self, + *, + prefix: str = ..., + keep_vars: bool = ..., + ) -> dict[str, Any]: ... + + # TODO: Change `*args` to `*` and remove the corresponding warning in docs when BC allows. + # Also remove the logic for arg parsing together. + def state_dict(self, *args, destination=None, prefix="", keep_vars=False): + r"""Return a dictionary containing references to the whole state of the module. + + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. + Parameters and buffers set to ``None`` are not included. + + .. note:: + The returned object is a shallow copy. It contains references + to the module's parameters and buffers. + + .. warning:: + Currently ``state_dict()`` also accepts positional arguments for + ``destination``, ``prefix`` and ``keep_vars`` in order. 
However, + this is being deprecated and keyword arguments will be enforced in + future releases. + + .. warning:: + Please avoid the use of argument ``destination`` as it is not + designed for end-users. + + Args: + destination (dict, optional): If provided, the state of module will + be updated into the dict and the same object is returned. + Otherwise, an ``OrderedDict`` will be created and returned. + Default: ``None``. + prefix (str, optional): a prefix added to parameter and buffer + names to compose the keys in state_dict. Default: ``''``. + keep_vars (bool, optional): by default the :class:`~torch.Tensor` s + returned in the state dict are detached from autograd. If it's + set to ``True``, detaching will not be performed. + Default: ``False``. + + Returns: + dict: + a dictionary containing a whole state of the module + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> module.state_dict().keys() + ['bias', 'weight'] + + """ + # TODO: Remove `args` and the parsing logic when BC allows. + if len(args) > 0: + # DeprecationWarning is ignored by default + warnings.warn( + "Positional args are being deprecated, use kwargs instead. 
Refer to " + "https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict" + " for details.", + FutureWarning, + stacklevel=2, + ) + if destination is None: + destination = args[0] + if len(args) > 1 and prefix == "": + prefix = args[1] + if len(args) > 2 and keep_vars is False: + keep_vars = args[2] + + if destination is None: + destination = OrderedDict() + # pyrefly: ignore [missing-attribute] + destination._metadata = OrderedDict() + + local_metadata = dict(version=self._version) + if hasattr(destination, "_metadata"): + destination._metadata[prefix[:-1]] = local_metadata + + for hook in self._state_dict_pre_hooks.values(): + hook(self, prefix, keep_vars) + self._save_to_state_dict(destination, prefix, keep_vars) + for name, module in self._modules.items(): + if module is not None: + module.state_dict( + destination=destination, + prefix=prefix + name + ".", + keep_vars=keep_vars, + ) + for hook in self._state_dict_hooks.values(): + hook_result = hook(self, destination, prefix, local_metadata) + if not getattr(hook, "_from_public_api", False): + if hook_result is not None: + destination = hook_result + else: + if hook_result is not None: + raise RuntimeError("state_dict post-hook must return None") + return destination + + def _register_load_state_dict_pre_hook(self, hook, with_module=False): + r"""See :meth:`~torch.nn.Module.register_load_state_dict_pre_hook` for details. + + A subtle difference is that if ``with_module`` is set to ``False``, then the + hook will not take the ``module`` as the first argument whereas + :meth:`~torch.nn.Module.register_load_state_dict_pre_hook` always takes the + ``module`` as the first argument. + + Arguments: + hook (Callable): Callable hook that will be invoked before + loading the state dict. + with_module (bool, optional): Whether or not to pass the module + instance to the hook as the first parameter. 
+ """ + handle = RemovableHandle(self._load_state_dict_pre_hooks) + self._load_state_dict_pre_hooks[handle.id] = _WrappedHook( + hook, self if with_module else None + ) + return handle + + def register_load_state_dict_pre_hook(self, hook): + r"""Register a pre-hook to be run before module's :meth:`~nn.Module.load_state_dict` is called. + + It should have the following signature:: + hook(module, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) -> None # noqa: B950 + + Arguments: + hook (Callable): Callable hook that will be invoked before + loading the state dict. + """ + return self._register_load_state_dict_pre_hook(hook, with_module=True) + + def register_load_state_dict_post_hook(self, hook): + r"""Register a post-hook to be run after module's :meth:`~nn.Module.load_state_dict` is called. + + It should have the following signature:: + hook(module, incompatible_keys) -> None + + The ``module`` argument is the current module that this hook is registered + on, and the ``incompatible_keys`` argument is a ``NamedTuple`` consisting + of attributes ``missing_keys`` and ``unexpected_keys``. ``missing_keys`` + is a ``list`` of ``str`` containing the missing keys and + ``unexpected_keys`` is a ``list`` of ``str`` containing the unexpected keys. + + The given incompatible_keys can be modified inplace if needed. + + Note that the checks performed when calling :func:`load_state_dict` with + ``strict=True`` are affected by modifications the hook makes to + ``missing_keys`` or ``unexpected_keys``, as expected. Additions to either + set of keys will result in an error being thrown when ``strict=True``, and + clearing out both missing and unexpected keys will avoid an error. 
+ + Returns: + :class:`torch.utils.hooks.RemovableHandle`: + a handle that can be used to remove the added hook by calling + ``handle.remove()`` + """ + handle = RemovableHandle(self._load_state_dict_post_hooks) + self._load_state_dict_post_hooks[handle.id] = hook + return handle + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) -> None: + r"""Copy parameters and buffers from :attr:`state_dict` into only this module, but not its descendants. + + This is called on every submodule + in :meth:`~torch.nn.Module.load_state_dict`. Metadata saved for this + module in input :attr:`state_dict` is provided as :attr:`local_metadata`. + For state dicts without metadata, :attr:`local_metadata` is empty. + Subclasses can achieve class-specific backward compatible loading using + the version number at `local_metadata.get("version", None)`. + Additionally, :attr:`local_metadata` can also contain the key + `assign_to_params_buffers` that indicates whether keys should be + assigned their corresponding tensor in the state_dict. + + .. note:: + :attr:`state_dict` is not the same object as the input + :attr:`state_dict` to :meth:`~torch.nn.Module.load_state_dict`. So + it can be modified. + + Args: + state_dict (dict): a dict containing parameters and + persistent buffers. + prefix (str): the prefix for parameters and buffers used in this + module + local_metadata (dict): a dict containing the metadata for this module. 
+ See + strict (bool): whether to strictly enforce that the keys in + :attr:`state_dict` with :attr:`prefix` match the names of + parameters and buffers in this module + missing_keys (list of str): if ``strict=True``, add missing keys to + this list + unexpected_keys (list of str): if ``strict=True``, add unexpected + keys to this list + error_msgs (list of str): error messages should be added to this + list, and will be reported together in + :meth:`~torch.nn.Module.load_state_dict` + """ + for hook in self._load_state_dict_pre_hooks.values(): + hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + persistent_buffers = { + k: v + for k, v in self._buffers.items() + if k not in self._non_persistent_buffers_set + } + local_name_params = itertools.chain( + self._parameters.items(), + # pyrefly: ignore [bad-argument-type] + persistent_buffers.items(), + ) + local_state = {k: v for k, v in local_name_params if v is not None} + assign_to_params_buffers = local_metadata.get("assign_to_params_buffers", False) + use_swap_tensors = torch.__future__.get_swap_module_params_on_conversion() + + for name, param in local_state.items(): + key = prefix + name + if key in state_dict: + input_param = state_dict[key] + if not torch.overrides.is_tensor_like(input_param): + error_msgs.append( + f'While copying the parameter named "{key}", ' + "expected torch.Tensor or Tensor-like object from checkpoint but " + f"received {type(input_param)}" + ) + continue + + # This is used to avoid copying uninitialized parameters into + # non-lazy modules, since they dont have the hook to do the checks + # in such case, it will error when accessing the .shape attribute. 
+ is_param_lazy = torch.nn.parameter.is_lazy(param) + # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+ + if ( + not is_param_lazy + and len(param.shape) == 0 + and len(input_param.shape) == 1 + and input_param.shape[0] == 1 + ): + input_param = input_param[0] + + if not is_param_lazy and input_param.shape != param.shape: + # local shape should match the one in checkpoint + error_msgs.append( + f"size mismatch for {key}: copying a param with shape {input_param.shape} from checkpoint, " + f"the shape in current model is {param.shape}." + ) + continue + + if ( + param.is_meta + and not input_param.is_meta + and not assign_to_params_buffers + ): + warnings.warn( + f"for {key}: copying from a non-meta parameter in the checkpoint to a meta " + "parameter in the current model, which is a no-op. (Did you mean to " + "pass `assign=True` to assign items in the state dictionary to their " + "corresponding key in the module instead of copying them in place?)", + stacklevel=2, + ) + + try: + with torch.no_grad(): + if use_swap_tensors: + new_input_param = param.module_load( + input_param, assign=assign_to_params_buffers + ) + if id(new_input_param) == id(input_param) or id( + new_input_param + ) == id(param): + raise RuntimeError( + "module_load returned one of self or other, please .detach() " + "the result if returning one of the inputs in module_load" + ) + if isinstance(param, torch.nn.Parameter): + if not isinstance(new_input_param, torch.nn.Parameter): + new_input_param = torch.nn.Parameter( + new_input_param, + requires_grad=param.requires_grad, + ) + else: + new_input_param.requires_grad_(param.requires_grad) + torch.utils.swap_tensors(param, new_input_param) + del new_input_param + elif assign_to_params_buffers: + # Shape checks are already done above + if isinstance(param, torch.nn.Parameter): + if not isinstance(input_param, torch.nn.Parameter): + input_param = torch.nn.Parameter( + input_param, requires_grad=param.requires_grad + ) + else: + 
input_param.requires_grad_(param.requires_grad) + setattr(self, name, input_param) + else: + param.copy_(input_param) + except Exception as ex: + action = "swapping" if use_swap_tensors else "copying" + error_msgs.append( + f'While {action} the parameter named "{key}", ' + f"whose dimensions in the model are {param.size()} and " + f"whose dimensions in the checkpoint are {input_param.size()}, " + f"an exception occurred : {ex.args}." + ) + elif strict: + missing_keys.append(key) + + extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX + if ( + getattr(self.__class__, "set_extra_state", Module.set_extra_state) + is not Module.set_extra_state + ): + if extra_state_key in state_dict: + self.set_extra_state(state_dict[extra_state_key]) + elif strict: + missing_keys.append(extra_state_key) + elif strict and (extra_state_key in state_dict): + unexpected_keys.append(extra_state_key) + + if strict: + for key in state_dict: + if key.startswith(prefix) and key != extra_state_key: + input_name = key[len(prefix) :].split(".", 1) + # Must be Module if it have attributes + if len(input_name) > 1: + if input_name[0] not in self._modules: + unexpected_keys.append(key) + elif input_name[0] not in local_state: + unexpected_keys.append(key) + + def load_state_dict( + self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False + ): + r"""Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. + + If :attr:`strict` is ``True``, then + the keys of :attr:`state_dict` must exactly match the keys returned + by this module's :meth:`~torch.nn.Module.state_dict` function. + + .. warning:: + If :attr:`assign` is ``True`` the optimizer must be created after + the call to :attr:`load_state_dict` unless + :func:`~torch.__future__.get_swap_module_params_on_conversion` is ``True``. + + Args: + state_dict (dict): a dict containing parameters and + persistent buffers. 
+ strict (bool, optional): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` + assign (bool, optional): When set to ``False``, the properties of the tensors + in the current module are preserved whereas setting it to ``True`` preserves + properties of the Tensors in the state dict. The only + exception is the ``requires_grad`` field of :class:`~torch.nn.Parameter` + for which the value from the module is preserved. Default: ``False`` + + Returns: + ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: + * ``missing_keys`` is a list of str containing any keys that are expected + by this module but missing from the provided ``state_dict``. + * ``unexpected_keys`` is a list of str containing the keys that are not + expected by this module but present in the provided ``state_dict``. + + Note: + If a parameter or buffer is registered as ``None`` and its corresponding key + exists in :attr:`state_dict`, :meth:`load_state_dict` will raise a + ``RuntimeError``. + """ + if not isinstance(state_dict, Mapping): + raise TypeError( + f"Expected state_dict to be dict-like, got {type(state_dict)}." 
+ ) + + missing_keys: list[str] = [] + unexpected_keys: list[str] = [] + error_msgs: list[str] = [] + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = OrderedDict(state_dict) + if metadata is not None: + # mypy isn't aware that "_metadata" exists in state_dict + state_dict._metadata = metadata # type: ignore[attr-defined] + + def load(module, local_state_dict, prefix="") -> None: + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + if assign: + local_metadata["assign_to_params_buffers"] = assign + module._load_from_state_dict( + local_state_dict, + prefix, + local_metadata, + True, + missing_keys, + unexpected_keys, + error_msgs, + ) + for name, child in module._modules.items(): + if child is not None: + child_prefix = prefix + name + "." + child_state_dict = { + k: v + for k, v in local_state_dict.items() + if k.startswith(child_prefix) + } + load(child, child_state_dict, child_prefix) # noqa: F821 + + # Note that the hook can modify missing_keys and unexpected_keys. + incompatible_keys = _IncompatibleKeys(missing_keys, unexpected_keys) + for hook in module._load_state_dict_post_hooks.values(): + out = hook(module, incompatible_keys) + assert out is None, ( + "Hooks registered with ``register_load_state_dict_post_hook`` are not" + "expected to return new values, if incompatible_keys need to be modified," + "it should be done inplace." + ) + + load(self, state_dict) + del load + + if strict: + if len(unexpected_keys) > 0: + error_msgs.insert( + 0, + "Unexpected key(s) in state_dict: {}. ".format( + ", ".join(f'"{k}"' for k in unexpected_keys) + ), + ) + if len(missing_keys) > 0: + error_msgs.insert( + 0, + "Missing key(s) in state_dict: {}. 
".format( + ", ".join(f'"{k}"' for k in missing_keys) + ), + ) + + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format( + self.__class__.__name__, "\n\t".join(error_msgs) + ) + ) + return _IncompatibleKeys(missing_keys, unexpected_keys) + + def _named_members( + self, get_members_fn, prefix="", recurse=True, remove_duplicate: bool = True + ): + r"""Help yield various names + members of modules.""" + memo = set() + modules = ( + self.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) + if recurse + else [(prefix, self)] + ) + for module_prefix, module in modules: + members = get_members_fn(module) + for k, v in members: + if v is None or v in memo: + continue + if remove_duplicate: + memo.add(v) + name = module_prefix + ("." if module_prefix else "") + k + yield name, v + + def parameters(self, recurse: bool = True) -> Iterator[Parameter]: + r"""Return an iterator over module parameters. + + This is typically passed to an optimizer. + + Args: + recurse (bool): if True, then yields parameters of this module + and all submodules. Otherwise, yields only parameters that + are direct members of this module. + + Yields: + Parameter: module parameter + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> for param in model.parameters(): + >>> print(type(param), param.size()) + (20L,) + (20L, 1L, 5L, 5L) + + """ + for _name, param in self.named_parameters(recurse=recurse): + yield param + + def named_parameters( + self, prefix: str = "", recurse: bool = True, remove_duplicate: bool = True + ) -> Iterator[tuple[str, Parameter]]: + r"""Return an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. + + Args: + prefix (str): prefix to prepend to all parameter names. + recurse (bool): if True, then yields parameters of this module + and all submodules. Otherwise, yields only parameters that + are direct members of this module. 
+ remove_duplicate (bool, optional): whether to remove the duplicated + parameters in the result. Defaults to True. + + Yields: + (str, Parameter): Tuple containing the name and parameter + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> for name, param in self.named_parameters(): + >>> if name in ['bias']: + >>> print(param.size()) + + """ + gen = self._named_members( + lambda module: module._parameters.items(), + prefix=prefix, + recurse=recurse, + remove_duplicate=remove_duplicate, + ) + yield from gen + + def buffers(self, recurse: bool = True) -> Iterator[Tensor]: + r"""Return an iterator over module buffers. + + Args: + recurse (bool): if True, then yields buffers of this module + and all submodules. Otherwise, yields only buffers that + are direct members of this module. + + Yields: + torch.Tensor: module buffer + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> for buf in model.buffers(): + >>> print(type(buf), buf.size()) + (20L,) + (20L, 1L, 5L, 5L) + + """ + for _, buf in self.named_buffers(recurse=recurse): + yield buf + + def named_buffers( + self, prefix: str = "", recurse: bool = True, remove_duplicate: bool = True + ) -> Iterator[tuple[str, Tensor]]: + r"""Return an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. + + Args: + prefix (str): prefix to prepend to all buffer names. + recurse (bool, optional): if True, then yields buffers of this module + and all submodules. Otherwise, yields only buffers that + are direct members of this module. Defaults to True. + remove_duplicate (bool, optional): whether to remove the duplicated buffers in the result. Defaults to True. 
+ + Yields: + (str, torch.Tensor): Tuple containing the name and buffer + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> for name, buf in self.named_buffers(): + >>> if name in ['running_var']: + >>> print(buf.size()) + + """ + gen = self._named_members( + lambda module: module._buffers.items(), + prefix=prefix, + recurse=recurse, + remove_duplicate=remove_duplicate, + ) + yield from gen + + def children(self) -> Iterator["Module"]: + r"""Return an iterator over immediate children modules. + + Yields: + Module: a child module + """ + for _name, module in self.named_children(): + yield module + + def named_children(self) -> Iterator[tuple[str, "Module"]]: + r"""Return an iterator over immediate children modules, yielding both the name of the module as well as the module itself. + + Yields: + (str, Module): Tuple containing a name and child module + + Example:: + + >>> # xdoctest: +SKIP("undefined vars") + >>> for name, module in model.named_children(): + >>> if name in ['conv4', 'conv5']: + >>> print(module) + + """ + memo = set() + for name, module in self._modules.items(): + if module is not None and module not in memo: + memo.add(module) + yield name, module + + def modules(self) -> Iterator["Module"]: + r"""Return an iterator over all modules in the network. + + Yields: + Module: a module in the network + + Note: + Duplicate modules are returned only once. In the following + example, ``l`` will be returned only once. + + Example:: + + >>> l = nn.Linear(2, 2) + >>> net = nn.Sequential(l, l) + >>> for idx, m in enumerate(net.modules()): + ... 
print(idx, '->', m) + + 0 -> Sequential( + (0): Linear(in_features=2, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=2, bias=True) + ) + 1 -> Linear(in_features=2, out_features=2, bias=True) + + """ + for _, module in self.named_modules(): + yield module + + def named_modules( + self, + memo: set["Module"] | None = None, + prefix: str = "", + remove_duplicate: bool = True, + ): + r"""Return an iterator over all modules in the network, yielding both the name of the module as well as the module itself. + + Args: + memo: a memo to store the set of modules already added to the result + prefix: a prefix that will be added to the name of the module + remove_duplicate: whether to remove the duplicated module instances in the result + or not + + Yields: + (str, Module): Tuple of name and module + + Note: + Duplicate modules are returned only once. In the following + example, ``l`` will be returned only once. + + Example:: + + >>> l = nn.Linear(2, 2) + >>> net = nn.Sequential(l, l) + >>> for idx, m in enumerate(net.named_modules()): + ... print(idx, '->', m) + + 0 -> ('', Sequential( + (0): Linear(in_features=2, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=2, bias=True) + )) + 1 -> ('0', Linear(in_features=2, out_features=2, bias=True)) + + """ + if memo is None: + memo = set() + if self not in memo: + if remove_duplicate: + memo.add(self) + yield prefix, self + for name, module in self._modules.items(): + if module is None: + continue + submodule_prefix = prefix + ("." if prefix else "") + name + yield from module.named_modules( + memo, submodule_prefix, remove_duplicate + ) + + def train(self, mode: bool = True) -> Self: + r"""Set the module in training mode. + + This has an effect only on certain modules. See the documentation of + particular modules for details of their behaviors in training/evaluation + mode, i.e., whether they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, + etc. 
+ + Args: + mode (bool): whether to set training mode (``True``) or evaluation + mode (``False``). Default: ``True``. + + Returns: + Module: self + """ + if not isinstance(mode, bool): + raise ValueError("training mode is expected to be boolean") + self.training = mode + for module in self.children(): + module.train(mode) + return self + + def eval(self) -> Self: + r"""Set the module in evaluation mode. + + This has an effect only on certain modules. See the documentation of + particular modules for details of their behaviors in training/evaluation + mode, i.e. whether they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, + etc. + + This is equivalent with :meth:`self.train(False) `. + + See :ref:`locally-disable-grad-doc` for a comparison between + `.eval()` and several similar mechanisms that may be confused with it. + + Returns: + Module: self + """ + return self.train(False) + + def requires_grad_(self, requires_grad: bool = True) -> Self: + r"""Change if autograd should record operations on parameters in this module. + + This method sets the parameters' :attr:`requires_grad` attributes + in-place. + + This method is helpful for freezing part of the module for finetuning + or training parts of a model individually (e.g., GAN training). + + See :ref:`locally-disable-grad-doc` for a comparison between + `.requires_grad_()` and several similar mechanisms that may be confused with it. + + Args: + requires_grad (bool): whether autograd should record operations on + parameters in this module. Default: ``True``. + + Returns: + Module: self + """ + for p in self.parameters(): + p.requires_grad_(requires_grad) + return self + + def zero_grad(self, set_to_none: bool = True) -> None: + r"""Reset gradients of all model parameters. + + See similar function under :class:`torch.optim.Optimizer` for more context. + + Args: + set_to_none (bool): instead of setting to zero, set the grads to None. + See :meth:`torch.optim.Optimizer.zero_grad` for details. 
+ """ + if getattr(self, "_is_replica", False): + warnings.warn( + "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. " + "The parameters are copied (in a differentiable manner) from the original module. " + "This means they are not leaf nodes in autograd and so don't accumulate gradients. " + "If you need gradients in your forward method, consider using autograd.grad instead.", + stacklevel=2, + ) + + for p in self.parameters(): + if p.grad is not None: + if set_to_none: + p.grad = None + else: + if p.grad.grad_fn is not None: + p.grad.detach_() + else: + p.grad.requires_grad_(False) + p.grad.zero_() + + def share_memory(self) -> Self: + r"""See :meth:`torch.Tensor.share_memory_`.""" + return self._apply(lambda t: t.share_memory_()) + + def _get_name(self): + return self.__class__.__name__ + + def extra_repr(self) -> str: + r"""Return the extra representation of the module. + + To print customized extra information, you should re-implement + this method in your own modules. Both single-line and multi-line + strings are acceptable. 
+ """ + return "" + + def __repr__(self) -> str: + # We treat the extra repr like the sub-module, one item per line + extra_lines = [] + extra_repr = self.extra_repr() + # empty string will be split into list [''] + if extra_repr: + extra_lines = extra_repr.split("\n") + child_lines = [] + for key, module in self._modules.items(): + mod_str = repr(module) + mod_str = _addindent(mod_str, 2) + child_lines.append("(" + key + "): " + mod_str) + lines = extra_lines + child_lines + + main_str = self._get_name() + "(" + if lines: + # simple one-liner info, which most builtin Modules will use + if len(extra_lines) == 1 and not child_lines: + main_str += extra_lines[0] + else: + main_str += "\n " + "\n ".join(lines) + "\n" + + main_str += ")" + return main_str + + def __dir__(self): + module_attrs = dir(self.__class__) + attrs = list(self.__dict__.keys()) + parameters = list(self._parameters.keys()) + modules = list(self._modules.keys()) + buffers = list(self._buffers.keys()) + keys = module_attrs + attrs + parameters + modules + buffers + + # Eliminate attrs that are not legal Python variable names + keys = [key for key in keys if not key[0].isdigit()] + + return sorted(keys) + + def _replicate_for_data_parallel(self): + replica = self.__new__(type(self)) + replica.__dict__ = self.__dict__.copy() + + # replicas do not have parameters themselves, the replicas reference the original + # module. + replica._parameters = {} + replica._buffers = replica._buffers.copy() + replica._modules = replica._modules.copy() + replica._is_replica = True # type: ignore[assignment] + + return replica + + def compile(self, *args, **kwargs) -> None: + """ + Compile this Module's forward using :func:`torch.compile`. + + This Module's `__call__` method is compiled and all arguments are passed as-is + to :func:`torch.compile`. + + See :func:`torch.compile` for details on the arguments for this function. 
+ """ + self._compiled_call_impl = torch.compile(self._call_impl, *args, **kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/normalization.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..d492cdb3cf5a03c647760401fcc6f8709d87f1bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/normalization.py @@ -0,0 +1,430 @@ +# mypy: allow-untyped-defs +import numbers +from typing import Union + +import torch +from torch import Size, Tensor +from torch.nn import functional as F, init +from torch.nn.parameter import Parameter + +from ._functions import CrossMapLRN2d as _cross_map_lrn2d +from .module import Module + + +__all__ = ["LocalResponseNorm", "CrossMapLRN2d", "LayerNorm", "GroupNorm", "RMSNorm"] + + +class LocalResponseNorm(Module): + r"""Applies local response normalization over an input signal. + + The input signal is composed of several input planes, where channels occupy the second dimension. + Applies normalization across channels. + + .. math:: + b_{c} = a_{c}\left(k + \frac{\alpha}{n} + \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta} + + Args: + size: amount of neighbouring channels used for normalization + alpha: multiplicative factor. Default: 0.0001 + beta: exponent. Default: 0.75 + k: additive factor. 
Default: 1 + + Shape: + - Input: :math:`(N, C, *)` + - Output: :math:`(N, C, *)` (same shape as input) + + Examples:: + + >>> lrn = nn.LocalResponseNorm(2) + >>> signal_2d = torch.randn(32, 5, 24, 24) + >>> signal_4d = torch.randn(16, 5, 7, 7, 7, 7) + >>> output_2d = lrn(signal_2d) + >>> output_4d = lrn(signal_4d) + + """ + + __constants__ = ["size", "alpha", "beta", "k"] + size: int + alpha: float + beta: float + k: float + + def __init__( + self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.0 + ) -> None: + super().__init__() + self.size = size + self.alpha = alpha + self.beta = beta + self.k = k + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.local_response_norm(input, self.size, self.alpha, self.beta, self.k) + + def extra_repr(self): + """ + Return the extra representation of the module. + """ + return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__) + + +class CrossMapLRN2d(Module): + size: int + alpha: float + beta: float + k: float + + def __init__( + self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1 + ) -> None: + super().__init__() + self.size = size + self.alpha = alpha + self.beta = beta + self.k = k + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return _cross_map_lrn2d.apply(input, self.size, self.alpha, self.beta, self.k) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__) + + +_shape_t = Union[int, list[int], Size] + + +class LayerNorm(Module): + r"""Applies Layer Normalization over a mini-batch of inputs. + + This layer implements the operation as described in + the paper `Layer Normalization `__ + + .. 
math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated over the last `D` dimensions, where `D` + is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape` + is ``(3, 5)`` (a 2-dimensional shape), the mean and standard-deviation are computed over + the last 2 dimensions of the input (i.e. ``input.mean((-2, -1))``). + :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of + :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``. + The variance is calculated via the biased estimator, equivalent to + `torch.var(input, correction=0)`. + + .. note:: + Unlike Batch Normalization and Instance Normalization, which applies + scalar scale and bias for each entire channel/plane with the + :attr:`affine` option, Layer Normalization applies per-element scale and + bias with :attr:`elementwise_affine`. + + This layer uses statistics computed from input data in both training and + evaluation modes. + + Args: + normalized_shape (int or list or torch.Size): input shape from an expected input + of size + + .. math:: + [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] + \times \ldots \times \text{normalized\_shape}[-1]] + + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps: a value added to the denominator for numerical stability. Default: 1e-5 + elementwise_affine: a boolean value that when set to ``True``, this module + has learnable per-element affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. + bias: If set to ``False``, the layer will not learn an additive bias (only relevant if + :attr:`elementwise_affine` is ``True``). Default: ``True``. 
+ + Attributes: + weight: the learnable weights of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 1. + bias: the learnable bias of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 0. + + Shape: + - Input: :math:`(N, *)` + - Output: :math:`(N, *)` (same shape as input) + + Examples:: + + >>> # NLP Example + >>> batch, sentence_length, embedding_dim = 20, 5, 10 + >>> embedding = torch.randn(batch, sentence_length, embedding_dim) + >>> layer_norm = nn.LayerNorm(embedding_dim) + >>> # Activate module + >>> layer_norm(embedding) + >>> + >>> # Image Example + >>> N, C, H, W = 20, 5, 10, 10 + >>> input = torch.randn(N, C, H, W) + >>> # Normalize over the last three dimensions (i.e. the channel and spatial dimensions) + >>> # as shown in the image below + >>> layer_norm = nn.LayerNorm([C, H, W]) + >>> output = layer_norm(input) + + .. image:: ../_static/img/nn/layer_norm.jpg + :scale: 50 % + + """ + + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: tuple[int, ...] 
+ eps: float + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + if bias: + self.bias = Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + else: + self.register_parameter("bias", None) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + init.ones_(self.weight) + if self.bias is not None: + init.zeros_(self.bias) + + def forward(self, input: Tensor) -> Tensor: + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps + ) + + def extra_repr(self) -> str: + return ( + "{normalized_shape}, eps={eps}, " + "elementwise_affine={elementwise_affine}".format(**self.__dict__) + ) + + +class GroupNorm(Module): + r"""Applies Group Normalization over a mini-batch of inputs. + + This layer implements the operation as described in + the paper `Group Normalization `__ + + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The input channels are separated into :attr:`num_groups` groups, each containing + ``num_channels / num_groups`` channels. :attr:`num_channels` must be divisible by + :attr:`num_groups`. The mean and standard-deviation are calculated + separately over each group. 
:math:`\gamma` and :math:`\beta` are learnable + per-channel affine transform parameter vectors of size :attr:`num_channels` if + :attr:`affine` is ``True``. + The variance is calculated via the biased estimator, equivalent to + `torch.var(input, correction=0)`. + + This layer uses statistics computed from input data in both training and + evaluation modes. + + Args: + num_groups (int): number of groups to separate the channels into + num_channels (int): number of channels expected in input + eps: a value added to the denominator for numerical stability. Default: 1e-5 + affine: a boolean value that when set to ``True``, this module + has learnable per-channel affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. + + Shape: + - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}` + - Output: :math:`(N, C, *)` (same shape as input) + + Examples:: + + >>> input = torch.randn(20, 6, 10, 10) + >>> # Separate 6 channels into 3 groups + >>> m = nn.GroupNorm(3, 6) + >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm) + >>> m = nn.GroupNorm(6, 6) + >>> # Put all 6 channels into a single group (equivalent with LayerNorm) + >>> m = nn.GroupNorm(1, 6) + >>> # Activating the module + >>> output = m(input) + """ + + __constants__ = ["num_groups", "num_channels", "eps", "affine"] + num_groups: int + num_channels: int + eps: float + affine: bool + + def __init__( + self, + num_groups: int, + num_channels: int, + eps: float = 1e-5, + affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if num_channels % num_groups != 0: + raise ValueError( + f"num_channels ({num_channels}) must be divisible by num_groups ({num_groups})" + ) + + self.num_groups = num_groups + self.num_channels = num_channels + self.eps = eps + self.affine = affine + if self.affine: + self.weight = Parameter(torch.empty(num_channels, **factory_kwargs)) + 
self.bias = Parameter(torch.empty(num_channels, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.affine: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input: Tensor) -> Tensor: + return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps) + + def extra_repr(self) -> str: + return "{num_groups}, {num_channels}, eps={eps}, affine={affine}".format( + **self.__dict__ + ) + + +class RMSNorm(Module): + r"""Applies Root Mean Square Layer Normalization over a mini-batch of inputs. + + This layer implements the operation as described in + the paper `Root Mean Square Layer Normalization `__ + + .. math:: + y_i = \frac{x_i}{\mathrm{RMS}(x)} * \gamma_i, \quad + \text{where} \quad \text{RMS}(x) = \sqrt{\epsilon + \frac{1}{n} \sum_{i=1}^{n} x_i^2} + + The RMS is taken over the last ``D`` dimensions, where ``D`` + is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape` + is ``(3, 5)`` (a 2-dimensional shape), the RMS is computed over + the last 2 dimensions of the input. + + Args: + normalized_shape (int or list or torch.Size): input shape from an expected input + of size + + .. math:: + [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] + \times \ldots \times \text{normalized\_shape}[-1]] + + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps: a value added to the denominator for numerical stability. Default: ``torch.finfo(x.dtype).eps`` + elementwise_affine: a boolean value that when set to ``True``, this module + has learnable per-element affine parameters initialized to ones (for weights). Default: ``True``. 
+ + Shape: + - Input: :math:`(N, *)` + - Output: :math:`(N, *)` (same shape as input) + + Examples:: + + >>> rms_norm = nn.RMSNorm([2, 3]) + >>> input = torch.randn(2, 2, 3) + >>> rms_norm(input) + + """ + + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: tuple[int, ...] + eps: float | None + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float | None = None, + elementwise_affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + else: + self.register_parameter("weight", None) + self.reset_parameters() + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in __init__. + """ + if self.elementwise_affine: + init.ones_(self.weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Runs the forward pass. + """ + return F.rms_norm(x, self.normalized_shape, self.weight, self.eps) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
+ """ + return ( + "{normalized_shape}, eps={eps}, " + "elementwise_affine={elementwise_affine}".format(**self.__dict__) + ) + + +# TODO: ContrastiveNorm2d +# TODO: DivisiveNorm2d +# TODO: SubtractiveNorm2d diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/padding.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/padding.py new file mode 100644 index 0000000000000000000000000000000000000000..d5aa1e0d425548857d20b093041b190bc7f2f645 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/padding.py @@ -0,0 +1,842 @@ +# mypy: allow-untyped-defs +from collections.abc import Sequence + +import torch.nn.functional as F +from torch import Tensor +from torch.nn.common_types import _size_2_t, _size_4_t, _size_6_t + +from .module import Module +from .utils import _ntuple, _pair, _quadruple + + +# TODO: grad_output size asserts in THNN + +__all__ = [ + "CircularPad1d", + "CircularPad2d", + "CircularPad3d", + "ConstantPad1d", + "ConstantPad2d", + "ConstantPad3d", + "ReflectionPad1d", + "ReflectionPad2d", + "ReflectionPad3d", + "ReplicationPad1d", + "ReplicationPad2d", + "ReplicationPad3d", + "ZeroPad1d", + "ZeroPad2d", + "ZeroPad3d", +] + + +class _CircularPadNd(Module): + __constants__ = ["padding"] + padding: Sequence[int] + + def _check_input_dim(self, input): + raise NotImplementedError + + def forward(self, input: Tensor) -> Tensor: + self._check_input_dim(input) + return F.pad(input, self.padding, "circular") + + def extra_repr(self) -> str: + return f"{self.padding}" + + +class CircularPad1d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. 
+ + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + Note that padding size should be less than or equal to the corresponding input dimension. + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. + - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.CircularPad1d(2) + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input + tensor([[[0., 1., 2., 3.], + [4., 5., 6., 7.]]]) + >>> m(input) + tensor([[[2., 3., 0., 1., 2., 3., 0., 1.], + [6., 7., 4., 5., 6., 7., 4., 5.]]]) + >>> # using different paddings for different sides + >>> m = nn.CircularPad1d((3, 1)) + >>> m(input) + tensor([[[1., 2., 3., 0., 1., 2., 3., 0.], + [5., 6., 7., 4., 5., 6., 7., 4.]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__() + self.padding = _pair(padding) + + def _check_input_dim(self, input) -> None: + if input.dim() != 2 and input.dim() != 3: + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +class CircularPad2d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + Note that padding size should be less than or equal to the corresponding input dimension. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.CircularPad2d(2) + >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[4., 5., 3., 4., 5., 3., 4.], + [7., 8., 6., 7., 8., 6., 7.], + [1., 2., 0., 1., 2., 0., 1.], + [4., 5., 3., 4., 5., 3., 4.], + [7., 8., 6., 7., 8., 6., 7.], + [1., 2., 0., 1., 2., 0., 1.], + [4., 5., 3., 4., 5., 3., 4.]]]]) + >>> # using different paddings for different sides + >>> m = nn.CircularPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[5., 3., 4., 5., 3.], + [8., 6., 7., 8., 6.], + [2., 0., 1., 2., 0.], + [5., 3., 4., 5., 3.], + [8., 6., 7., 8., 6.]]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__() + self.padding = _quadruple(padding) + + def _check_input_dim(self, input) -> None: + if input.dim() != 3 and input.dim() != 4: + raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)") + + +class CircularPad3d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. 
+ + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + Note that padding size should be less than or equal to the corresponding input dimension. + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.CircularPad3d(3) + >>> input = torch.randn(16, 3, 8, 320, 480) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.CircularPad3d((3, 3, 6, 6, 1, 1)) + >>> output = m(input) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__() + self.padding = _ntuple(6)(padding) + + def _check_input_dim(self, input) -> None: + if input.dim() != 4 and input.dim() != 5: + raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)") + + +class _ConstantPadNd(Module): + __constants__ = ["padding", "value"] + value: float + padding: Sequence[int] + + def __init__(self, value: float) -> None: + super().__init__() + self.value = value + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, "constant", self.value) + + def extra_repr(self) -> str: + return f"padding={self.padding}, value={self.value}" + + +class ConstantPad1d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a 
constant value. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in both boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. + - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ConstantPad1d(2, 3.5) + >>> input = torch.randn(1, 2, 4) + >>> input + tensor([[[-1.0491, -0.7152, -0.0749, 0.8530], + [-1.3287, 1.8966, 0.1466, -0.2771]]]) + >>> m(input) + tensor([[[ 3.5000, 3.5000, -1.0491, -0.7152, -0.0749, 0.8530, 3.5000, + 3.5000], + [ 3.5000, 3.5000, -1.3287, 1.8966, 0.1466, -0.2771, 3.5000, + 3.5000]]]) + >>> m = nn.ConstantPad1d(2, 3.5) + >>> input = torch.randn(1, 2, 3) + >>> input + tensor([[[ 1.6616, 1.4523, -1.1255], + [-3.6372, 0.1182, -1.8652]]]) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 1.6616, 1.4523, -1.1255, 3.5000, 3.5000], + [ 3.5000, 3.5000, -3.6372, 0.1182, -1.8652, 3.5000, 3.5000]]]) + >>> # using different paddings for different sides + >>> m = nn.ConstantPad1d((3, 1), 3.5) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 3.5000, 1.6616, 1.4523, -1.1255, 3.5000], + [ 3.5000, 3.5000, 3.5000, -3.6372, 0.1182, -1.8652, 3.5000]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int] + + def __init__(self, padding: _size_2_t, value: float) -> None: + super().__init__(value) + self.padding = _pair(padding) + + +class ConstantPad2d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ConstantPad2d(2, 3.5) + >>> input = torch.randn(1, 2, 2) + >>> input + tensor([[[ 1.6585, 0.4320], + [-0.8701, -0.4649]]]) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 1.6585, 0.4320, 3.5000, 3.5000], + [ 3.5000, 3.5000, -0.8701, -0.4649, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000]]]) + >>> # using different paddings for different sides + >>> m = nn.ConstantPad2d((3, 0, 2, 1), 3.5) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 1.6585, 0.4320], + [ 3.5000, 3.5000, 3.5000, -0.8701, -0.4649], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000]]]) + """ + + __constants__ = ["padding", "value"] + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t, value: float) -> None: + super().__init__(value) + self.padding = _quadruple(padding) + + +class ConstantPad3d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ConstantPad3d(3, 3.5) + >>> input = torch.randn(16, 3, 10, 20, 30) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ConstantPad3d((3, 3, 6, 6, 0, 1), 3.5) + >>> output = m(input) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t, value: float) -> None: + super().__init__(value) + self.padding = _ntuple(6)(padding) + + +class _ReflectionPadNd(Module): + __constants__ = ["padding"] + padding: Sequence[int] + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, "reflect") + + def extra_repr(self) -> str: + return f"{self.padding}" + + +class ReflectionPad1d(_ReflectionPadNd): + r"""Pads the input tensor using the reflection of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + Note that padding size should be less than the corresponding input dimension. + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. 
+ - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ReflectionPad1d(2) + >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles") + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input + tensor([[[0., 1., 2., 3.], + [4., 5., 6., 7.]]]) + >>> m(input) + tensor([[[2., 1., 0., 1., 2., 3., 2., 1.], + [6., 5., 4., 5., 6., 7., 6., 5.]]]) + >>> # using different paddings for different sides + >>> m = nn.ReflectionPad1d((3, 1)) + >>> m(input) + tensor([[[3., 2., 1., 0., 1., 2., 3., 2.], + [7., 6., 5., 4., 5., 6., 7., 6.]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__() + self.padding = _pair(padding) + + +class ReflectionPad2d(_ReflectionPadNd): + r"""Pads the input tensor using the reflection of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + Note that padding size should be less than the corresponding input dimension. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})` where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.ReflectionPad2d(2) + >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[8., 7., 6., 7., 8., 7., 6.], + [5., 4., 3., 4., 5., 4., 3.], + [2., 1., 0., 1., 2., 1., 0.], + [5., 4., 3., 4., 5., 4., 3.], + [8., 7., 6., 7., 8., 7., 6.], + [5., 4., 3., 4., 5., 4., 3.], + [2., 1., 0., 1., 2., 1., 0.]]]]) + >>> # using different paddings for different sides + >>> m = nn.ReflectionPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[7., 6., 7., 8., 7.], + [4., 3., 4., 5., 4.], + [1., 0., 1., 2., 1.], + [4., 3., 4., 5., 4.], + [7., 6., 7., 8., 7.]]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__() + self.padding = _quadruple(padding) + + +class ReflectionPad3d(_ReflectionPadNd): + r"""Pads the input tensor using the reflection of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + Note that padding size should be less than the corresponding input dimension. + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.ReflectionPad3d(1) + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 1, 2, 2, 2) + >>> m(input) + tensor([[[[[7., 6., 7., 6.], + [5., 4., 5., 4.], + [7., 6., 7., 6.], + [5., 4., 5., 4.]], + [[3., 2., 3., 2.], + [1., 0., 1., 0.], + [3., 2., 3., 2.], + [1., 0., 1., 0.]], + [[7., 6., 7., 6.], + [5., 4., 5., 4.], + [7., 6., 7., 6.], + [5., 4., 5., 4.]], + [[3., 2., 3., 2.], + [1., 0., 1., 0.], + [3., 2., 3., 2.], + [1., 0., 1., 0.]]]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__() + self.padding = _ntuple(6)(padding) + + +class _ReplicationPadNd(Module): + __constants__ = ["padding"] + padding: Sequence[int] + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, "replicate") + + def extra_repr(self) -> str: + return f"{self.padding}" + + +class ReplicationPad1d(_ReplicationPadNd): + r"""Pads the input tensor using replication of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + Note that the output dimensions must remain positive. + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. 
+ - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.ReplicationPad1d(2) + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input + tensor([[[0., 1., 2., 3.], + [4., 5., 6., 7.]]]) + >>> m(input) + tensor([[[0., 0., 0., 1., 2., 3., 3., 3.], + [4., 4., 4., 5., 6., 7., 7., 7.]]]) + >>> # using different paddings for different sides + >>> m = nn.ReplicationPad1d((3, 1)) + >>> m(input) + tensor([[[0., 0., 0., 0., 1., 2., 3., 3.], + [4., 4., 4., 4., 5., 6., 7., 7.]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__() + self.padding = _pair(padding) + + +class ReplicationPad2d(_ReplicationPadNd): + r"""Pads the input tensor using replication of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + Note that the output dimensions must remain positive. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ReplicationPad2d(2) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[0., 0., 0., 1., 2., 2., 2.], + [0., 0., 0., 1., 2., 2., 2.], + [0., 0., 0., 1., 2., 2., 2.], + [3., 3., 3., 4., 5., 5., 5.], + [6., 6., 6., 7., 8., 8., 8.], + [6., 6., 6., 7., 8., 8., 8.], + [6., 6., 6., 7., 8., 8., 8.]]]]) + >>> # using different paddings for different sides + >>> m = nn.ReplicationPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[0., 0., 1., 2., 2.], + [0., 0., 1., 2., 2.], + [0., 0., 1., 2., 2.], + [3., 3., 4., 5., 5.], + [6., 6., 7., 8., 8.]]]]) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__() + self.padding = _quadruple(padding) + + +class ReplicationPad3d(_ReplicationPadNd): + r"""Pads the input tensor using replication of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + Note that the output dimensions must remain positive. + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ReplicationPad3d(3) + >>> input = torch.randn(16, 3, 8, 320, 480) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ReplicationPad3d((3, 3, 6, 6, 1, 1)) + >>> output = m(input) + """ + + # pyrefly: ignore [bad-override] + padding: tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__() + self.padding = _ntuple(6)(padding) + + +class ZeroPad1d(ConstantPad1d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in both boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. 
+ - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ZeroPad1d(2) + >>> input = torch.randn(1, 2, 4) + >>> input + tensor([[[-1.0491, -0.7152, -0.0749, 0.8530], + [-1.3287, 1.8966, 0.1466, -0.2771]]]) + >>> m(input) + tensor([[[ 0.0000, 0.0000, -1.0491, -0.7152, -0.0749, 0.8530, 0.0000, + 0.0000], + [ 0.0000, 0.0000, -1.3287, 1.8966, 0.1466, -0.2771, 0.0000, + 0.0000]]]) + >>> m = nn.ZeroPad1d(2) + >>> input = torch.randn(1, 2, 3) + >>> input + tensor([[[ 1.6616, 1.4523, -1.1255], + [-3.6372, 0.1182, -1.8652]]]) + >>> m(input) + tensor([[[ 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000, 0.0000], + [ 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000, 0.0000]]]) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad1d((3, 1)) + >>> m(input) + tensor([[[ 0.0000, 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000], + [ 0.0000, 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000]]]) + """ + + padding: tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__(padding, 0.0) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.padding}" + + +class ZeroPad2d(ConstantPad2d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ZeroPad2d(2) + >>> input = torch.randn(1, 1, 3, 3) + >>> input + tensor([[[[-0.1678, -0.4418, 1.9466], + [ 0.9604, -0.4219, -0.5241], + [-0.9162, -0.5436, -0.6446]]]]) + >>> m(input) + tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1678, -0.4418, 1.9466, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.9604, -0.4219, -0.5241, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9162, -0.5436, -0.6446, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.1678, -0.4418, 1.9466, 0.0000], + [ 0.0000, 0.9604, -0.4219, -0.5241, 0.0000], + [ 0.0000, -0.9162, -0.5436, -0.6446, 0.0000]]]]) + """ + + padding: tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__(padding, 0.0) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.padding}" + + +class ZeroPad3d(ConstantPad3d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ZeroPad3d(3) + >>> input = torch.randn(16, 3, 10, 20, 30) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1)) + >>> output = m(input) + """ + + padding: tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__(padding, 0.0) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.padding}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pixelshuffle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pixelshuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..74c9e0878f0b5ecc48878c63115aafc2128b3afd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pixelshuffle.py @@ -0,0 +1,127 @@ +import torch.nn.functional as F +from torch import Tensor + +from .module import Module + + +__all__ = ["PixelShuffle", "PixelUnshuffle"] + + +class PixelShuffle(Module): + r"""Rearrange elements in a tensor according to an upscaling factor. + + Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(*, C, H \times r, W \times r)`, where r is an upscale factor. 
+ + This is useful for implementing efficient sub-pixel convolution + with a stride of :math:`1/r`. + + See the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ + by Shi et al. (2016) for more details. + + Args: + upscale_factor (int): factor to increase spatial resolution by + + Shape: + - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions + - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where + + .. math:: + C_{out} = C_{in} \div \text{upscale\_factor}^2 + + .. math:: + H_{out} = H_{in} \times \text{upscale\_factor} + + .. math:: + W_{out} = W_{in} \times \text{upscale\_factor} + + Examples:: + + >>> pixel_shuffle = nn.PixelShuffle(3) + >>> input = torch.randn(1, 9, 4, 4) + >>> output = pixel_shuffle(input) + >>> print(output.size()) + torch.Size([1, 1, 12, 12]) + + .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network: + https://arxiv.org/abs/1609.05158 + """ + + __constants__ = ["upscale_factor"] + upscale_factor: int + + def __init__(self, upscale_factor: int) -> None: + super().__init__() + self.upscale_factor = upscale_factor + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.pixel_shuffle(input, self.upscale_factor) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"upscale_factor={self.upscale_factor}" + + +class PixelUnshuffle(Module): + r"""Reverse the PixelShuffle operation. + + Reverses the :class:`~torch.nn.PixelShuffle` operation by rearranging elements + in a tensor of shape :math:`(*, C, H \times r, W \times r)` to a tensor of shape + :math:`(*, C \times r^2, H, W)`, where r is a downscale factor. + + See the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ + by Shi et al. (2016) for more details. 
+ + Args: + downscale_factor (int): factor to decrease spatial resolution by + + Shape: + - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions + - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where + + .. math:: + C_{out} = C_{in} \times \text{downscale\_factor}^2 + + .. math:: + H_{out} = H_{in} \div \text{downscale\_factor} + + .. math:: + W_{out} = W_{in} \div \text{downscale\_factor} + + Examples:: + + >>> pixel_unshuffle = nn.PixelUnshuffle(3) + >>> input = torch.randn(1, 1, 12, 12) + >>> output = pixel_unshuffle(input) + >>> print(output.size()) + torch.Size([1, 9, 4, 4]) + + .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network: + https://arxiv.org/abs/1609.05158 + """ + + __constants__ = ["downscale_factor"] + downscale_factor: int + + def __init__(self, downscale_factor: int) -> None: + super().__init__() + self.downscale_factor = downscale_factor + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.pixel_unshuffle(input, self.downscale_factor) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
+ """ + return f"downscale_factor={self.downscale_factor}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pooling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc57c25b168396fa9ceff5b32fd368befa094af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pooling.py @@ -0,0 +1,1550 @@ +import torch.nn.functional as F +from torch import Tensor +from torch.nn.common_types import ( + _ratio_2_t, + _ratio_3_t, + _size_1_t, + _size_2_opt_t, + _size_2_t, + _size_3_opt_t, + _size_3_t, + _size_any_opt_t, + _size_any_t, +) + +from .module import Module +from .utils import _pair, _single, _triple + + +__all__ = [ + "MaxPool1d", + "MaxPool2d", + "MaxPool3d", + "MaxUnpool1d", + "MaxUnpool2d", + "MaxUnpool3d", + "AvgPool1d", + "AvgPool2d", + "AvgPool3d", + "FractionalMaxPool2d", + "FractionalMaxPool3d", + "LPPool1d", + "LPPool2d", + "LPPool3d", + "AdaptiveMaxPool1d", + "AdaptiveMaxPool2d", + "AdaptiveMaxPool3d", + "AdaptiveAvgPool1d", + "AdaptiveAvgPool2d", + "AdaptiveAvgPool3d", +] + + +class _MaxPoolNd(Module): + __constants__ = [ + "kernel_size", + "stride", + "padding", + "dilation", + "return_indices", + "ceil_mode", + ] + return_indices: bool + ceil_mode: bool + + def __init__( + self, + kernel_size: _size_any_t, + stride: _size_any_t | None = None, + padding: _size_any_t = 0, + dilation: _size_any_t = 1, + return_indices: bool = False, + ceil_mode: bool = False, + ) -> None: + super().__init__() + self.kernel_size = kernel_size + self.stride = stride if (stride is not None) else kernel_size + self.padding = padding + self.dilation = dilation + self.return_indices = return_indices + self.ceil_mode = ceil_mode + + def extra_repr(self) -> str: + return ( + "kernel_size={kernel_size}, stride={stride}, padding={padding}" + ", dilation={dilation}, 
ceil_mode={ceil_mode}".format(**self.__dict__) + ) + + +class MaxPool1d(_MaxPoolNd): + r"""Applies a 1D max pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, L)` + and output :math:`(N, C, L_{out})` can be precisely described as: + + .. math:: + out(N_i, C_j, k) = \max_{m=0, \ldots, \text{kernel\_size} - 1} + input(N_i, C_j, stride \times k + m) + + If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides + for :attr:`padding` number of points. :attr:`dilation` is the stride between the elements within the + sliding window. This `link`_ has a nice visualization of the pooling parameters. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + Args: + kernel_size: The size of the sliding window, must be > 0. + stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`. + padding: Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2. + dilation: The stride between elements within a sliding window, must be > 0. + return_indices: If ``True``, will return the argmax along with the max values. + Useful for :class:`torch.nn.MaxUnpool1d` later + ceil_mode: If ``True``, will use `ceil` instead of `floor` to compute the output shape. This + ensures that every element in the input tensor is covered by a sliding window. + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, + + where ``ceil_mode = False`` + + .. math:: + L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1}{\text{stride}}\right\rfloor + 1 + + where ``ceil_mode = True`` + + .. 
math:: + L_{out} = \left\lceil \frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1 + (stride - 1)}{\text{stride}}\right\rceil + 1 + + - Ensure that the last pooling starts inside the image, make :math:`L_{out} = L_{out} - 1` + when :math:`(L_{out} - 1) * \text{stride} >= L_{in} + \text{padding}`. + + Examples:: + + >>> # pool of size=3, stride=2 + >>> m = nn.MaxPool1d(3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + + kernel_size: _size_1_t + stride: _size_1_t + padding: _size_1_t + dilation: _size_1_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.max_pool1d( + input, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + ceil_mode=self.ceil_mode, + return_indices=self.return_indices, + ) + + +class MaxPool2d(_MaxPoolNd): + r"""Applies a 2D max pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`, + output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)` + can be precisely described as: + + .. math:: + \begin{aligned} + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) + \end{aligned} + + If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides + for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. 
+ + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + Args: + kernel_size: the size of the window to take a max over + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: Implicit negative infinity padding to be added on both sides + dilation: a parameter that controls the stride of elements in the window + return_indices: if ``True``, will return the max indices along with the outputs. + Useful for :class:`torch.nn.MaxUnpool2d` later + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding[0]} - \text{dilation[0]} + \times (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding[1]} - \text{dilation[1]} + \times (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor + + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.MaxPool2d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.MaxPool2d((3, 2), stride=(2, 1)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + + .. 
_link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + + kernel_size: _size_2_t + stride: _size_2_t + padding: _size_2_t + dilation: _size_2_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.max_pool2d( + input, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + ceil_mode=self.ceil_mode, + return_indices=self.return_indices, + ) + + +class MaxPool3d(_MaxPoolNd): + r"""Applies a 3D max pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`, + output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)` + can be precisely described as: + + .. math:: + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times d + k, + \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) + \end{aligned} + + If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides + for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. 
+ + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: + + - a single ``int`` -- in which case the same value is used for the depth, height and width dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + Args: + kernel_size: the size of the window to take a max over + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: Implicit negative infinity padding to be added on all three sides + dilation: a parameter that controls the stride of elements in the window + return_indices: if ``True``, will return the max indices along with the outputs. + Useful for :class:`torch.nn.MaxUnpool3d` later + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times + (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times + (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times + (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor + + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.MaxPool3d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2)) + >>> input = torch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + + .. 
_link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t + dilation: _size_3_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.max_pool3d( + input, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + ceil_mode=self.ceil_mode, + return_indices=self.return_indices, + ) + + +class _MaxUnpoolNd(Module): + def extra_repr(self) -> str: + return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}" + + +class MaxUnpool1d(_MaxUnpoolNd): + r"""Computes a partial inverse of :class:`MaxPool1d`. + + :class:`MaxPool1d` is not fully invertible, since the non-maximal values are lost. + + :class:`MaxUnpool1d` takes in as input the output of :class:`MaxPool1d` + including the indices of the maximal values and computes a partial inverse + in which all non-maximal values are set to zero. + + Note: + This operation may behave nondeterministically when the input indices has repeat values. + See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information. + + .. note:: :class:`MaxPool1d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs and Example below. + + Args: + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. + It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~torch.nn.MaxPool1d` + - `output_size` (optional): the targeted output size + + Shape: + - Input: :math:`(N, C, H_{in})` or :math:`(C, H_{in})`. 
+ - Output: :math:`(N, C, H_{out})` or :math:`(C, H_{out})`, where + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?") + >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool1d(2, stride=2) + >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]]) + >>> output, indices = pool(input) + >>> unpool(output, indices) + tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8.]]]) + + >>> # Example showcasing the use of output_size + >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]]) + >>> output, indices = pool(input) + >>> unpool(output, indices, output_size=input.size()) + tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8., 0.]]]) + + >>> unpool(output, indices) + tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8.]]]) + """ + + kernel_size: _size_1_t + stride: _size_1_t + padding: _size_1_t + + def __init__( + self, + kernel_size: _size_1_t, + stride: _size_1_t | None = None, + padding: _size_1_t = 0, + ) -> None: + super().__init__() + self.kernel_size = _single(kernel_size) + self.stride = _single(stride if (stride is not None) else kernel_size) + self.padding = _single(padding) + + def forward( + self, input: Tensor, indices: Tensor, output_size: list[int] | None = None + ) -> Tensor: + """Runs the forward pass.""" + return F.max_unpool1d( + input, indices, self.kernel_size, self.stride, self.padding, output_size + ) + + +class MaxUnpool2d(_MaxUnpoolNd): + r"""Computes a partial inverse of :class:`MaxPool2d`. + + :class:`MaxPool2d` is not fully invertible, since the non-maximal values are lost. + + :class:`MaxUnpool2d` takes in as input the output of :class:`MaxPool2d` + including the indices of the maximal values and computes a partial inverse + in which all non-maximal values are set to zero. 
+ + Note: + This operation may behave nondeterministically when the input indices has repeat values. + See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information. + + .. note:: :class:`MaxPool2d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs and Example below. + + Args: + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. + It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~torch.nn.MaxPool2d` + - `output_size` (optional): the targeted output size + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} + + .. 
math:: + W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool2d(2, stride=2) + >>> input = torch.tensor([[[[ 1., 2., 3., 4.], + [ 5., 6., 7., 8.], + [ 9., 10., 11., 12.], + [13., 14., 15., 16.]]]]) + >>> output, indices = pool(input) + >>> unpool(output, indices) + tensor([[[[ 0., 0., 0., 0.], + [ 0., 6., 0., 8.], + [ 0., 0., 0., 0.], + [ 0., 14., 0., 16.]]]]) + >>> # Now using output_size to resolve an ambiguous size for the inverse + >>> input = torch.tensor([[[[ 1., 2., 3., 4., 5.], + [ 6., 7., 8., 9., 10.], + [11., 12., 13., 14., 15.], + [16., 17., 18., 19., 20.]]]]) + >>> output, indices = pool(input) + >>> # This call will not work without specifying output_size + >>> unpool(output, indices, output_size=input.size()) + tensor([[[[ 0., 0., 0., 0., 0.], + [ 0., 7., 0., 9., 0.], + [ 0., 0., 0., 0., 0.], + [ 0., 17., 0., 19., 0.]]]]) + + + """ + + kernel_size: _size_2_t + stride: _size_2_t + padding: _size_2_t + + def __init__( + self, + kernel_size: _size_2_t, + stride: _size_2_t | None = None, + padding: _size_2_t = 0, + ) -> None: + super().__init__() + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride if (stride is not None) else kernel_size) + self.padding = _pair(padding) + + def forward( + self, input: Tensor, indices: Tensor, output_size: list[int] | None = None + ) -> Tensor: + """Runs the forward pass.""" + return F.max_unpool2d( + input, indices, self.kernel_size, self.stride, self.padding, output_size + ) + + +class MaxUnpool3d(_MaxUnpoolNd): + r"""Computes a partial inverse of :class:`MaxPool3d`. + + :class:`MaxPool3d` is not fully invertible, since the non-maximal values are lost. 
+ :class:`MaxUnpool3d` takes in as input the output of :class:`MaxPool3d` + including the indices of the maximal values and computes a partial inverse + in which all non-maximal values are set to zero. + + Note: + This operation may behave nondeterministically when the input indices has repeat values. + See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information. + + .. note:: :class:`MaxPool3d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs section below. + + Args: + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. + It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~torch.nn.MaxPool3d` + - `output_size` (optional): the targeted output size + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} + + .. 
math:: + W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]} + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> # pool of square window of size=3, stride=2 + >>> pool = nn.MaxPool3d(3, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool3d(3, stride=2) + >>> output, indices = pool(torch.randn(20, 16, 51, 33, 15)) + >>> unpooled_output = unpool(output, indices) + >>> unpooled_output.size() + torch.Size([20, 16, 51, 33, 15]) + """ + + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t + + def __init__( + self, + kernel_size: _size_3_t, + stride: _size_3_t | None = None, + padding: _size_3_t = 0, + ) -> None: + super().__init__() + self.kernel_size = _triple(kernel_size) + self.stride = _triple(stride if (stride is not None) else kernel_size) + self.padding = _triple(padding) + + def forward( + self, input: Tensor, indices: Tensor, output_size: list[int] | None = None + ) -> Tensor: + """Runs the forward pass.""" + return F.max_unpool3d( + input, indices, self.kernel_size, self.stride, self.padding, output_size + ) + + +class _AvgPoolNd(Module): + __constants__ = [ + "kernel_size", + "stride", + "padding", + "ceil_mode", + "count_include_pad", + ] + + def extra_repr(self) -> str: + return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}" + + +class AvgPool1d(_AvgPoolNd): + r"""Applies a 1D average pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, L)`, + output :math:`(N, C, L_{out})` and :attr:`kernel_size` :math:`k` + can be precisely described as: + + .. math:: + + \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k-1} + \text{input}(N_i, C_j, \text{stride} \times l + m) + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides + for :attr:`padding` number of points. 
+ + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + .. note:: + pad should be at most half of effective kernel size. + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be + an ``int`` or a one-element tuple. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: implicit zero padding to be added on both sides + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad: when True, will include the zero-padding in the averaging calculation + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor \frac{L_{in} + + 2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + + Per the note above, if ``ceil_mode`` is True and :math:`(L_{out} - 1) \times \text{stride} \geq L_{in} + + \text{padding}`, we skip the last window as it would start in the right padded region, resulting in + :math:`L_{out}` being reduced by one. 
+ + Examples:: + + >>> # pool with window of size=3, stride=2 + >>> m = nn.AvgPool1d(3, stride=2) + >>> m(torch.tensor([[[1., 2, 3, 4, 5, 6, 7]]])) + tensor([[[2., 4., 6.]]]) + """ + + kernel_size: _size_1_t + stride: _size_1_t + padding: _size_1_t + ceil_mode: bool + count_include_pad: bool + + def __init__( + self, + kernel_size: _size_1_t, + stride: _size_1_t = None, + padding: _size_1_t = 0, + ceil_mode: bool = False, + count_include_pad: bool = True, + ) -> None: + super().__init__() + self.kernel_size = _single(kernel_size) + self.stride = _single(stride if stride is not None else kernel_size) + self.padding = _single(padding) + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.avg_pool1d( + input, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + ) + + +class AvgPool2d(_AvgPoolNd): + r"""Applies a 2D average pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`, + output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)` + can be precisely described as: + + .. math:: + + out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides + for :attr:`padding` number of points. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + .. note:: + pad should be at most half of effective kernel size. 
+ + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be: + + - a single ``int`` or a single-element tuple -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: implicit zero padding to be added on both sides + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad: when True, will include the zero-padding in the averaging calculation + divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used. + + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - + \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - + \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + Per the note above, if ``ceil_mode`` is True and :math:`(H_{out} - 1)\times \text{stride}[0]\geq H_{in} + + \text{padding}[0]`, we skip the last window as it would start in the bottom padded region, + resulting in :math:`H_{out}` being reduced by one. + + The same applies for :math:`W_{out}`. 
+ + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.AvgPool2d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.AvgPool2d((3, 2), stride=(2, 1)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + """ + + __constants__ = [ + "kernel_size", + "stride", + "padding", + "ceil_mode", + "count_include_pad", + "divisor_override", + ] + + kernel_size: _size_2_t + stride: _size_2_t + padding: _size_2_t + ceil_mode: bool + count_include_pad: bool + + def __init__( + self, + kernel_size: _size_2_t, + stride: _size_2_t | None = None, + padding: _size_2_t = 0, + ceil_mode: bool = False, + count_include_pad: bool = True, + divisor_override: int | None = None, + ) -> None: + super().__init__() + self.kernel_size = kernel_size + self.stride = stride if (stride is not None) else kernel_size + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor_override = divisor_override + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.avg_pool2d( + input, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + self.divisor_override, + ) + + +class AvgPool3d(_AvgPoolNd): + r"""Applies a 3D average pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`, + output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)` + can be precisely described as: + + .. 
math:: + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} \\ + & \frac{\text{input}(N_i, C_j, \text{stride}[0] \times d + k, + \text{stride}[1] \times h + m, \text{stride}[2] \times w + n)} + {kD \times kH \times kW} + \end{aligned} + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides + for :attr:`padding` number of points. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + .. note:: + pad should be at most half of effective kernel size. + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the depth, height and width dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: implicit zero padding to be added on all three sides + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad: when True, will include the zero-padding in the averaging calculation + divisor_override: if specified, it will be used as divisor, otherwise :attr:`kernel_size` will be used + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - + \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. 
math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - + \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - + \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor + + Per the note above, if ``ceil_mode`` is True and :math:`(D_{out} - 1)\times \text{stride}[0]\geq D_{in} + + \text{padding}[0]`, we skip the last window as it would start in the padded region, + resulting in :math:`D_{out}` being reduced by one. + + The same applies for :math:`W_{out}` and :math:`H_{out}`. + + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.AvgPool3d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2)) + >>> input = torch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + """ + + __constants__ = [ + "kernel_size", + "stride", + "padding", + "ceil_mode", + "count_include_pad", + "divisor_override", + ] + + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t + ceil_mode: bool + count_include_pad: bool + + def __init__( + self, + kernel_size: _size_3_t, + stride: _size_3_t | None = None, + padding: _size_3_t = 0, + ceil_mode: bool = False, + count_include_pad: bool = True, + divisor_override: int | None = None, + ) -> None: + super().__init__() + self.kernel_size = kernel_size + self.stride = stride if (stride is not None) else kernel_size + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor_override = divisor_override + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.avg_pool3d( + input, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + self.divisor_override, + ) + + def __setstate__(self, d): + super().__setstate__(d) + self.__dict__.setdefault("padding", 0) + self.__dict__.setdefault("ceil_mode", False) + 
self.__dict__.setdefault("count_include_pad", True) + + +class FractionalMaxPool2d(Module): + r"""Applies a 2D fractional max pooling over an input signal composed of several input planes. + + Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham + + The max-pooling operation is applied in :math:`kH \times kW` regions by a stochastic + step size determined by the target output size. + The number of output features is equal to the number of input planes. + + .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined. + + Args: + kernel_size: the size of the window to take a max over. + Can be a single number k (for a square kernel of k x k) or a tuple `(kh, kw)` + output_size: the target output size of the image of the form `oH x oW`. + Can be a tuple `(oH, oW)` or a single number oH for a square image `oH x oH`. + Note that we must have :math:`kH + oH - 1 <= H_{in}` and :math:`kW + oW - 1 <= W_{in}` + output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. + This has to be a number or tuple in the range (0, 1). + Note that we must have :math:`kH + (output\_ratio\_H * H_{in}) - 1 <= H_{in}` + and :math:`kW + (output\_ratio\_W * W_{in}) - 1 <= W_{in}` + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to :meth:`nn.MaxUnpool2d`. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + :math:`(H_{out}, W_{out})=\text{output\_size}` or + :math:`(H_{out}, W_{out})=\text{output\_ratio} \times (H_{in}, W_{in})`. 
+ + Examples: + >>> # pool of square window of size=3, and target output size 13x12 + >>> m = nn.FractionalMaxPool2d(3, output_size=(13, 12)) + >>> # pool of square window and target output size being half of input image size + >>> m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + + .. _Fractional MaxPooling: + https://arxiv.org/abs/1412.6071 + """ + + __constants__ = ["kernel_size", "return_indices", "output_size", "output_ratio"] + + kernel_size: _size_2_t + return_indices: bool + output_size: _size_2_t + output_ratio: _ratio_2_t + + def __init__( + self, + kernel_size: _size_2_t, + output_size: _size_2_t | None = None, + output_ratio: _ratio_2_t | None = None, + return_indices: bool = False, + _random_samples=None, + ) -> None: + super().__init__() + self.kernel_size = _pair(kernel_size) + self.return_indices = return_indices + self.register_buffer("_random_samples", _random_samples) + self.output_size = _pair(output_size) if output_size is not None else None + self.output_ratio = _pair(output_ratio) if output_ratio is not None else None + if output_size is None and output_ratio is None: + raise ValueError( + "FractionalMaxPool2d requires specifying either " + "an output size, or a pooling ratio" + ) + if output_size is not None and output_ratio is not None: + raise ValueError( + "only one of output_size and output_ratio may be specified" + ) + if self.output_ratio is not None: + if not (0 < self.output_ratio[0] < 1 and 0 < self.output_ratio[1] < 1): + raise ValueError( + f"output_ratio must be between 0 and 1 (got {output_ratio})" + ) + + def forward(self, input: Tensor): + return F.fractional_max_pool2d( + input, + self.kernel_size, + self.output_size, + self.output_ratio, + self.return_indices, + _random_samples=self._random_samples, + ) + + +class FractionalMaxPool3d(Module): + r"""Applies a 3D fractional max pooling over an input signal composed of several input planes. 
+ + Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham + + The max-pooling operation is applied in :math:`kT \times kH \times kW` regions by a stochastic + step size determined by the target output size. + The number of output features is equal to the number of input planes. + + .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined. + + Args: + kernel_size: the size of the window to take a max over. + Can be a single number `k` (for a square kernel of `k x k x k`) or a tuple `(kt x kh x kw)`, + `k` must greater than 0. + output_size: the target output size of the image of the form `oT x oH x oW`. + Can be a tuple `(oT, oH, oW)` or a single number oH for a square image `oH x oH x oH` + output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. + This has to be a number or tuple in the range (0, 1) + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to :meth:`nn.MaxUnpool3d`. Default: ``False`` + + Shape: + - Input: :math:`(N, C, T_{in}, H_{in}, W_{in})` or :math:`(C, T_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, T_{out}, H_{out}, W_{out})` or :math:`(C, T_{out}, H_{out}, W_{out})`, where + :math:`(T_{out}, H_{out}, W_{out})=\text{output\_size}` or + :math:`(T_{out}, H_{out}, W_{out})=\text{output\_ratio} \times (T_{in}, H_{in}, W_{in})` + + Examples: + >>> # pool of cubic window of size=3, and target output size 13x12x11 + >>> m = nn.FractionalMaxPool3d(3, output_size=(13, 12, 11)) + >>> # pool of cubic window and target output size being half of input size + >>> m = nn.FractionalMaxPool3d(3, output_ratio=(0.5, 0.5, 0.5)) + >>> input = torch.randn(20, 16, 50, 32, 16) + >>> output = m(input) + + .. 
_Fractional MaxPooling: + https://arxiv.org/abs/1412.6071 + """ + + __constants__ = ["kernel_size", "return_indices", "output_size", "output_ratio"] + kernel_size: _size_3_t + return_indices: bool + output_size: _size_3_t + output_ratio: _ratio_3_t + + def __init__( + self, + kernel_size: _size_3_t, + output_size: _size_3_t | None = None, + output_ratio: _ratio_3_t | None = None, + return_indices: bool = False, + _random_samples=None, + ) -> None: + super().__init__() + if (isinstance(kernel_size, int) and kernel_size <= 0) or ( + isinstance(kernel_size, (tuple, list)) + and not all(k > 0 for k in kernel_size) + ): + raise ValueError(f"kernel_size must greater than 0, but got {kernel_size}") + self.kernel_size = _triple(kernel_size) + self.return_indices = return_indices + self.register_buffer("_random_samples", _random_samples) + self.output_size = _triple(output_size) if output_size is not None else None + self.output_ratio = _triple(output_ratio) if output_ratio is not None else None + if output_size is None and output_ratio is None: + raise ValueError( + "FractionalMaxPool3d requires specifying either " + "an output size, or a pooling ratio" + ) + if output_size is not None and output_ratio is not None: + raise ValueError( + "only one of output_size and output_ratio may be specified" + ) + if self.output_ratio is not None: + if not ( + 0 < self.output_ratio[0] < 1 + and 0 < self.output_ratio[1] < 1 + and 0 < self.output_ratio[2] < 1 + ): + raise ValueError( + f"output_ratio must be between 0 and 1 (got {output_ratio})" + ) + + def forward(self, input: Tensor): + return F.fractional_max_pool3d( + input, + self.kernel_size, + self.output_size, + self.output_ratio, + self.return_indices, + _random_samples=self._random_samples, + ) + + +class _LPPoolNd(Module): + __constants__ = ["norm_type", "kernel_size", "stride", "ceil_mode"] + + norm_type: float + ceil_mode: bool + + def __init__( + self, + norm_type: float, + kernel_size: _size_any_t, + stride: _size_any_t | 
None = None, + ceil_mode: bool = False, + ) -> None: + super().__init__() + self.norm_type = norm_type + self.kernel_size = kernel_size + self.stride = stride + self.ceil_mode = ceil_mode + + def extra_repr(self) -> str: + return ( + "norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, " + "ceil_mode={ceil_mode}".format(**self.__dict__) + ) + + +class LPPool1d(_LPPoolNd): + r"""Applies a 1D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to Average Pooling) + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: a single int, the size of the window + stride: a single int, the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Note: + When :attr:`ceil_mode` is ``True``, sliding windows may go off-bounds if they start within the + left padding or the input. Sliding windows that would start in the right padded region are ignored. + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + + Examples:: + >>> # power-2 pool of window of length 3, with stride 2. 
+ >>> m = nn.LPPool1d(2, 3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + """ + + kernel_size: _size_1_t + stride: _size_1_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.lp_pool1d( + input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode + ) + + +class LPPool2d(_LPPoolNd): + r"""Applies a 2D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Note: + When :attr:`ceil_mode` is ``True``, sliding windows may go off-bounds if they start within the + left padding or the input. Sliding windows that would start in the right padded region are ignored. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. 
math:: + W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + Examples:: + + >>> # power-2 pool of square window of size=3, stride=2 + >>> m = nn.LPPool2d(2, 3, stride=2) + >>> # pool of non-square window of power 1.2 + >>> m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + + """ + + kernel_size: _size_2_t + stride: _size_2_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.lp_pool2d( + input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode + ) + + +class LPPool3d(_LPPoolNd): + r"""Applies a 3D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the height, width and depth dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Note: + When :attr:`ceil_mode` is ``True``, sliding windows may go off-bounds if they start within the + left padding or the input. Sliding windows that would start in the right padded region are ignored. 
+ + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor + + Examples:: + + >>> # power-2 pool of square window of size=3, stride=2 + >>> m = nn.LPPool3d(2, 3, stride=2) + >>> # pool of non-square window of power 1.2 + >>> m = nn.LPPool3d(1.2, (3, 2, 2), stride=(2, 1, 2)) + >>> input = torch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + + """ + + kernel_size: _size_3_t + stride: _size_3_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.lp_pool3d( + input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode + ) + + +class _AdaptiveMaxPoolNd(Module): + __constants__ = ["output_size", "return_indices"] + return_indices: bool + + def __init__( + self, output_size: _size_any_opt_t, return_indices: bool = False + ) -> None: + super().__init__() + self.output_size = output_size + self.return_indices = return_indices + + def extra_repr(self) -> str: + return f"output_size={self.output_size}" + + +# FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and +# output shapes are, and how the operation computes output. + + +class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd): + r"""Applies a 1D adaptive max pooling over an input signal composed of several input planes. + + The output size is :math:`L_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size :math:`L_{out}`. 
+ return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool1d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + :math:`L_{out}=\text{output\_size}`. + + Examples: + >>> # target output size of 5 + >>> m = nn.AdaptiveMaxPool1d(5) + >>> input = torch.randn(1, 64, 8) + >>> output = m(input) + + """ + + output_size: _size_1_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.adaptive_max_pool1d(input, self.output_size, self.return_indices) + + +class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): + r"""Applies a 2D adaptive max pooling over an input signal composed of several input planes. + + The output is of size :math:`H_{out} \times W_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form :math:`H_{out} \times W_{out}`. + Can be a tuple :math:`(H_{out}, W_{out})` or a single :math:`H_{out}` for a + square image :math:`H_{out} \times H_{out}`. :math:`H_{out}` and :math:`W_{out}` + can be either a ``int``, or ``None`` which means the size will be the same as that + of the input. + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool2d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + :math:`(H_{out}, W_{out})=\text{output\_size}`. 
+ + Examples: + >>> # target output size of 5x7 + >>> m = nn.AdaptiveMaxPool2d((5, 7)) + >>> input = torch.randn(1, 64, 8, 9) + >>> output = m(input) + >>> # target output size of 7x7 (square) + >>> m = nn.AdaptiveMaxPool2d(7) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + >>> # target output size of 10x7 + >>> m = nn.AdaptiveMaxPool2d((None, 7)) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + + """ + + output_size: _size_2_opt_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) + + +class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): + r"""Applies a 3D adaptive max pooling over an input signal composed of several input planes. + + The output is of size :math:`D_{out} \times H_{out} \times W_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form :math:`D_{out} \times H_{out} \times W_{out}`. + Can be a tuple :math:`(D_{out}, H_{out}, W_{out})` or a single + :math:`D_{out}` for a cube :math:`D_{out} \times D_{out} \times D_{out}`. + :math:`D_{out}`, :math:`H_{out}` and :math:`W_{out}` can be either a + ``int``, or ``None`` which means the size will be the same as that of the input. + + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool3d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where :math:`(D_{out}, H_{out}, W_{out})=\text{output\_size}`. 
class _AdaptiveAvgPoolNd(Module):
    """Common base for the adaptive average-pooling modules.

    Only stores the requested ``output_size``; the concrete subclasses pass
    it to the matching functional pooling call.
    """

    __constants__ = ["output_size"]

    def __init__(self, output_size: _size_any_opt_t) -> None:
        super().__init__()
        self.output_size = output_size

    def extra_repr(self) -> str:
        """Return the ``output_size=...`` fragment shown in the module repr."""
        target_size = self.output_size
        return f"output_size={target_size}"


class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd):
    r"""Applies a 1D adaptive average pooling over an input signal composed of several input planes.

    Whatever the input length, the output length is :math:`L_{out}`.
    The number of output features equals the number of input planes.

    Args:
        output_size: the target output size :math:`L_{out}`.

    Shape:
        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
          :math:`L_{out}=\text{output\_size}`.

    Examples:
        >>> # target output size of 5
        >>> m = nn.AdaptiveAvgPool1d(5)
        >>> input = torch.randn(1, 64, 8)
        >>> output = m(input)

    """

    output_size: _size_1_t

    def forward(self, input: Tensor) -> Tensor:
        """Runs the forward pass by delegating to ``F.adaptive_avg_pool1d``."""
        pooled = F.adaptive_avg_pool1d(input, self.output_size)
        return pooled
+ The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form H x W. + Can be a tuple (H, W) or a single H for a square image H x H. + H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, S_{0}, S_{1})` or :math:`(C, S_{0}, S_{1})`, where + :math:`S=\text{output\_size}`. + + Examples: + >>> # target output size of 5x7 + >>> m = nn.AdaptiveAvgPool2d((5, 7)) + >>> input = torch.randn(1, 64, 8, 9) + >>> output = m(input) + >>> # target output size of 7x7 (square) + >>> m = nn.AdaptiveAvgPool2d(7) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + >>> # target output size of 10x7 + >>> m = nn.AdaptiveAvgPool2d((None, 7)) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + + """ + + output_size: _size_2_opt_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.adaptive_avg_pool2d(input, self.output_size) + + +class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): + r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes. + + The output is of size D x H x W, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the form D x H x W. + Can be a tuple (D, H, W) or a single number D for a cube D x D x D. + D, H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, S_{0}, S_{1}, S_{2})` or :math:`(C, S_{0}, S_{1}, S_{2})`, + where :math:`S=\text{output\_size}`. 
+ + Examples: + >>> # target output size of 5x7x9 + >>> m = nn.AdaptiveAvgPool3d((5, 7, 9)) + >>> input = torch.randn(1, 64, 8, 9, 10) + >>> output = m(input) + >>> # target output size of 7x7x7 (cube) + >>> m = nn.AdaptiveAvgPool3d(7) + >>> input = torch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + >>> # target output size of 7x9x8 + >>> m = nn.AdaptiveAvgPool3d((7, None, None)) + >>> input = torch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + + """ + + output_size: _size_3_opt_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.adaptive_avg_pool3d(input, self.output_size) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..68e8292870fc8a8d19ce3307294377b162c8b6fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/rnn.py @@ -0,0 +1,1850 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +import math +import numbers +import warnings +import weakref +from typing import overload +from typing_extensions import deprecated + +import torch +from torch import _VF, Tensor +from torch.nn import init +from torch.nn.parameter import Parameter +from torch.nn.utils.rnn import PackedSequence + +from .module import Module + + +__all__ = [ + "RNNBase", + "RNN", + "LSTM", + "GRU", + "RNNCellBase", + "RNNCell", + "LSTMCell", + "GRUCell", +] + +_rnn_impls = { + "RNN_TANH": _VF.rnn_tanh, + "RNN_RELU": _VF.rnn_relu, +} + + +def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: + return tensor.index_select(dim, permutation) + + +@deprecated( + "`apply_permutation` is deprecated, please use `tensor.index_select(dim, permutation)` instead", + category=FutureWarning, +) +def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: + 
return _apply_permutation(tensor, permutation, dim) + + +class RNNBase(Module): + r"""Base class for RNN modules (RNN, LSTM, GRU). + + Implements aspects of RNNs shared by the RNN, LSTM, and GRU classes, such as module initialization + and utility methods for parameter storage management. + + .. note:: + The forward method is not implemented by the RNNBase class. + + .. note:: + LSTM and GRU classes override some methods implemented by RNNBase. + """ + + __constants__ = [ + "mode", + "input_size", + "hidden_size", + "num_layers", + "bias", + "batch_first", + "dropout", + "bidirectional", + "proj_size", + ] + __jit_unused_properties__ = ["all_weights"] + + mode: str + input_size: int + hidden_size: int + num_layers: int + bias: bool + batch_first: bool + dropout: float + bidirectional: bool + proj_size: int + + def __init__( + self, + mode: str, + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = False, + dropout: float = 0.0, + bidirectional: bool = False, + proj_size: int = 0, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.mode = mode + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.bidirectional = bidirectional + self.proj_size = proj_size + self._flat_weight_refs: list[weakref.ReferenceType[Parameter] | None] = [] + num_directions = 2 if bidirectional else 1 + + if ( + not isinstance(dropout, numbers.Number) + or not 0 <= dropout <= 1 + or isinstance(dropout, bool) + ): + raise ValueError( + "dropout should be a number in range [0, 1] " + "representing the probability of an element being " + "zeroed" + ) + if dropout > 0 and num_layers == 1: + warnings.warn( + "dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + f"num_layers greater than 1, but got 
dropout={dropout} and " + f"num_layers={num_layers}", + stacklevel=2, + ) + + if not isinstance(hidden_size, int): + raise TypeError( + f"hidden_size should be of type int, got: {type(hidden_size).__name__}" + ) + if hidden_size <= 0: + raise ValueError("hidden_size must be greater than zero") + if num_layers <= 0: + raise ValueError("num_layers must be greater than zero") + if proj_size < 0: + raise ValueError( + "proj_size should be a positive integer or zero to disable projections" + ) + if proj_size >= hidden_size: + raise ValueError("proj_size has to be smaller than hidden_size") + + if mode == "LSTM": + gate_size = 4 * hidden_size + elif mode == "GRU": + gate_size = 3 * hidden_size + elif mode == "RNN_TANH": + gate_size = hidden_size + elif mode == "RNN_RELU": + gate_size = hidden_size + else: + raise ValueError("Unrecognized RNN mode: " + mode) + + self._flat_weights_names = [] + self._all_weights = [] + for layer in range(num_layers): + for direction in range(num_directions): + real_hidden_size = proj_size if proj_size > 0 else hidden_size + layer_input_size = ( + input_size if layer == 0 else real_hidden_size * num_directions + ) + + w_ih = Parameter( + torch.empty((gate_size, layer_input_size), **factory_kwargs) + ) + w_hh = Parameter( + torch.empty((gate_size, real_hidden_size), **factory_kwargs) + ) + b_ih = Parameter(torch.empty(gate_size, **factory_kwargs)) + # Second bias vector included for CuDNN compatibility. Only one + # bias vector is needed in standard definition. + b_hh = Parameter(torch.empty(gate_size, **factory_kwargs)) + layer_params: tuple[Tensor, ...] 
= () + if self.proj_size == 0: + if bias: + layer_params = (w_ih, w_hh, b_ih, b_hh) + else: + layer_params = (w_ih, w_hh) + else: + w_hr = Parameter( + torch.empty((proj_size, hidden_size), **factory_kwargs) + ) + if bias: + layer_params = (w_ih, w_hh, b_ih, b_hh, w_hr) + else: + layer_params = (w_ih, w_hh, w_hr) + + suffix = "_reverse" if direction == 1 else "" + param_names = ["weight_ih_l{}{}", "weight_hh_l{}{}"] + if bias: + param_names += ["bias_ih_l{}{}", "bias_hh_l{}{}"] + if self.proj_size > 0: + param_names += ["weight_hr_l{}{}"] + param_names = [x.format(layer, suffix) for x in param_names] + + for name, param in zip(param_names, layer_params, strict=True): + setattr(self, name, param) + self._flat_weights_names.extend(param_names) + self._all_weights.append(param_names) + + self._init_flat_weights() + + self.reset_parameters() + + def _init_flat_weights(self) -> None: + self._flat_weights = [ + getattr(self, wn) if hasattr(self, wn) else None + for wn in self._flat_weights_names + ] + self._flat_weight_refs = [ + weakref.ref(w) if w is not None else None for w in self._flat_weights + ] + self.flatten_parameters() + + def __setattr__(self, attr, value) -> None: + if hasattr(self, "_flat_weights_names") and attr in self._flat_weights_names: + # keep self._flat_weights up to date if you do self.weight = ... + idx = self._flat_weights_names.index(attr) + self._flat_weights[idx] = value + super().__setattr__(attr, value) + + def flatten_parameters(self) -> None: + """Reset parameter data pointer so that they can use faster code paths. + + Right now, this works only if the module is on the GPU and cuDNN is enabled. + Otherwise, it's a no-op. 
+ """ + # Short-circuits if _flat_weights is only partially instantiated + if len(self._flat_weights) != len(self._flat_weights_names): + return + + for w in self._flat_weights: + if not isinstance(w, Tensor): + return + # Short-circuits if any tensor in self._flat_weights is not acceptable to cuDNN + # or the tensors in _flat_weights are of different dtypes + + first_fw = self._flat_weights[0] # type: ignore[union-attr] + dtype = first_fw.dtype # type: ignore[union-attr] + for fw in self._flat_weights: + if ( + not isinstance(fw, Tensor) + or fw.dtype != dtype + or not fw.is_cuda + or not torch.backends.cudnn.is_acceptable(fw) + ): + return + + # If any parameters alias, we fall back to the slower, copying code path. This is + # a sufficient check, because overlapping parameter buffers that don't completely + # alias would break the assumptions of the uniqueness check in + # Module.named_parameters(). + unique_data_ptrs = { + p.data_ptr() # type: ignore[union-attr] + for p in self._flat_weights + } + if len(unique_data_ptrs) != len(self._flat_weights): + return + + with torch.cuda.device_of(first_fw): + import torch.backends.cudnn.rnn as rnn + + # Note: no_grad() is necessary since _cudnn_rnn_flatten_weight is + # an inplace operation on self._flat_weights + with torch.no_grad(): + if torch._use_cudnn_rnn_flatten_weight(): + num_weights = 4 if self.bias else 2 + if self.proj_size > 0: + num_weights += 1 + torch._cudnn_rnn_flatten_weight( + self._flat_weights, # type: ignore[arg-type] + num_weights, + self.input_size, + rnn.get_cudnn_mode(self.mode), + self.hidden_size, + self.proj_size, + self.num_layers, + self.batch_first, + bool(self.bidirectional), + ) + + def _apply(self, fn, recurse=True): + self._flat_weight_refs = [] + ret = super()._apply(fn, recurse) + + # Resets _flat_weights + # Note: be v. careful before removing this, as 3rd party device types + # likely rely on this behavior to properly .to() modules like LSTM. 
+ self._init_flat_weights() + + return ret + + def reset_parameters(self) -> None: + stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0 + for weight in self.parameters(): + init.uniform_(weight, -stdv, stdv) + + def check_input(self, input: Tensor, batch_sizes: Tensor | None) -> None: + if not torch.jit.is_scripting(): + if ( + input.dtype != self._flat_weights[0].dtype # type: ignore[union-attr] + and not torch._C._is_any_autocast_enabled() + ): + raise ValueError( + f"input must have the type {self._flat_weights[0].dtype}, got type {input.dtype}" # type: ignore[union-attr] + ) + expected_input_dim = 2 if batch_sizes is not None else 3 + if input.dim() != expected_input_dim: + raise RuntimeError( + f"input must have {expected_input_dim} dimensions, got {input.dim()}" + ) + if self.input_size != input.size(-1): + raise RuntimeError( + f"input.size(-1) must be equal to input_size. Expected {self.input_size}, got {input.size(-1)}" + ) + + def get_expected_hidden_size( + self, input: Tensor, batch_sizes: Tensor | None + ) -> tuple[int, int, int]: + if batch_sizes is not None: + mini_batch = int(batch_sizes[0]) + else: + mini_batch = input.size(0) if self.batch_first else input.size(1) + num_directions = 2 if self.bidirectional else 1 + if self.proj_size > 0: + expected_hidden_size = ( + self.num_layers * num_directions, + mini_batch, + self.proj_size, + ) + else: + expected_hidden_size = ( + self.num_layers * num_directions, + mini_batch, + self.hidden_size, + ) + return expected_hidden_size + + def check_hidden_size( + self, + hx: Tensor, + expected_hidden_size: tuple[int, int, int], + msg: str = "Expected hidden size {}, got {}", + ) -> None: + if hx.size() != expected_hidden_size: + raise RuntimeError(msg.format(expected_hidden_size, list(hx.size()))) + + def _weights_have_changed(self): + # Returns True if the weight tensors have changed since the last forward pass. + # This is the case when used with torch.func.functional_call(), for example. 
+ weights_changed = False + for ref, name in zip( + self._flat_weight_refs, self._flat_weights_names, strict=True + ): + weight = getattr(self, name) if hasattr(self, name) else None + if weight is not None and ref is not None and ref() is not weight: + weights_changed = True + break + return weights_changed + + def check_forward_args( + self, input: Tensor, hidden: Tensor, batch_sizes: Tensor | None + ) -> None: + self.check_input(input, batch_sizes) + expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) + + self.check_hidden_size(hidden, expected_hidden_size) + + def permute_hidden(self, hx: Tensor, permutation: Tensor | None): + if permutation is None: + return hx + return _apply_permutation(hx, permutation) + + def extra_repr(self) -> str: + s = "{input_size}, {hidden_size}" + if self.proj_size != 0: + s += ", proj_size={proj_size}" + if self.num_layers != 1: + s += ", num_layers={num_layers}" + if self.bias is not True: + s += ", bias={bias}" + if self.batch_first is not False: + s += ", batch_first={batch_first}" + if self.dropout != 0: + s += ", dropout={dropout}" + if self.bidirectional is not False: + s += ", bidirectional={bidirectional}" + return s.format(**self.__dict__) + + def _update_flat_weights(self) -> None: + if not torch.jit.is_scripting(): + if self._weights_have_changed(): + self._init_flat_weights() + + def __getstate__(self): + # If weights have been changed, update the _flat_weights in __getstate__ here. + self._update_flat_weights() + # Don't serialize the weight references. + state = self.__dict__.copy() + del state["_flat_weight_refs"] + return state + + def __setstate__(self, d): + super().__setstate__(d) + if "all_weights" in d: + self._all_weights = d["all_weights"] + # In PyTorch 1.8 we added a proj_size member variable to LSTM. + # LSTMs that were serialized via torch.save(module) before PyTorch 1.8 + # don't have it, so to preserve compatibility we set proj_size here. 
+ if "proj_size" not in d: + self.proj_size = 0 + + if not isinstance(self._all_weights[0][0], str): + num_layers = self.num_layers + num_directions = 2 if self.bidirectional else 1 + self._flat_weights_names = [] + self._all_weights = [] + for layer in range(num_layers): + for direction in range(num_directions): + suffix = "_reverse" if direction == 1 else "" + weights = [ + "weight_ih_l{}{}", + "weight_hh_l{}{}", + "bias_ih_l{}{}", + "bias_hh_l{}{}", + "weight_hr_l{}{}", + ] + weights = [x.format(layer, suffix) for x in weights] + if self.bias: + if self.proj_size > 0: + self._all_weights += [weights] + self._flat_weights_names.extend(weights) + else: + self._all_weights += [weights[:4]] + self._flat_weights_names.extend(weights[:4]) + else: + if self.proj_size > 0: + self._all_weights += [weights[:2]] + [weights[-1:]] + self._flat_weights_names.extend( + weights[:2] + [weights[-1:]] + ) + else: + self._all_weights += [weights[:2]] + self._flat_weights_names.extend(weights[:2]) + self._flat_weights = [ + getattr(self, wn) if hasattr(self, wn) else None + for wn in self._flat_weights_names + ] + + self._flat_weight_refs = [ + weakref.ref(w) if w is not None else None for w in self._flat_weights + ] + + @property + def all_weights(self) -> list[list[Parameter]]: + return [ + [getattr(self, weight) for weight in weights] + for weights in self._all_weights + ] + + def _replicate_for_data_parallel(self): + replica = super()._replicate_for_data_parallel() + # Need to copy these caches, otherwise the replica will share the same + # flat weights list. + replica._flat_weights = replica._flat_weights[:] + replica._flat_weights_names = replica._flat_weights_names[:] + return replica + + +class RNN(RNNBase): + r"""__init__(input_size,hidden_size,num_layers=1,nonlinearity='tanh',bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None) + + Apply a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` + non-linearity to an input sequence. 
For each element in the input sequence, + each layer computes the following function: + + .. math:: + h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh}) + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is + the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the + previous layer at time `t-1` or the initial hidden state at time `0`. + If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`. + + .. code-block:: python + + # Efficient implementation equivalent to the following with bidirectional=False + rnn = nn.RNN(input_size, hidden_size, num_layers) + params = dict(rnn.named_parameters()) + def forward(x, hx=None, batch_first=False): + if batch_first: + x = x.transpose(0, 1) + seq_len, batch_size, _ = x.size() + if hx is None: + hx = torch.zeros(rnn.num_layers, batch_size, rnn.hidden_size) + h_t_minus_1 = hx.clone() + h_t = hx.clone() + output = [] + for t in range(seq_len): + for layer in range(rnn.num_layers): + input_t = x[t] if layer == 0 else h_t[layer - 1] + h_t[layer] = torch.tanh( + input_t @ params[f"weight_ih_l{layer}"].T + + h_t_minus_1[layer] @ params[f"weight_hh_l{layer}"].T + + params[f"bias_hh_l{layer}"] + + params[f"bias_ih_l{layer}"] + ) + output.append(h_t[-1].clone()) + h_t_minus_1 = h_t.clone() + output = torch.stack(output) + if batch_first: + output = output.transpose(0, 1) + return output, h_t + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two RNNs together to form a `stacked RNN`, + with the second RNN taking in outputs of the first RNN and + computing the final results. Default: 1 + nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. 
+ Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + RNN layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` + + Inputs: input, hx + * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input, + :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. + See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **hx**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or + :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden + state for the input sequence batch. Defaults to zeros if not provided. + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{out} ={} & \text{hidden\_size} + \end{aligned} + + Outputs: output, h_n + * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input, + :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the RNN, for each `t`. If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. 
+ * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or + :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state + for each element in the batch. + + Attributes: + weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, + of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is + `(hidden_size, num_directions * hidden_size)` + weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, + of shape `(hidden_size, hidden_size)` + bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, + of shape `(hidden_size)` + bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, + of shape `(hidden_size)` + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + .. note:: + For bidirectional RNNs, forward and backward are directions 0 and 1 respectively. + Example of splitting the output layers when ``batch_first=False``: + ``output.view(seq_len, batch, num_directions, hidden_size)``. + + .. note:: + ``batch_first`` argument is ignored for unbatched inputs. + + .. include:: ../cudnn_rnn_determinism.rst + + .. include:: ../cudnn_persistent_rnn.rst + + Examples:: + + >>> rnn = nn.RNN(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> output, hn = rnn(input, h0) + """ + + @overload + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + nonlinearity: str = "tanh", + bias: bool = True, + batch_first: bool = False, + dropout: float = 0.0, + bidirectional: bool = False, + device=None, + dtype=None, + ) -> None: ... + + @overload + def __init__(self, *args, **kwargs) -> None: ... 
    # TorchScript overload stub: padded (or unbatched) tensor input.
    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(
        self,
        input: Tensor,
        hx: Tensor | None = None,
    ) -> tuple[Tensor, Tensor]:
        pass

    # TorchScript overload stub: PackedSequence input.
    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(
        self,
        input: PackedSequence,
        hx: Tensor | None = None,
    ) -> tuple[PackedSequence, Tensor]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        """
        Runs the forward pass.

        Args:
            input: a 2-D (unbatched) or 3-D (batched) tensor, or a
                ``PackedSequence``.
            hx: optional initial hidden state; when omitted a zero tensor of
                shape ``(num_layers * num_directions, batch, hidden_size)``
                with the input's dtype and device is used.

        Returns:
            ``(output, hidden)``. ``output`` is a ``PackedSequence`` when the
            input was one; for unbatched input the batch dimension that was
            added internally is squeezed out of both results again.

        Raises:
            ValueError: if a tensor input is neither 2-D nor 3-D.
            RuntimeError: if the rank of ``hx`` does not match the input
                (2-D for unbatched, 3-D for batched input).
        """
        # Refresh the cached flat weights if a parameter object was swapped out.
        self._update_flat_weights()

        num_directions = 2 if self.bidirectional else 1
        orig_input = input

        if isinstance(orig_input, PackedSequence):
            # Unpack the packed sequence into its data tensor and bookkeeping fields.
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = batch_sizes[0]
            # script() is unhappy when max_batch_size is different type in cond branches, so we duplicate
            if hx is None:
                hx = torch.zeros(
                    self.num_layers * num_directions,
                    max_batch_size,
                    self.hidden_size,
                    dtype=input.dtype,
                    device=input.device,
                )
            else:
                # Each batch of the hidden state should match the input sequence that
                # the user believes he/she is passing in.
                hx = self.permute_hidden(hx, sorted_indices)
        else:
            batch_sizes = None
            if input.dim() not in (2, 3):
                raise ValueError(
                    f"RNN: Expected input to be 2D or 3D, got {input.dim()}D tensor instead"
                )
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                # Unbatched input: insert a batch dimension of size 1; the hidden
                # state always gets it at dim 1 regardless of batch_first.
                input = input.unsqueeze(batch_dim)
                if hx is not None:
                    if hx.dim() != 2:
                        raise RuntimeError(
                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor"
                        )
                    hx = hx.unsqueeze(1)
            else:
                if hx is not None and hx.dim() != 3:
                    raise RuntimeError(
                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor"
                    )
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None
            if hx is None:
                hx = torch.zeros(
                    self.num_layers * num_directions,
                    max_batch_size,
                    self.hidden_size,
                    dtype=input.dtype,
                    device=input.device,
                )
            else:
                # Each batch of the hidden state should match the input sequence that
                # the user believes he/she is passing in.
                # (sorted_indices is None on this path, so permute_hidden
                # returns hx unchanged.)
                hx = self.permute_hidden(hx, sorted_indices)

        assert hx is not None
        self.check_forward_args(input, hx, batch_sizes)
        assert self.mode == "RNN_TANH" or self.mode == "RNN_RELU"
        if batch_sizes is None:
            # Padded-tensor signature: takes batch_first as the last argument.
            if self.mode == "RNN_TANH":
                result = _VF.rnn_tanh(
                    input,
                    hx,
                    self._flat_weights,  # type: ignore[arg-type]
                    self.bias,
                    self.num_layers,
                    self.dropout,
                    self.training,
                    self.bidirectional,
                    self.batch_first,
                )
            else:
                result = _VF.rnn_relu(
                    input,
                    hx,
                    self._flat_weights,  # type: ignore[arg-type]
                    self.bias,
                    self.num_layers,
                    self.dropout,
                    self.training,
                    self.bidirectional,
                    self.batch_first,
                )
        else:
            # Packed signature: takes batch_sizes and no batch_first argument.
            if self.mode == "RNN_TANH":
                result = _VF.rnn_tanh(
                    input,
                    batch_sizes,
                    hx,
                    self._flat_weights,  # type: ignore[arg-type]
                    self.bias,
                    self.num_layers,
                    self.dropout,
                    self.training,
                    self.bidirectional,
                )
            else:
                result = _VF.rnn_relu(
                    input,
                    batch_sizes,
                    hx,
                    self._flat_weights,  # type: ignore[arg-type]
                    self.bias,
                    self.num_layers,
                    self.dropout,
                    self.training,
                    self.bidirectional,
                )

        output = result[0]
        hidden = result[1]

        if isinstance(orig_input, PackedSequence):
            # Re-wrap the output with the original packing metadata and undo
            # the sort on the hidden state.
            output_packed = PackedSequence(
                output,
                batch_sizes,
                sorted_indices,
                unsorted_indices,
            )
            return output_packed, self.permute_hidden(hidden, unsorted_indices)

        if not is_batched:  # type: ignore[possibly-undefined]
            # Drop the batch dimension that was added for unbatched input.
            output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
            hidden = hidden.squeeze(1)

        return output, self.permute_hidden(hidden, unsorted_indices)
This is temporary only and in the transition state that we want to make it +# on time for the release +# +# More discussion details in https://github.com/pytorch/pytorch/pull/23266 +# +# TODO: remove the overriding implementations for LSTM and GRU when TorchScript +# support expressing these two modules generally. + + +class LSTM(RNNBase): + r"""__init__(input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,proj_size=0,device=None,dtype=None) + + Apply a multi-layer long short-term memory (LSTM) RNN to an input sequence. + For each element in the input sequence, each layer computes the following + function: + + .. math:: + \begin{array}{ll} \\ + i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\ + f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\ + g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\ + o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\ + c_t = f_t \odot c_{t-1} + i_t \odot g_t \\ + h_t = o_t \odot \tanh(c_t) \\ + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell + state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}` + is the hidden state of the layer at time `t-1` or the initial hidden + state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`, + :math:`o_t` are the input, forget, cell, and output gates, respectively. + :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer + (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by + dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random + variable which is :math:`0` with probability :attr:`dropout`. + + If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes + the LSTM cell in the following way. 
First, the dimension of :math:`h_t` will be changed from + ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly). + Second, the output hidden state of each layer will be multiplied by a learnable projection + matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output + of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact + dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two LSTMs together to form a `stacked LSTM`, + with the second LSTM taking in outputs of the first LSTM and + computing the final results. Default: 1 + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + LSTM layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` + proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0 + + Inputs: input, (h_0, c_0) + * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input, + :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. 
+ See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or + :math:`(D * \text{num\_layers}, N, H_{out})` containing the + initial hidden state for each element in the input sequence. + Defaults to zeros if (h_0, c_0) is not provided. + * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or + :math:`(D * \text{num\_layers}, N, H_{cell})` containing the + initial cell state for each element in the input sequence. + Defaults to zeros if (h_0, c_0) is not provided. + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{cell} ={} & \text{hidden\_size} \\ + H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\ + \end{aligned} + + Outputs: output, (h_n, c_n) + * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input, + :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the LSTM, for each `t`. If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. When ``bidirectional=True``, `output` will contain + a concatenation of the forward and reverse hidden states at each time step in the sequence. + * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or + :math:`(D * \text{num\_layers}, N, H_{out})` containing the + final hidden state for each element in the sequence. When ``bidirectional=True``, + `h_n` will contain a concatenation of the final forward and reverse hidden states, respectively. 
+ * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or + :math:`(D * \text{num\_layers}, N, H_{cell})` containing the + final cell state for each element in the sequence. When ``bidirectional=True``, + `c_n` will contain a concatenation of the final forward and reverse cell states, respectively. + + Attributes: + weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer + `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`. + Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`. If + ``proj_size > 0`` was specified, the shape will be + `(4*hidden_size, num_directions * proj_size)` for `k > 0` + weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer + `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0`` + was specified, the shape will be `(4*hidden_size, proj_size)`. + bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer + `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)` + bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer + `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)` + weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer + of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was + specified. + weight_ih_l[k]_reverse: Analogous to `weight_ih_l[k]` for the reverse direction. + Only present when ``bidirectional=True``. + weight_hh_l[k]_reverse: Analogous to `weight_hh_l[k]` for the reverse direction. + Only present when ``bidirectional=True``. + bias_ih_l[k]_reverse: Analogous to `bias_ih_l[k]` for the reverse direction. + Only present when ``bidirectional=True``. + bias_hh_l[k]_reverse: Analogous to `bias_hh_l[k]` for the reverse direction. + Only present when ``bidirectional=True``. + weight_hr_l[k]_reverse: Analogous to `weight_hr_l[k]` for the reverse direction. 
+ Only present when ``bidirectional=True`` and ``proj_size > 0`` was specified. + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + .. note:: + For bidirectional LSTMs, forward and backward are directions 0 and 1 respectively. + Example of splitting the output layers when ``batch_first=False``: + ``output.view(seq_len, batch, num_directions, hidden_size)``. + + .. note:: + For bidirectional LSTMs, `h_n` is not equivalent to the last element of `output`; the + former contains the final forward and reverse hidden states, while the latter contains the + final forward hidden state and the initial reverse hidden state. + + .. note:: + ``batch_first`` argument is ignored for unbatched inputs. + + .. note:: + ``proj_size`` should be smaller than ``hidden_size``. + + .. include:: ../cudnn_rnn_determinism.rst + + .. include:: ../cudnn_persistent_rnn.rst + + Examples:: + + >>> rnn = nn.LSTM(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> c0 = torch.randn(2, 3, 20) + >>> output, (hn, cn) = rnn(input, (h0, c0)) + """ + + @overload + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = False, + dropout: float = 0.0, + bidirectional: bool = False, + proj_size: int = 0, + device=None, + dtype=None, + ) -> None: ... + + @overload + def __init__(self, *args, **kwargs) -> None: ... 
+ + def __init__(self, *args, **kwargs): + super().__init__("LSTM", *args, **kwargs) + + def get_expected_cell_size( + self, input: Tensor, batch_sizes: Tensor | None + ) -> tuple[int, int, int]: + if batch_sizes is not None: + mini_batch = int(batch_sizes[0]) + else: + mini_batch = input.size(0) if self.batch_first else input.size(1) + num_directions = 2 if self.bidirectional else 1 + expected_hidden_size = ( + self.num_layers * num_directions, + mini_batch, + self.hidden_size, + ) + return expected_hidden_size + + # In the future, we should prevent mypy from applying contravariance rules here. + # See torch/nn/modules/module.py::_forward_unimplemented + def check_forward_args( + self, + input: Tensor, + hidden: tuple[Tensor, Tensor], # type: ignore[override] + batch_sizes: Tensor | None, + ) -> None: + self.check_input(input, batch_sizes) + self.check_hidden_size( + hidden[0], + self.get_expected_hidden_size(input, batch_sizes), + "Expected hidden[0] size {}, got {}", + ) + self.check_hidden_size( + hidden[1], + self.get_expected_cell_size(input, batch_sizes), + "Expected hidden[1] size {}, got {}", + ) + + # Same as above, see torch/nn/modules/module.py::_forward_unimplemented + def permute_hidden( # type: ignore[override] + self, + hx: tuple[Tensor, Tensor], + permutation: Tensor | None, + ) -> tuple[Tensor, Tensor]: + if permutation is None: + return hx + return _apply_permutation(hx[0], permutation), _apply_permutation( + hx[1], permutation + ) + + # Same as above, see torch/nn/modules/module.py::_forward_unimplemented + @overload # type: ignore[override] + @torch._jit_internal._overload_method # noqa: F811 + def forward( + self, + input: Tensor, + hx: tuple[Tensor, Tensor] | None = None, + ) -> tuple[Tensor, tuple[Tensor, Tensor]]: # noqa: F811 + pass + + # Same as above, see torch/nn/modules/module.py::_forward_unimplemented + @overload + @torch._jit_internal._overload_method # noqa: F811 + def forward( + self, + input: PackedSequence, + hx: tuple[Tensor, 
Tensor] | None = None, + ) -> tuple[PackedSequence, tuple[Tensor, Tensor]]: # noqa: F811 + pass + + def forward(self, input, hx=None): # noqa: F811 + self._update_flat_weights() + + orig_input = input + # xxx: isinstance check needs to be in conditional for TorchScript to compile + batch_sizes = None + num_directions = 2 if self.bidirectional else 1 + real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size + if isinstance(orig_input, PackedSequence): + input, batch_sizes, sorted_indices, unsorted_indices = input + max_batch_size = batch_sizes[0] + if hx is None: + h_zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + real_hidden_size, + dtype=input.dtype, + device=input.device, + ) + c_zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + hx = (h_zeros, c_zeros) + else: + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + else: + if input.dim() not in (2, 3): + raise ValueError( + f"LSTM: Expected input to be 2D or 3D, got {input.dim()}D instead" + ) + is_batched = input.dim() == 3 + batch_dim = 0 if self.batch_first else 1 + if not is_batched: + input = input.unsqueeze(batch_dim) + max_batch_size = input.size(0) if self.batch_first else input.size(1) + sorted_indices = None + unsorted_indices = None + if hx is None: + h_zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + real_hidden_size, + dtype=input.dtype, + device=input.device, + ) + c_zeros = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + hx = (h_zeros, c_zeros) + self.check_forward_args(input, hx, batch_sizes) + else: + if is_batched: + if hx[0].dim() != 3 or hx[1].dim() != 3: + msg = ( + "For batched 3-D input, hx and cx should " + f"also be 3-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors" + ) + raise RuntimeError(msg) + else: + if hx[0].dim() != 2 or hx[1].dim() != 2: + msg = ( + "For unbatched 2-D input, hx and cx should " + f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors" + ) + raise RuntimeError(msg) + hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1)) + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ self.check_forward_args(input, hx, batch_sizes) + hx = self.permute_hidden(hx, sorted_indices) + + if batch_sizes is None: + result = _VF.lstm( + input, + hx, + self._flat_weights, # type: ignore[arg-type] + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + self.batch_first, + ) + else: + result = _VF.lstm( + input, + batch_sizes, + hx, + self._flat_weights, # type: ignore[arg-type] + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + ) + output = result[0] + hidden = result[1:] + # xxx: isinstance check needs to be in conditional for TorchScript to compile + if isinstance(orig_input, PackedSequence): + output_packed = PackedSequence( + output, + batch_sizes, + sorted_indices, + unsorted_indices, + ) + return output_packed, self.permute_hidden(hidden, unsorted_indices) + else: + if not is_batched: # type: ignore[possibly-undefined] + output = output.squeeze(batch_dim) # type: ignore[possibly-undefined] + hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1)) + return output, self.permute_hidden(hidden, unsorted_indices) + + +class GRU(RNNBase): + r"""__init__(input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None) + + Apply a multi-layer gated recurrent unit (GRU) RNN to an input sequence. + For each element in the input sequence, each layer computes the following + function: + + .. 
math:: + \begin{array}{ll} + r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ + z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ + n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\ + h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)} + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input + at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer + at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`, + :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively. + :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer + (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by + dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random + variable which is :math:`0` with probability :attr:`dropout`. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two GRUs together to form a `stacked GRU`, + with the second GRU taking in outputs of the first GRU and + computing the final results. Default: 1 + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + GRU layer except the last layer, with dropout probability equal to + :attr:`dropout`. 
Default: 0 + bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False`` + + Inputs: input, h_0 + * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input, + :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. + See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or + :math:`(D * \text{num\_layers}, N, H_{out})` + containing the initial hidden state for the input sequence. Defaults to zeros if not provided. + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{out} ={} & \text{hidden\_size} + \end{aligned} + + Outputs: output, h_n + * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input, + :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the GRU, for each `t`. If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. + * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or + :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state + for the input sequence. + + Attributes: + weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer + (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`. 
+ Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)` + weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer + (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)` + bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer + (b_ir|b_iz|b_in), of shape `(3*hidden_size)` + bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer + (b_hr|b_hz|b_hn), of shape `(3*hidden_size)` + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + .. note:: + For bidirectional GRUs, forward and backward are directions 0 and 1 respectively. + Example of splitting the output layers when ``batch_first=False``: + ``output.view(seq_len, batch, num_directions, hidden_size)``. + + .. note:: + ``batch_first`` argument is ignored for unbatched inputs. + + .. note:: + The calculation of new gate :math:`n_t` subtly differs from the original paper and other frameworks. + In the original implementation, the Hadamard product :math:`(\odot)` between :math:`r_t` and the + previous hidden state :math:`h_{(t-1)}` is done before the multiplication with the weight matrix + `W` and addition of bias: + + .. math:: + \begin{aligned} + n_t = \tanh(W_{in} x_t + b_{in} + W_{hn} ( r_t \odot h_{(t-1)} ) + b_{hn}) + \end{aligned} + + This is in contrast to PyTorch implementation, which is done after :math:`W_{hn} h_{(t-1)}` + + .. math:: + \begin{aligned} + n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) + \end{aligned} + + This implementation differs on purpose for efficiency. + + .. 
include:: ../cudnn_persistent_rnn.rst + + Examples:: + + >>> rnn = nn.GRU(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> output, hn = rnn(input, h0) + """ + + @overload + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = False, + dropout: float = 0.0, + bidirectional: bool = False, + device=None, + dtype=None, + ) -> None: ... + + @overload + def __init__(self, *args, **kwargs) -> None: ... + + def __init__(self, *args, **kwargs): + if "proj_size" in kwargs: + raise ValueError( + "proj_size argument is only supported for LSTM, not RNN or GRU" + ) + super().__init__("GRU", *args, **kwargs) + + @overload # type: ignore[override] + @torch._jit_internal._overload_method # noqa: F811 + def forward( + self, + input: Tensor, + hx: Tensor | None = None, + ) -> tuple[Tensor, Tensor]: # noqa: F811 + pass + + @overload + @torch._jit_internal._overload_method # noqa: F811 + def forward( + self, + input: PackedSequence, + hx: Tensor | None = None, + ) -> tuple[PackedSequence, Tensor]: # noqa: F811 + pass + + def forward(self, input, hx=None): # noqa: F811 + self._update_flat_weights() + + orig_input = input + # xxx: isinstance check needs to be in conditional for TorchScript to compile + if isinstance(orig_input, PackedSequence): + input, batch_sizes, sorted_indices, unsorted_indices = input + max_batch_size = batch_sizes[0] + if hx is None: + num_directions = 2 if self.bidirectional else 1 + hx = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + else: + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + else: + batch_sizes = None + if input.dim() not in (2, 3): + raise ValueError( + f"GRU: Expected input to be 2D or 3D, got {input.dim()}D instead" + ) + is_batched = input.dim() == 3 + batch_dim = 0 if self.batch_first else 1 + if not is_batched: + input = input.unsqueeze(batch_dim) + if hx is not None: + if hx.dim() != 2: + raise RuntimeError( + f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor" + ) + hx = hx.unsqueeze(1) + else: + if hx is not None and hx.dim() != 3: + raise RuntimeError( + f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor" + ) + max_batch_size = input.size(0) if self.batch_first else input.size(1) + sorted_indices = None + unsorted_indices = None + if hx is None: + num_directions = 2 if self.bidirectional else 1 + hx = torch.zeros( + self.num_layers * num_directions, + max_batch_size, + self.hidden_size, + dtype=input.dtype, + device=input.device, + ) + else: + # Each batch of the hidden state should match the input sequence that + # the user believes he/she is passing in. 
+ hx = self.permute_hidden(hx, sorted_indices) + + self.check_forward_args(input, hx, batch_sizes) + if batch_sizes is None: + result = _VF.gru( + input, + hx, + self._flat_weights, # type: ignore[arg-type] + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + self.batch_first, + ) + else: + result = _VF.gru( + input, + batch_sizes, + hx, + self._flat_weights, # type: ignore[arg-type] + self.bias, + self.num_layers, + self.dropout, + self.training, + self.bidirectional, + ) + output = result[0] + hidden = result[1] + + # xxx: isinstance check needs to be in conditional for TorchScript to compile + if isinstance(orig_input, PackedSequence): + output_packed = PackedSequence( + output, + batch_sizes, + sorted_indices, + unsorted_indices, + ) + return output_packed, self.permute_hidden(hidden, unsorted_indices) + else: + if not is_batched: # type: ignore[possibly-undefined] + output = output.squeeze(batch_dim) # type: ignore[possibly-undefined] + hidden = hidden.squeeze(1) + + return output, self.permute_hidden(hidden, unsorted_indices) + + +class RNNCellBase(Module): + __constants__ = ["input_size", "hidden_size", "bias"] + + input_size: int + hidden_size: int + bias: bool + weight_ih: Tensor + weight_hh: Tensor + # WARNING: bias_ih and bias_hh purposely not defined here. 
+ # See https://github.com/pytorch/pytorch/issues/39670 + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool, + num_chunks: int, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.weight_ih = Parameter( + torch.empty((num_chunks * hidden_size, input_size), **factory_kwargs) + ) + self.weight_hh = Parameter( + torch.empty((num_chunks * hidden_size, hidden_size), **factory_kwargs) + ) + if bias: + self.bias_ih = Parameter( + torch.empty(num_chunks * hidden_size, **factory_kwargs) + ) + self.bias_hh = Parameter( + torch.empty(num_chunks * hidden_size, **factory_kwargs) + ) + else: + self.register_parameter("bias_ih", None) + self.register_parameter("bias_hh", None) + + self.reset_parameters() + + def extra_repr(self) -> str: + s = "{input_size}, {hidden_size}" + if "bias" in self.__dict__ and self.bias is not True: + s += ", bias={bias}" + if "nonlinearity" in self.__dict__ and self.nonlinearity != "tanh": + s += ", nonlinearity={nonlinearity}" + return s.format(**self.__dict__) + + def reset_parameters(self) -> None: + stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0 + for weight in self.parameters(): + init.uniform_(weight, -stdv, stdv) + + +class RNNCell(RNNCellBase): + r"""An Elman RNN cell with tanh or ReLU non-linearity. + + .. math:: + + h' = \tanh(W_{ih} x + b_{ih} + W_{hh} h + b_{hh}) + + If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. 
Default: ``'tanh'`` + + Inputs: input, hidden + - **input**: tensor containing input features + - **hidden**: tensor containing the initial hidden state + Defaults to zero if not provided. + + Outputs: h' + - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state + for each element in the batch + + Shape: + - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where + :math:`H_{in}` = `input_size`. + - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden + state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided. + - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state. + + Attributes: + weight_ih: the learnable input-hidden weights, of shape + `(hidden_size, input_size)` + weight_hh: the learnable hidden-hidden weights, of shape + `(hidden_size, hidden_size)` + bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` + bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + Examples:: + + >>> rnn = nn.RNNCell(10, 20) + >>> input = torch.randn(6, 3, 10) + >>> hx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + ... hx = rnn(input[i], hx) + ... 
output.append(hx) + """ + + __constants__ = ["input_size", "hidden_size", "bias", "nonlinearity"] + nonlinearity: str + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool = True, + nonlinearity: str = "tanh", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs) + self.nonlinearity = nonlinearity + + def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor: + if input.dim() not in (1, 2): + raise ValueError( + f"RNNCell: Expected input to be 1D or 2D, got {input.dim()}D instead" + ) + if hx is not None and hx.dim() not in (1, 2): + raise ValueError( + f"RNNCell: Expected hidden to be 1D or 2D, got {hx.dim()}D instead" + ) + is_batched = input.dim() == 2 + if not is_batched: + input = input.unsqueeze(0) + + if hx is None: + hx = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + else: + hx = hx.unsqueeze(0) if not is_batched else hx + + if self.nonlinearity == "tanh": + ret = _VF.rnn_tanh_cell( + input, + hx, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + elif self.nonlinearity == "relu": + ret = _VF.rnn_relu_cell( + input, + hx, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + else: + ret = input # TODO: remove when jit supports exception flow + raise RuntimeError(f"Unknown nonlinearity: {self.nonlinearity}") + + if not is_batched: + ret = ret.squeeze(0) + + return ret + + +class LSTMCell(RNNCellBase): + r"""A long short-term memory (LSTM) cell. + + .. 
math:: + + \begin{array}{ll} + i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ + f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ + g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\ + o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ + c' = f \odot c + i \odot g \\ + h' = o \odot \tanh(c') \\ + \end{array} + + where :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and + `b_hh`. Default: ``True`` + + Inputs: input, (h_0, c_0) + - **input** of shape `(batch, input_size)` or `(input_size)`: tensor containing input features + - **h_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial hidden state + - **c_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial cell state + + If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. + + Outputs: (h_1, c_1) + - **h_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next hidden state + - **c_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next cell state + + Attributes: + weight_ih: the learnable input-hidden weights, of shape + `(4*hidden_size, input_size)` + weight_hh: the learnable hidden-hidden weights, of shape + `(4*hidden_size, hidden_size)` + bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)` + bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)` + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. 
+ + Examples:: + + >>> rnn = nn.LSTMCell(10, 20) # (input_size, hidden_size) + >>> input = torch.randn(2, 3, 10) # (time_steps, batch, input_size) + >>> hx = torch.randn(3, 20) # (batch, hidden_size) + >>> cx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(input.size()[0]): + ... hx, cx = rnn(input[i], (hx, cx)) + ... output.append(hx) + >>> output = torch.stack(output, dim=0) + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs) + + def forward( + self, input: Tensor, hx: tuple[Tensor, Tensor] | None = None + ) -> tuple[Tensor, Tensor]: + if input.dim() not in (1, 2): + raise ValueError( + f"LSTMCell: Expected input to be 1D or 2D, got {input.dim()}D instead" + ) + if hx is not None: + for idx, value in enumerate(hx): + if value.dim() not in (1, 2): + raise ValueError( + f"LSTMCell: Expected hx[{idx}] to be 1D or 2D, got {value.dim()}D instead" + ) + is_batched = input.dim() == 2 + if not is_batched: + input = input.unsqueeze(0) + + if hx is None: + zeros = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + hx = (zeros, zeros) + else: + hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0)) if not is_batched else hx + + ret = _VF.lstm_cell( + input, + hx, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + + if not is_batched: + ret = (ret[0].squeeze(0), ret[1].squeeze(0)) + return ret + + +class GRUCell(RNNCellBase): + r"""A gated recurrent unit (GRU) cell. + + .. 
math:: + + \begin{array}{ll} + r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\ + z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\ + n = \tanh(W_{in} x + b_{in} + r \odot (W_{hn} h + b_{hn})) \\ + h' = (1 - z) \odot n + z \odot h + \end{array} + + where :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and + `b_hh`. Default: ``True`` + + Inputs: input, hidden + - **input** : tensor containing input features + - **hidden** : tensor containing the initial hidden + state for each element in the batch. + Defaults to zero if not provided. + + Outputs: h' + - **h'** : tensor containing the next hidden state + for each element in the batch + + Shape: + - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where + :math:`H_{in}` = `input_size`. + - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden + state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided. + - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state. + + Attributes: + weight_ih: the learnable input-hidden weights, of shape + `(3*hidden_size, input_size)` + weight_hh: the learnable hidden-hidden weights, of shape + `(3*hidden_size, hidden_size)` + bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)` + bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)` + + .. note:: + All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` + where :math:`k = \frac{1}{\text{hidden\_size}}` + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. 
+ + Examples:: + + >>> rnn = nn.GRUCell(10, 20) + >>> input = torch.randn(6, 3, 10) + >>> hx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + ... hx = rnn(input[i], hx) + ... output.append(hx) + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs) + + def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor: + if input.dim() not in (1, 2): + raise ValueError( + f"GRUCell: Expected input to be 1D or 2D, got {input.dim()}D instead" + ) + if hx is not None and hx.dim() not in (1, 2): + raise ValueError( + f"GRUCell: Expected hidden to be 1D or 2D, got {hx.dim()}D instead" + ) + is_batched = input.dim() == 2 + if not is_batched: + input = input.unsqueeze(0) + + if hx is None: + hx = torch.zeros( + input.size(0), self.hidden_size, dtype=input.dtype, device=input.device + ) + else: + hx = hx.unsqueeze(0) if not is_batched else hx + + ret = _VF.gru_cell( + input, + hx, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + + if not is_batched: + ret = ret.squeeze(0) + + return ret diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/sparse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec531abce695374b919fbf92d4863ce73da515f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/sparse.py @@ -0,0 +1,549 @@ +# mypy: allow-untyped-defs + +import torch +from torch import Tensor +from torch.nn import functional as F, init +from torch.nn.parameter import Parameter + +from .module import Module + + +__all__ = ["Embedding", "EmbeddingBag"] + + +class Embedding(Module): + r"""A simple lookup table that stores embeddings of a fixed 
dictionary and size. + + This module is often used to store word embeddings and retrieve them using indices. + The input to the module is a list of indices, and the output is the corresponding + word embeddings. + + Args: + num_embeddings (int): size of the dictionary of embeddings + embedding_dim (int): the size of each embedding vector + padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; + therefore, the embedding vector at :attr:`padding_idx` is not updated during training, + i.e. it remains as a fixed "pad". For a newly constructed Embedding, + the embedding vector at :attr:`padding_idx` will default to all zeros, + but can be updated to another value to be used as the padding vector. + max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` + is renormalized to have norm :attr:`max_norm`. + norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. + scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse of frequency of + the words in the mini-batch. Default ``False``. + sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. + See Notes for more details regarding sparse gradients. + + Attributes: + weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) + initialized from :math:`\mathcal{N}(0, 1)` + + Shape: + - Input: :math:`(*)`, IntTensor or LongTensor of arbitrary shape containing the indices to extract + - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}` + + .. note:: + Keep in mind that only a limited number of optimizers support + sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), + :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`) + + .. 
note:: + When :attr:`max_norm` is not ``None``, :class:`Embedding`'s forward method will modify the + :attr:`weight` tensor in-place. Since tensors needed for gradient computations cannot be + modified in-place, performing a differentiable operation on ``Embedding.weight`` before + calling :class:`Embedding`'s forward method requires cloning ``Embedding.weight`` when + :attr:`max_norm` is not ``None``. For example:: + + n, d, m = 3, 5, 7 + embedding = nn.Embedding(n, d, max_norm=1.0) + W = torch.randn((m, d), requires_grad=True) + idx = torch.tensor([1, 2]) + a = ( + embedding.weight.clone() @ W.t() + ) # weight must be cloned for this to be differentiable + b = embedding(idx) @ W.t() # modifies weight in-place + out = a.unsqueeze(0) + b.unsqueeze(1) + loss = out.sigmoid().prod() + loss.backward() + + Examples:: + + >>> # an Embedding module containing 10 tensors of size 3 + >>> embedding = nn.Embedding(10, 3) + >>> # a batch of 2 samples of 4 indices each + >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[[-0.0251, -1.6902, 0.7172], + [-0.6431, 0.0748, 0.6969], + [ 1.4970, 1.3448, -0.9685], + [-0.3677, -2.7265, -0.1685]], + + [[ 1.4970, 1.3448, -0.9685], + [ 0.4362, -0.4004, 0.9400], + [-0.6431, 0.0748, 0.6969], + [ 0.9124, -2.3616, 1.1151]]]) + + + >>> # example with padding_idx + >>> embedding = nn.Embedding(10, 3, padding_idx=0) + >>> input = torch.LongTensor([[0, 2, 0, 5]]) + >>> embedding(input) + tensor([[[ 0.0000, 0.0000, 0.0000], + [ 0.1535, -2.0309, 0.9315], + [ 0.0000, 0.0000, 0.0000], + [-0.1655, 0.9897, 0.0635]]]) + + >>> # example of changing `pad` vector + >>> padding_idx = 0 + >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx) + >>> embedding.weight + Parameter containing: + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + >>> with torch.no_grad(): + ... 
embedding.weight[padding_idx] = torch.ones(3) + >>> embedding.weight + Parameter containing: + tensor([[ 1.0000, 1.0000, 1.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + """ + + __constants__ = [ + "num_embeddings", + "embedding_dim", + "padding_idx", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "sparse", + ] + + num_embeddings: int + embedding_dim: int + padding_idx: int | None + max_norm: float | None + norm_type: float + scale_grad_by_freq: bool + weight: Tensor + freeze: bool + sparse: bool + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int | None = None, + max_norm: float | None = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Tensor | None = None, + _freeze: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + if padding_idx is not None: + if padding_idx > 0: + assert padding_idx < self.num_embeddings, ( + "Padding_idx must be within num_embeddings" + ) + elif padding_idx < 0: + assert padding_idx >= -self.num_embeddings, ( + "Padding_idx must be within num_embeddings" + ) + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if _weight is None: + self.weight = Parameter( + torch.empty((num_embeddings, embedding_dim), **factory_kwargs), + requires_grad=not _freeze, + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "Shape of weight does not match num_embeddings and embedding_dim" + self.weight = Parameter(_weight, requires_grad=not _freeze) + + self.sparse = sparse + + def reset_parameters(self) -> None: + init.normal_(self.weight) + self._fill_padding_idx_with_zero() + 
+ def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def forward(self, input: Tensor) -> Tensor: + return F.embedding( + input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + if self.sparse is not False: + s += ", sparse=True" + return s.format(**self.__dict__) + + @classmethod + def from_pretrained( + cls, + embeddings, + freeze=True, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + ): + r"""Create Embedding instance from given 2-dimensional FloatTensor. + + Args: + embeddings (Tensor): FloatTensor containing weights for the Embedding. + First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``. + freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process. + Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True`` + padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; + therefore, the embedding vector at :attr:`padding_idx` is not updated during training, + i.e. it remains as a fixed "pad". + max_norm (float, optional): See module initialization documentation. + norm_type (float, optional): See module initialization documentation. Default ``2``. + scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``. + sparse (bool, optional): See module initialization documentation. 
+ + Examples:: + + >>> # FloatTensor containing pretrained weights + >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>> embedding = nn.Embedding.from_pretrained(weight) + >>> # Get embeddings for index 1 + >>> input = torch.LongTensor([1]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[ 4.0000, 5.1000, 6.3000]]) + """ + assert embeddings.dim() == 2, ( + "Embeddings parameter is expected to be 2-dimensional" + ) + rows, cols = embeddings.shape + embedding = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + _freeze=freeze, + padding_idx=padding_idx, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse, + ) + return embedding + + +class EmbeddingBag(Module): + r"""Compute sums or means of 'bags' of embeddings, without instantiating the intermediate embeddings. + + For bags of constant length, no :attr:`per_sample_weights`, no indices equal to :attr:`padding_idx`, + and with 2D inputs, this class + + * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``, + * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``, + * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``. + + However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these + operations. + + EmbeddingBag also supports per-sample weights as an argument to the forward + pass. This scales the output of the Embedding before performing a weighted + reduction as specified by ``mode``. If :attr:`per_sample_weights` is passed, the + only supported ``mode`` is ``"sum"``, which computes a weighted sum according to + :attr:`per_sample_weights`. 
+ + Args: + num_embeddings (int): size of the dictionary of embeddings + embedding_dim (int): the size of each embedding vector + max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` + is renormalized to have norm :attr:`max_norm`. + norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. + scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of + the words in the mini-batch. Default ``False``. + Note: this option is not supported when ``mode="max"``. + mode (str, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. + ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights` + into consideration. ``"mean"`` computes the average of the values + in the bag, ``"max"`` computes the max value over each bag. + Default: ``"mean"`` + sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See + Notes for more details regarding sparse gradients. Note: this option is not + supported when ``mode="max"``. + include_last_offset (bool, optional): if ``True``, the size of offsets is equal to the number of bags + 1. + The last element is the size of the input, or the ending index position + of the last bag (sequence). This matches the CSR format. Ignored when + input is 2D. Default ``False``. + padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the + gradient; therefore, the embedding vector at :attr:`padding_idx` is not updated + during training, i.e. it remains as a fixed "pad". For a newly constructed + EmbeddingBag, the embedding vector at :attr:`padding_idx` will default to all + zeros, but can be updated to another value to be used as the padding vector. + Note that the embedding vector at :attr:`padding_idx` is excluded from the + reduction. 
+ + Attributes: + weight (Tensor): the learnable weights of the module of shape `(num_embeddings, embedding_dim)` + initialized from :math:`\mathcal{N}(0, 1)`. + + Examples:: + + >>> # an EmbeddingBag module containing 10 tensors of size 3 + >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum') + >>> # a batch of 2 samples of 4 indices each + >>> input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) + >>> offsets = torch.tensor([0, 4], dtype=torch.long) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding_sum(input, offsets) + tensor([[-0.8861, -5.4350, -0.0523], + [ 1.1306, -2.5798, -1.0044]]) + + >>> # Example with padding_idx + >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2) + >>> input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long) + >>> offsets = torch.tensor([0, 4], dtype=torch.long) + >>> embedding_sum(input, offsets) + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7082, 3.2145, -2.6251]]) + + >>> # An EmbeddingBag can be loaded from an Embedding like so + >>> embedding = nn.Embedding(10, 3, padding_idx=2) + >>> embedding_sum = nn.EmbeddingBag.from_pretrained( + embedding.weight, + padding_idx=embedding.padding_idx, + mode='sum') + """ + + __constants__ = [ + "num_embeddings", + "embedding_dim", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "mode", + "sparse", + "include_last_offset", + "padding_idx", + ] + + num_embeddings: int + embedding_dim: int + max_norm: float | None + norm_type: float + scale_grad_by_freq: bool + weight: Tensor + mode: str + sparse: bool + include_last_offset: bool + padding_idx: int | None + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + max_norm: float | None = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + _weight: Tensor | None = None, + include_last_offset: bool = False, + padding_idx: int | None = None, + device=None, + dtype=None, + ) -> None: + factory_kwargs = 
{"device": device, "dtype": dtype} + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if padding_idx is not None: + if padding_idx > 0: + assert padding_idx < self.num_embeddings, ( + "padding_idx must be within num_embeddings" + ) + elif padding_idx < 0: + assert padding_idx >= -self.num_embeddings, ( + "padding_idx must be within num_embeddings" + ) + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + if _weight is None: + self.weight = Parameter( + torch.empty((num_embeddings, embedding_dim), **factory_kwargs) + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "Shape of weight does not match num_embeddings and embedding_dim" + self.weight = Parameter(_weight) + self.mode = mode + self.sparse = sparse + self.include_last_offset = include_last_offset + + def reset_parameters(self) -> None: + init.normal_(self.weight) + self._fill_padding_idx_with_zero() + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def forward( + self, + input: Tensor, + offsets: Tensor | None = None, + per_sample_weights: Tensor | None = None, + ) -> Tensor: + """Forward pass of EmbeddingBag. + + Args: + input (Tensor): Tensor containing bags of indices into the embedding matrix. + offsets (Tensor, optional): Only used when :attr:`input` is 1D. :attr:`offsets` determines + the starting index position of each bag (sequence) in :attr:`input`. + per_sample_weights (Tensor, optional): a tensor of float / double weights, or None + to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` + must have exactly the same shape as input and is treated as having the same + :attr:`offsets`, if those are not ``None``. 
Only supported for ``mode='sum'``. + + Returns: + Tensor output shape of `(B, embedding_dim)`. + + .. note:: + + A few notes about ``input`` and ``offsets``: + + - :attr:`input` and :attr:`offsets` have to be of the same type, either int or long + + - If :attr:`input` is 2D of shape `(B, N)`, it will be treated as ``B`` bags (sequences) + each of fixed length ``N``, and this will return ``B`` values aggregated in a way + depending on the :attr:`mode`. :attr:`offsets` is ignored and required to be ``None`` in this case. + + - If :attr:`input` is 1D of shape `(N)`, it will be treated as a concatenation of + multiple bags (sequences). :attr:`offsets` is required to be a 1D tensor containing the + starting index positions of each bag in :attr:`input`. Therefore, for :attr:`offsets` of shape `(B)`, + :attr:`input` will be viewed as having ``B`` bags. Empty bags (i.e., having 0-length) will have + returned vectors filled by zeros. + """ + return F.embedding_bag( + input, + self.weight, + offsets, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + per_sample_weights, + self.include_last_offset, + self.padding_idx, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + s += ", mode={mode}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + return s.format(**{k: repr(v) for k, v in self.__dict__.items()}) + + @classmethod + def from_pretrained( + cls, + embeddings: Tensor, + freeze: bool = True, + max_norm: float | None = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + include_last_offset: bool = False, + padding_idx: int | None = None, + ) -> "EmbeddingBag": + r"""Create EmbeddingBag instance from given 
2-dimensional FloatTensor. + + Args: + embeddings (Tensor): FloatTensor containing weights for the EmbeddingBag. + First dimension is being passed to EmbeddingBag as 'num_embeddings', second as 'embedding_dim'. + freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process. + Equivalent to ``embeddingbag.weight.requires_grad = False``. Default: ``True`` + max_norm (float, optional): See module initialization documentation. Default: ``None`` + norm_type (float, optional): See module initialization documentation. Default ``2``. + scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``. + mode (str, optional): See module initialization documentation. Default: ``"mean"`` + sparse (bool, optional): See module initialization documentation. Default: ``False``. + include_last_offset (bool, optional): See module initialization documentation. Default: ``False``. + padding_idx (int, optional): See module initialization documentation. Default: ``None``. 
+ + Examples:: + + >>> # FloatTensor containing pretrained weights + >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight) + >>> # Get embeddings for index 1 + >>> input = torch.LongTensor([[1, 0]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embeddingbag(input) + tensor([[ 2.5000, 3.7000, 4.6500]]) + """ + assert embeddings.dim() == 2, ( + "Embeddings parameter is expected to be 2-dimensional" + ) + rows, cols = embeddings.shape + embeddingbag = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + mode=mode, + sparse=sparse, + include_last_offset=include_last_offset, + padding_idx=padding_idx, + ) + embeddingbag.weight.requires_grad = not freeze + return embeddingbag diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6841e85ed6d2e423aa30e95b5b1d3e62f30ec9fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/transformer.py @@ -0,0 +1,1256 @@ +# mypy: allow-untyped-defs +import copy +import warnings +from collections.abc import Callable +from typing import Any + +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.nn.init import xavier_uniform_ + +from .activation import MultiheadAttention +from .container import ModuleList +from .dropout import Dropout +from .linear import Linear +from .module import Module +from .normalization import LayerNorm + + +__all__ = [ + "Transformer", + "TransformerEncoder", + "TransformerDecoder", + "TransformerEncoderLayer", + "TransformerDecoderLayer", +] + + +def _generate_square_subsequent_mask( + sz: int, + device: torch.device | None = None, + dtype: 
torch.dtype | None = None, +) -> Tensor: + r"""Generate a square causal mask for the sequence. + + The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0). + """ + return torch.triu( + torch.full((sz, sz), float("-inf"), dtype=dtype, device=device), + diagonal=1, + ) + + +def _get_seq_len(src: Tensor, batch_first: bool) -> int | None: + if src.is_nested: + return None + else: + src_size = src.size() + if len(src_size) == 2: + # unbatched: S, E + return src_size[0] + else: + # batched: B, S, E if batch_first else S, B, E + seq_len_pos = 1 if batch_first else 0 + return src_size[seq_len_pos] + + +class Transformer(Module): + r"""A basic transformer layer. + + + This Transformer layer implements the original Transformer architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer Transformer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build an efficient transformer layer from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. + + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of encoder/decoder intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). 
+ layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before + other attention and feedforward operations, otherwise after. Default: ``False`` (after). + bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive + bias. Default: ``True``. + + Examples: + >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12) + >>> src = torch.rand((10, 32, 512)) + >>> tgt = torch.rand((20, 32, 512)) + >>> out = transformer_model(src, tgt) + + Note: A full example to apply nn.Transformer module for the word language model is available in + https://github.com/pytorch/examples/tree/master/word_language_model + """ + + def __init__( + self, + d_model: int = 512, + nhead: int = 8, + num_encoder_layers: int = 6, + num_decoder_layers: int = 6, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: str | Callable[[Tensor], Tensor] = F.relu, + custom_encoder: Any | None = None, + custom_decoder: Any | None = None, + layer_norm_eps: float = 1e-5, + batch_first: bool = False, + norm_first: bool = False, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}") + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer( + d_model, + nhead, + dim_feedforward, + dropout, + activation, + layer_norm_eps, + batch_first, + norm_first, + bias, + **factory_kwargs, + ) + encoder_norm = LayerNorm( + d_model, + eps=layer_norm_eps, + bias=bias, + # pyrefly: ignore [bad-argument-type] + **factory_kwargs, + ) + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, 
encoder_norm + ) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, + nhead, + dim_feedforward, + dropout, + activation, + layer_norm_eps, + batch_first, + norm_first, + bias, + **factory_kwargs, + ) + decoder_norm = LayerNorm( + d_model, + eps=layer_norm_eps, + bias=bias, + # pyrefly: ignore [bad-argument-type] + **factory_kwargs, + ) + self.decoder = TransformerDecoder( + decoder_layer, num_decoder_layers, decoder_norm + ) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.batch_first = batch_first + + def forward( + self, + src: Tensor, + tgt: Tensor, + src_mask: Tensor | None = None, + tgt_mask: Tensor | None = None, + memory_mask: Tensor | None = None, + src_key_padding_mask: Tensor | None = None, + tgt_key_padding_mask: Tensor | None = None, + memory_key_padding_mask: Tensor | None = None, + src_is_causal: bool | None = None, + tgt_is_causal: bool | None = None, + memory_is_causal: bool = False, + ) -> Tensor: + r"""Take in and process masked source/target sequences. + + .. note:: + + If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a ``True`` value are + not allowed to participate in the attention, + which is the opposite of the definition for :attr:`attn_mask` + in :func:`torch.nn.functional.scaled_dot_product_attention`. + + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + src_mask: the additive mask for the src sequence (optional). + tgt_mask: the additive mask for the tgt sequence (optional). + memory_mask: the additive mask for the encoder output (optional). + src_key_padding_mask: the Tensor mask for src keys per batch (optional). + tgt_key_padding_mask: the Tensor mask for tgt keys per batch (optional). + memory_key_padding_mask: the Tensor mask for memory keys per batch (optional). 
+ src_is_causal: If specified, applies a causal mask as ``src_mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``src_is_causal`` provides a hint that ``src_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``tgt_is_causal`` provides a hint that ``tgt_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + memory_is_causal: If specified, applies a causal mask as + ``memory_mask``. + Default: ``False``. + Warning: + ``memory_is_causal`` provides a hint that + ``memory_mask`` is the causal mask. Providing incorrect + hints can result in incorrect execution, including + forward and backward compatibility. + + Shape: + - src: :math:`(S, E)` for unbatched input, :math:`(S, N, E)` if `batch_first=False` or + `(N, S, E)` if `batch_first=True`. + - tgt: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or + `(N, T, E)` if `batch_first=True`. + - src_mask: :math:`(S, S)` or :math:`(N\cdot\text{num\_heads}, S, S)`. + - tgt_mask: :math:`(T, T)` or :math:`(N\cdot\text{num\_heads}, T, T)`. + - memory_mask: :math:`(T, S)`. + - src_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`. + - tgt_key_padding_mask: :math:`(T)` for unbatched input otherwise :math:`(N, T)`. + - memory_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`. + + Note: [src/tgt/memory]_mask ensures that position :math:`i` is allowed to attend the unmasked + positions. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. 
+ [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by + the attention. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + + - output: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or + `(N, T, E)` if `batch_first=True`. + + Note: Due to the multi-head attention architecture in the transformer model, + the output sequence length of a transformer is same as the input sequence + (i.e. target) length of the decoder. + + where :math:`S` is the source sequence length, :math:`T` is the target sequence length, :math:`N` is the + batch size, :math:`E` is the feature number + + Examples: + >>> # xdoctest: +SKIP + >>> output = transformer_model( + ... src, tgt, src_mask=src_mask, tgt_mask=tgt_mask + ... ) + """ + is_batched = src.dim() == 3 + if not self.batch_first and src.size(1) != tgt.size(1) and is_batched: + raise RuntimeError("the batch number of src and tgt must be equal") + elif self.batch_first and src.size(0) != tgt.size(0) and is_batched: + raise RuntimeError("the batch number of src and tgt must be equal") + + if src.size(-1) != self.d_model or tgt.size(-1) != self.d_model: + raise RuntimeError( + "the feature number of src and tgt must be equal to d_model" + ) + + memory = self.encoder( + src, + mask=src_mask, + src_key_padding_mask=src_key_padding_mask, + is_causal=src_is_causal, + ) + output = self.decoder( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + tgt_is_causal=tgt_is_causal, + memory_is_causal=memory_is_causal, + ) + return output + + @staticmethod + def generate_square_subsequent_mask( + sz: int, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> Tensor: + r"""Generate a square causal mask for the sequence. 
+ + The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0). + """ + return _generate_square_subsequent_mask(sz, dtype=dtype, device=device) + + def _reset_parameters(self) -> None: + r"""Initiate parameters in the transformer model.""" + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(Module): + r"""TransformerEncoder is a stack of N encoder layers. + + This TransformerEncoder layer implements the original architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer Transformer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build efficient layers from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. + + .. warning:: + All layers in the TransformerEncoder are initialized with the same parameters. + It is recommended to manually initialize the layers after creating the TransformerEncoder instance. + + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + enable_nested_tensor: if True, input will automatically convert to nested tensor + (and convert back on output). This will improve the overall performance of + TransformerEncoder when padding rate is high. Default: ``True`` (enabled). 
+ + Examples: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + + __constants__ = ["norm"] + + def __init__( + self, + encoder_layer: "TransformerEncoderLayer", + num_layers: int, + norm: Module | None = None, + enable_nested_tensor: bool = True, + mask_check: bool = True, + ) -> None: + super().__init__() + torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}") + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + # this attribute saves the value providedat object construction + self.enable_nested_tensor = enable_nested_tensor + # this attribute controls whether nested tensors are used + self.use_nested_tensor = enable_nested_tensor + self.mask_check = mask_check + + enc_layer = "encoder_layer" + why_not_sparsity_fast_path = "" + if not isinstance(encoder_layer, torch.nn.TransformerEncoderLayer): + why_not_sparsity_fast_path = f"{enc_layer} was not TransformerEncoderLayer" + elif encoder_layer.norm_first: + why_not_sparsity_fast_path = f"{enc_layer}.norm_first was True" + elif not encoder_layer.self_attn.batch_first: + why_not_sparsity_fast_path = ( + f"{enc_layer}.self_attn.batch_first was not True" + + "(use batch_first for better inference performance)" + ) + elif not encoder_layer.self_attn._qkv_same_embed_dim: + why_not_sparsity_fast_path = ( + f"{enc_layer}.self_attn._qkv_same_embed_dim was not True" + ) + elif encoder_layer.self_attn.in_proj_bias is None: + why_not_sparsity_fast_path = f"{enc_layer}.self_attn was passed bias=False" + elif not encoder_layer.activation_relu_or_gelu: + why_not_sparsity_fast_path = ( + f"{enc_layer}.activation_relu_or_gelu was not True" + ) + elif encoder_layer.norm1.eps != encoder_layer.norm2.eps: + why_not_sparsity_fast_path = ( + f"{enc_layer}.norm1.eps was not equal to 
{enc_layer}.norm2.eps" + ) + elif encoder_layer.self_attn.num_heads % 2 == 1: + why_not_sparsity_fast_path = f"{enc_layer}.self_attn.num_heads is odd" + + if enable_nested_tensor and why_not_sparsity_fast_path: + warnings.warn( + f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}", + stacklevel=2, + ) + self.use_nested_tensor = False + + def forward( + self, + src: Tensor, + mask: Tensor | None = None, + src_key_padding_mask: Tensor | None = None, + is_causal: bool | None = None, + ) -> Tensor: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + is_causal: If specified, applies a causal mask as ``mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``is_causal`` provides a hint that ``mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. 
+ """ + src_key_padding_mask = F._canonical_mask( + mask=src_key_padding_mask, + mask_name="src_key_padding_mask", + other_type=F._none_or_dtype(mask), + other_name="mask", + target_type=src.dtype, + ) + + mask = F._canonical_mask( + mask=mask, + mask_name="mask", + other_type=None, + other_name="", + target_type=src.dtype, + check_other=False, + ) + + output = src + convert_to_nested = False + first_layer = self.layers[0] + src_key_padding_mask_for_layers = src_key_padding_mask + why_not_sparsity_fast_path = "" + str_first_layer = "self.layers[0]" + batch_first = first_layer.self_attn.batch_first + is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled() + do_mask_check = getattr(self, "mask_check", True) + + if not is_fastpath_enabled: + why_not_sparsity_fast_path = ( + "torch.backends.mha.get_fastpath_enabled() was not True" + ) + elif not hasattr(self, "use_nested_tensor"): + why_not_sparsity_fast_path = "use_nested_tensor attribute not present" + elif not self.use_nested_tensor: + why_not_sparsity_fast_path = ( + "self.use_nested_tensor (set in init) was not True" + ) + elif first_layer.training: + why_not_sparsity_fast_path = f"{str_first_layer} was in training mode" + elif src.dim() != 3: + why_not_sparsity_fast_path = ( + f"input not batched; expected src.dim() of 3 but got {src.dim()}" + ) + elif src_key_padding_mask is None: + why_not_sparsity_fast_path = "src_key_padding_mask was None" + # This check avoids a call to torch._nested_tensor_from_mask_left_aligned() that + # breaks in torch.compile. 
+ elif do_mask_check and torch.compiler.is_compiling(): + why_not_sparsity_fast_path = ( + "mask_check enabled with torch.compile or torch.export" + ) + elif do_mask_check and not torch._nested_tensor_from_mask_left_aligned( + src, src_key_padding_mask.logical_not() + ): + why_not_sparsity_fast_path = "mask_check enabled, and src and src_key_padding_mask was not left aligned" + elif output.is_nested: + why_not_sparsity_fast_path = "NestedTensor input is not supported" + elif mask is not None: + why_not_sparsity_fast_path = ( + "src_key_padding_mask and mask were both supplied" + ) + elif torch.is_autocast_enabled(): + why_not_sparsity_fast_path = "autocast is enabled" + + if not why_not_sparsity_fast_path: + tensor_args = ( + src, + first_layer.self_attn.in_proj_weight, + first_layer.self_attn.in_proj_bias, + first_layer.self_attn.out_proj.weight, + first_layer.self_attn.out_proj.bias, + first_layer.norm1.weight, + first_layer.norm1.bias, + first_layer.norm2.weight, + first_layer.norm2.bias, + first_layer.linear1.weight, + first_layer.linear1.bias, + first_layer.linear2.weight, + first_layer.linear2.bias, + ) + _supported_device_type = [ + "cpu", + "cuda", + "xpu", + torch.utils.backend_registration._privateuse1_backend_name, + ] + if torch.overrides.has_torch_function(tensor_args): + why_not_sparsity_fast_path = "some Tensor argument has_torch_function" + elif src.device.type not in _supported_device_type: + why_not_sparsity_fast_path = ( + f"src device is neither one of {_supported_device_type}" + ) + elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args): + why_not_sparsity_fast_path = ( + "grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad" + ) + + if (not why_not_sparsity_fast_path) and (src_key_padding_mask is not None): + convert_to_nested = True + output = torch._nested_tensor_from_mask( + output, src_key_padding_mask.logical_not(), mask_check=False + ) + 
src_key_padding_mask_for_layers = None + + seq_len = _get_seq_len(src, batch_first) + is_causal = _detect_is_causal_mask(mask, is_causal, seq_len) + + for mod in self.layers: + output = mod( + output, + src_mask=mask, + is_causal=is_causal, + src_key_padding_mask=src_key_padding_mask_for_layers, + ) + + if convert_to_nested: + output = output.to_padded_tensor(0.0, src.size()) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(Module): + r"""TransformerDecoder is a stack of N decoder layers. + + This TransformerDecoder layer implements the original architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer Transformer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build efficient layers from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. + + .. warning:: + All layers in the TransformerDecoder are initialized with the same parameters. + It is recommended to manually initialize the layers after creating the TransformerDecoder instance. + + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). 
+ + Examples: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = transformer_decoder(tgt, memory) + """ + + __constants__ = ["norm"] + + def __init__( + self, + decoder_layer: "TransformerDecoderLayer", + num_layers: int, + norm: Module | None = None, + ) -> None: + super().__init__() + torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}") + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + tgt: Tensor, + memory: Tensor, + tgt_mask: Tensor | None = None, + memory_mask: Tensor | None = None, + tgt_key_padding_mask: Tensor | None = None, + memory_key_padding_mask: Tensor | None = None, + tgt_is_causal: bool | None = None, + memory_is_causal: bool = False, + ) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer in turn. + + Args: + tgt: the sequence to the decoder (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + tgt_is_causal: If specified, applies a causal mask as ``tgt mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``tgt_is_causal`` provides a hint that ``tgt_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + memory_is_causal: If specified, applies a causal mask as + ``memory mask``. + Default: ``False``. + Warning: + ``memory_is_causal`` provides a hint that + ``memory_mask`` is the causal mask. 
Providing incorrect + hints can result in incorrect execution, including + forward and backward compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. + """ + output = tgt + + seq_len = _get_seq_len(tgt, self.layers[0].self_attn.batch_first) + tgt_is_causal = _detect_is_causal_mask(tgt_mask, tgt_is_causal, seq_len) + + for mod in self.layers: + output = mod( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + tgt_is_causal=tgt_is_causal, + memory_is_causal=memory_is_causal, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(Module): + r"""TransformerEncoderLayer is made up of self-attn and feedforward network. + + This TransformerEncoderLayer implements the original architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer Transformer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build efficient layers from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. + + TransformerEncoderLayer can handle either traditional torch.tensor inputs, + or Nested Tensor inputs. Derived classes are expected to similarly accept + both input formats. (Not all combinations of inputs are currently + supported by TransformerEncoderLayer while Nested Tensor is in prototype + state.) + + If you are implementing a custom layer, you may derive it either from + the Module or TransformerEncoderLayer class. If your custom layer + supports both torch.Tensors and Nested Tensors inputs, make its + implementation a derived class of TransformerEncoderLayer. 
If your custom + Layer supports only torch.Tensor inputs, derive its implementation from + Module. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of the intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, layer norm is done prior to attention and feedforward + operations, respectively. Otherwise it's done after. Default: ``False`` (after). + bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive + bias. Default: ``True``. + + Examples: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> out = encoder_layer(src) + + Alternatively, when ``batch_first`` is ``True``: + >>> encoder_layer = nn.TransformerEncoderLayer( + ... d_model=512, nhead=8, batch_first=True + ... 
) + >>> src = torch.rand(32, 10, 512) + >>> out = encoder_layer(src) + + Fast path: + forward() will use a special optimized implementation described in + `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following + conditions are met: + + - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor + argument ``requires_grad`` + - training is disabled (using ``.eval()``) + - batch_first is ``True`` and the input is batched (i.e., ``src.dim() == 3``) + - activation is one of: ``"relu"``, ``"gelu"``, ``torch.functional.relu``, or ``torch.functional.gelu`` + - at most one of ``src_mask`` and ``src_key_padding_mask`` is passed + - if src is a `NestedTensor `_, neither ``src_mask`` + nor ``src_key_padding_mask`` is passed + - the two ``LayerNorm`` instances have a consistent ``eps`` value (this will naturally be the case + unless the caller has manually modified one without modifying the other) + + If the optimized implementation is in use, a + `NestedTensor `_ can be + passed for ``src`` to represent padding more efficiently than using a padding + mask. In this case, a `NestedTensor `_ will be + returned, and an additional speedup proportional to the fraction of the input that + is padding can be expected. + + .. 
_`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`: + https://arxiv.org/abs/2205.14135 + + """ + + __constants__ = ["norm_first"] + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: str | Callable[[Tensor], Tensor] = F.relu, + layer_norm_eps: float = 1e-5, + batch_first: bool = False, + norm_first: bool = False, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=dropout, + bias=bias, + batch_first=batch_first, + **factory_kwargs, + ) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs) + + self.norm_first = norm_first + # pyrefly: ignore [bad-argument-type] + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + # pyrefly: ignore [bad-argument-type] + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + + # Legacy string support for activation function. + if isinstance(activation, str): + activation = _get_activation_fn(activation) + + # We can't test self.activation in forward() in TorchScript, + # so stash some information about it instead. 
+ if activation is F.relu or isinstance(activation, torch.nn.ReLU): + self.activation_relu_or_gelu = 1 + elif activation is F.gelu or isinstance(activation, torch.nn.GELU): + self.activation_relu_or_gelu = 2 + else: + self.activation_relu_or_gelu = 0 + self.activation = activation + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "activation"): + self.activation = F.relu + + def forward( + self, + src: Tensor, + src_mask: Tensor | None = None, + src_key_padding_mask: Tensor | None = None, + is_causal: bool = False, + ) -> Tensor: + r"""Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + is_causal: If specified, applies a causal mask as ``src mask``. + Default: ``False``. + Warning: + ``is_causal`` provides a hint that ``src_mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. 
+ """ + src_key_padding_mask = F._canonical_mask( + mask=src_key_padding_mask, + mask_name="src_key_padding_mask", + other_type=F._none_or_dtype(src_mask), + other_name="src_mask", + target_type=src.dtype, + ) + + src_mask = F._canonical_mask( + mask=src_mask, + mask_name="src_mask", + other_type=None, + other_name="", + target_type=src.dtype, + check_other=False, + ) + + is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled() + + why_not_sparsity_fast_path = "" + if not is_fastpath_enabled: + why_not_sparsity_fast_path = ( + "torch.backends.mha.get_fastpath_enabled() was not True" + ) + elif src.dim() != 3: + why_not_sparsity_fast_path = ( + f"input not batched; expected src.dim() of 3 but got {src.dim()}" + ) + elif self.training: + why_not_sparsity_fast_path = "training is enabled" + elif not self.self_attn.batch_first: + why_not_sparsity_fast_path = "self_attn.batch_first was not True" + elif self.self_attn.in_proj_bias is None: + why_not_sparsity_fast_path = "self_attn was passed bias=False" + elif not self.self_attn._qkv_same_embed_dim: + why_not_sparsity_fast_path = "self_attn._qkv_same_embed_dim was not True" + elif not self.activation_relu_or_gelu: + why_not_sparsity_fast_path = "activation_relu_or_gelu was not True" + elif self.norm1.eps != self.norm2.eps: + why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps" + elif src.is_nested and ( + src_key_padding_mask is not None or src_mask is not None + ): + why_not_sparsity_fast_path = "neither src_key_padding_mask nor src_mask are not supported with NestedTensor input" + elif self.self_attn.num_heads % 2 == 1: + why_not_sparsity_fast_path = "num_head is odd" + elif torch.is_autocast_enabled(): + why_not_sparsity_fast_path = "autocast is enabled" + elif any( + len(getattr(m, "_forward_hooks", {})) + + len(getattr(m, "_forward_pre_hooks", {})) + for m in self.modules() + ): + why_not_sparsity_fast_path = "forward pre-/hooks are attached to the module" + if not why_not_sparsity_fast_path: + 
tensor_args = ( + src, + self.self_attn.in_proj_weight, + self.self_attn.in_proj_bias, + self.self_attn.out_proj.weight, + self.self_attn.out_proj.bias, + self.norm1.weight, + self.norm1.bias, + self.norm2.weight, + self.norm2.bias, + self.linear1.weight, + self.linear1.bias, + self.linear2.weight, + self.linear2.bias, + ) + + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. + _supported_device_type = [ + "cpu", + "cuda", + "xpu", + torch.utils.backend_registration._privateuse1_backend_name, + ] + if torch.overrides.has_torch_function(tensor_args): + why_not_sparsity_fast_path = "some Tensor argument has_torch_function" + elif not all( + (x.device.type in _supported_device_type) for x in tensor_args + ): + why_not_sparsity_fast_path = ( + "some Tensor argument's device is neither one of " + f"{_supported_device_type}" + ) + elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args): + why_not_sparsity_fast_path = ( + "grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad" + ) + + if not why_not_sparsity_fast_path: + merged_mask, mask_type = self.self_attn.merge_masks( + src_mask, src_key_padding_mask, src + ) + return torch._transformer_encoder_layer_fwd( + src, + self.self_attn.embed_dim, + self.self_attn.num_heads, + self.self_attn.in_proj_weight, + self.self_attn.in_proj_bias, + self.self_attn.out_proj.weight, + self.self_attn.out_proj.bias, + self.activation_relu_or_gelu == 2, + self.norm_first, + self.norm1.eps, + self.norm1.weight, + self.norm1.bias, + self.norm2.weight, + self.norm2.bias, + self.linear1.weight, + self.linear1.bias, + self.linear2.weight, + self.linear2.bias, + merged_mask, + mask_type, + ) + + # see Fig. 
1 of https://arxiv.org/pdf/2002.04745v1.pdf + x = src + if self.norm_first: + x = x + self._sa_block( + self.norm1(x), src_mask, src_key_padding_mask, is_causal=is_causal + ) + x = x + self._ff_block(self.norm2(x)) + else: + x = self.norm1( + x + + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal) + ) + x = self.norm2(x + self._ff_block(x)) + + return x + + # self-attention block + def _sa_block( + self, + x: Tensor, + attn_mask: Tensor | None, + key_padding_mask: Tensor | None, + is_causal: bool = False, + ) -> Tensor: + x = self.self_attn( + x, + x, + x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + is_causal=is_causal, + )[0] + return self.dropout1(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class TransformerDecoderLayer(Module): + r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. + + This TransformerDecoderLayer implements the original architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer Transformer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build efficient layers from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of the intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. 
Default: relu + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, layer norm is done prior to self attention, multihead + attention and feedforward operations, respectively. Otherwise it's done after. + Default: ``False`` (after). + bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive + bias. Default: ``True``. + + Examples: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = decoder_layer(tgt, memory) + + Alternatively, when ``batch_first`` is ``True``: + >>> decoder_layer = nn.TransformerDecoderLayer( + ... d_model=512, nhead=8, batch_first=True + ... ) + >>> memory = torch.rand(32, 10, 512) + >>> tgt = torch.rand(32, 20, 512) + >>> out = decoder_layer(tgt, memory) + """ + + __constants__ = ["norm_first"] + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: str | Callable[[Tensor], Tensor] = F.relu, + layer_norm_eps: float = 1e-5, + batch_first: bool = False, + norm_first: bool = False, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=dropout, + batch_first=batch_first, + bias=bias, + **factory_kwargs, + ) + self.multihead_attn = MultiheadAttention( + d_model, + nhead, + dropout=dropout, + batch_first=batch_first, + bias=bias, + **factory_kwargs, + ) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs) + + self.norm_first = 
norm_first + # pyrefly: ignore [bad-argument-type] + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + # pyrefly: ignore [bad-argument-type] + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + # pyrefly: ignore [bad-argument-type] + self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + self.dropout3 = Dropout(dropout) + + # Legacy string support for activation function. + if isinstance(activation, str): + self.activation = _get_activation_fn(activation) + else: + self.activation = activation + + def __setstate__(self, state): + if "activation" not in state: + state["activation"] = F.relu + super().__setstate__(state) + + def forward( + self, + tgt: Tensor, + memory: Tensor, + tgt_mask: Tensor | None = None, + memory_mask: Tensor | None = None, + tgt_key_padding_mask: Tensor | None = None, + memory_key_padding_mask: Tensor | None = None, + tgt_is_causal: bool = False, + memory_is_causal: bool = False, + ) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer. + + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + tgt_is_causal: If specified, applies a causal mask as ``tgt mask``. + Default: ``False``. + Warning: + ``tgt_is_causal`` provides a hint that ``tgt_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + memory_is_causal: If specified, applies a causal mask as + ``memory mask``. + Default: ``False``. 
+ Warning: + ``memory_is_causal`` provides a hint that + ``memory_mask`` is the causal mask. Providing incorrect + hints can result in incorrect execution, including + forward and backward compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. + """ + # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf + + x = tgt + if self.norm_first: + x = x + self._sa_block( + self.norm1(x), tgt_mask, tgt_key_padding_mask, tgt_is_causal + ) + x = x + self._mha_block( + self.norm2(x), + memory, + memory_mask, + memory_key_padding_mask, + memory_is_causal, + ) + x = x + self._ff_block(self.norm3(x)) + else: + x = self.norm1( + x + self._sa_block(x, tgt_mask, tgt_key_padding_mask, tgt_is_causal) + ) + x = self.norm2( + x + + self._mha_block( + x, memory, memory_mask, memory_key_padding_mask, memory_is_causal + ) + ) + x = self.norm3(x + self._ff_block(x)) + + return x + + # self-attention block + def _sa_block( + self, + x: Tensor, + attn_mask: Tensor | None, + key_padding_mask: Tensor | None, + is_causal: bool = False, + ) -> Tensor: + x = self.self_attn( + x, + x, + x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + need_weights=False, + )[0] + return self.dropout1(x) + + # multihead attention block + def _mha_block( + self, + x: Tensor, + mem: Tensor, + attn_mask: Tensor | None, + key_padding_mask: Tensor | None, + is_causal: bool = False, + ) -> Tensor: + x = self.multihead_attn( + x, + mem, + mem, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + need_weights=False, + )[0] + return self.dropout2(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout3(x) + + +def _get_clones(module, N): + # FIXME: copy.deepcopy() is not defined on nn.module + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]: + 
if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}") + + +def _detect_is_causal_mask( + mask: Tensor | None, + is_causal: bool | None = None, + size: int | None = None, +) -> bool: + """Return whether the given attention mask is causal. + + Warning: + If ``is_causal`` is not ``None``, its value will be returned as is. If a + user supplies an incorrect ``is_causal`` hint, + + ``is_causal=False`` when the mask is in fact a causal attention.mask + may lead to reduced performance relative to what would be achievable + with ``is_causal=True``; + ``is_causal=True`` when the mask is in fact not a causal attention.mask + may lead to incorrect and unpredictable execution - in some scenarios, + a causal mask may be applied based on the hint, in other execution + scenarios the specified mask may be used. The choice may not appear + to be deterministic, in that a number of factors like alignment, + hardware SKU, etc influence the decision whether to use a mask or + rely on the hint. + ``size`` if not None, check whether the mask is a causal mask of the provided size + Otherwise, checks for any causal mask. + """ + # Prevent type refinement + make_causal = is_causal is True + + if is_causal is None and mask is not None: + sz = size if size is not None else mask.size(-2) + causal_comparison = _generate_square_subsequent_mask( + sz, device=mask.device, dtype=mask.dtype + ) + + # Do not use `torch.equal` so we handle batched masks by + # broadcasting the comparison. 
+ if mask.size() == causal_comparison.size(): + make_causal = bool((mask == causal_comparison).all()) + else: + make_causal = False + + return make_causal diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/upsampling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/upsampling.py new file mode 100644 index 0000000000000000000000000000000000000000..29e58bc6a9f3779924584e2934874a1333b3e501 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/upsampling.py @@ -0,0 +1,298 @@ +# mypy: allow-untyped-defs + +import torch.nn.functional as F +from torch import Tensor +from torch.nn.common_types import _ratio_2_t, _ratio_any_t, _size_2_t, _size_any_t + +from .module import Module + + +__all__ = ["Upsample", "UpsamplingNearest2d", "UpsamplingBilinear2d"] + + +class Upsample(Module): + r"""Upsamples a given multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data. + + The input data is assumed to be of the form + `minibatch x channels x [optional depth] x [optional height] x width`. + Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor. + + The algorithms available for upsampling are nearest neighbor and linear, + bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor, + respectively. + + One can either give a :attr:`scale_factor` or the target output :attr:`size` to + calculate the output size. (You cannot give both, as it is ambiguous) + + Args: + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional): + output spatial sizes + scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional): + multiplier for spatial size. Has to match input size if it is a tuple. + mode (str, optional): the upsampling algorithm: one of ``'nearest'``, + ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``. 
+ Default: ``'nearest'`` + align_corners (bool, optional): if ``True``, the corner pixels of the input + and output tensors are aligned, and thus preserving the values at + those pixels. This only has effect when :attr:`mode` is + ``'linear'``, ``'bilinear'``, ``'bicubic'``, or ``'trilinear'``. + Default: ``False`` + recompute_scale_factor (bool, optional): recompute the scale_factor for use in the + interpolation calculation. If `recompute_scale_factor` is ``True``, then + `scale_factor` must be passed in and `scale_factor` is used to compute the + output `size`. The computed output `size` will be used to infer new scales for + the interpolation. Note that when `scale_factor` is floating-point, it may differ + from the recomputed `scale_factor` due to rounding and precision issues. + If `recompute_scale_factor` is ``False``, then `size` or `scale_factor` will + be used directly for interpolation. + + Shape: + - Input: :math:`(N, C, W_{in})`, :math:`(N, C, H_{in}, W_{in})` or :math:`(N, C, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C, W_{out})`, :math:`(N, C, H_{out}, W_{out})` + or :math:`(N, C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor D_{in} \times \text{scale\_factor} \right\rfloor + + .. math:: + H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor + + .. math:: + W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor + + .. warning:: + With ``align_corners = True``, the linearly interpolating modes + (`linear`, `bilinear`, `bicubic`, and `trilinear`) don't proportionally + align the output and input pixels, and thus the output values can depend + on the input size. This was the default behavior for these modes up to + version 0.3.1. Since then, the default behavior is + ``align_corners = False``. See below for concrete examples on how this + affects the outputs. + + .. note:: + If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`. 
+ + Examples:: + + >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2) + >>> input + tensor([[[[1., 2.], + [3., 4.]]]]) + + >>> m = nn.Upsample(scale_factor=2, mode='nearest') + >>> m(input) + tensor([[[[1., 1., 2., 2.], + [1., 1., 2., 2.], + [3., 3., 4., 4.], + [3., 3., 4., 4.]]]]) + + >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles") + >>> m = nn.Upsample(scale_factor=2, mode='bilinear') # align_corners=False + >>> m(input) + tensor([[[[1.0000, 1.2500, 1.7500, 2.0000], + [1.5000, 1.7500, 2.2500, 2.5000], + [2.5000, 2.7500, 3.2500, 3.5000], + [3.0000, 3.2500, 3.7500, 4.0000]]]]) + + >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + >>> m(input) + tensor([[[[1.0000, 1.3333, 1.6667, 2.0000], + [1.6667, 2.0000, 2.3333, 2.6667], + [2.3333, 2.6667, 3.0000, 3.3333], + [3.0000, 3.3333, 3.6667, 4.0000]]]]) + + >>> # Try scaling the same data in a larger tensor + >>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3) + >>> input_3x3[:, :, :2, :2].copy_(input) + tensor([[[[1., 2.], + [3., 4.]]]]) + >>> input_3x3 + tensor([[[[1., 2., 0.], + [3., 4., 0.], + [0., 0., 0.]]]]) + + >>> # xdoctest: +IGNORE_WANT("seems to fail when other tests are run in the same session") + >>> m = nn.Upsample(scale_factor=2, mode='bilinear') # align_corners=False + >>> # Notice that values in top left corner are the same with the small input (except at boundary) + >>> m(input_3x3) + tensor([[[[1.0000, 1.2500, 1.7500, 1.5000, 0.5000, 0.0000], + [1.5000, 1.7500, 2.2500, 1.8750, 0.6250, 0.0000], + [2.5000, 2.7500, 3.2500, 2.6250, 0.8750, 0.0000], + [2.2500, 2.4375, 2.8125, 2.2500, 0.7500, 0.0000], + [0.7500, 0.8125, 0.9375, 0.7500, 0.2500, 0.0000], + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + + >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + >>> # Notice that values in top left corner are now changed + >>> m(input_3x3) + tensor([[[[1.0000, 1.4000, 1.8000, 1.6000, 0.8000, 0.0000], + [1.8000, 2.2000, 
2.6000, 2.2400, 1.1200, 0.0000], + [2.6000, 3.0000, 3.4000, 2.8800, 1.4400, 0.0000], + [2.4000, 2.7200, 3.0400, 2.5600, 1.2800, 0.0000], + [1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000], + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + """ + + __constants__ = [ + "size", + "scale_factor", + "mode", + "align_corners", + "name", + "recompute_scale_factor", + ] + name: str + size: _size_any_t | None + scale_factor: _ratio_any_t | None + mode: str + align_corners: bool | None + recompute_scale_factor: bool | None + + def __init__( + self, + size: _size_any_t | None = None, + scale_factor: _ratio_any_t | None = None, + mode: str = "nearest", + align_corners: bool | None = None, + recompute_scale_factor: bool | None = None, + ) -> None: + super().__init__() + self.name = type(self).__name__ + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + self.recompute_scale_factor = recompute_scale_factor + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.interpolate( + input, + self.size, + self.scale_factor, + self.mode, + self.align_corners, + recompute_scale_factor=self.recompute_scale_factor, + ) + + def __setstate__(self, state): + if "recompute_scale_factor" not in state: + state["recompute_scale_factor"] = True + + super().__setstate__(state) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + if self.scale_factor is not None: + info = "scale_factor=" + repr(self.scale_factor) + else: + info = "size=" + repr(self.size) + info += ", mode=" + repr(self.mode) + return info + + +class UpsamplingNearest2d(Upsample): + r"""Applies a 2D nearest neighbor upsampling to an input signal composed of several input channels. 
+ + To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor` + as it's constructor argument. + + When :attr:`size` is given, it is the output size of the image `(h, w)`. + + Args: + size (int or Tuple[int, int], optional): output spatial sizes + scale_factor (float or Tuple[float, float], optional): multiplier for + spatial size. + + .. warning:: + This class is deprecated in favor of :func:`~nn.functional.interpolate`. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` where + + .. math:: + H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor + + .. math:: + W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor + + Examples:: + + >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2) + >>> input + tensor([[[[1., 2.], + [3., 4.]]]]) + + >>> m = nn.UpsamplingNearest2d(scale_factor=2) + >>> m(input) + tensor([[[[1., 1., 2., 2.], + [1., 1., 2., 2.], + [3., 3., 4., 4.], + [3., 3., 4., 4.]]]]) + """ + + def __init__( + self, + size: _size_2_t | None = None, + scale_factor: _ratio_2_t | None = None, + ) -> None: + super().__init__(size, scale_factor, mode="nearest") + + +class UpsamplingBilinear2d(Upsample): + r"""Applies a 2D bilinear upsampling to an input signal composed of several input channels. + + To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor` + as it's constructor argument. + + When :attr:`size` is given, it is the output size of the image `(h, w)`. + + Args: + size (int or Tuple[int, int], optional): output spatial sizes + scale_factor (float or Tuple[float, float], optional): multiplier for + spatial size. + + .. warning:: + This class is deprecated in favor of :func:`~nn.functional.interpolate`. It is + equivalent to ``nn.functional.interpolate(..., mode='bilinear', align_corners=True)``. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` where + + .. 
math:: + H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor + + .. math:: + W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor + + Examples:: + + >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2) + >>> input + tensor([[[[1., 2.], + [3., 4.]]]]) + + >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?") + >>> m = nn.UpsamplingBilinear2d(scale_factor=2) + >>> m(input) + tensor([[[[1.0000, 1.3333, 1.6667, 2.0000], + [1.6667, 2.0000, 2.3333, 2.6667], + [2.3333, 2.6667, 3.0000, 3.3333], + [3.0000, 3.3333, 3.6667, 4.0000]]]]) + """ + + def __init__( + self, + size: _size_2_t | None = None, + scale_factor: _ratio_2_t | None = None, + ) -> None: + super().__init__(size, scale_factor, mode="bilinear", align_corners=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5dffadefe152d527090aef870f87a7a7565eac25 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/utils.py @@ -0,0 +1,83 @@ +# mypy: allow-untyped-defs +import collections +from itertools import repeat +from typing import Any + + +__all__ = ["consume_prefix_in_state_dict_if_present"] + + +def _ntuple(n, name="parse"): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + parse.__name__ = name + return parse + + +_single = _ntuple(1, "_single") +_pair = _ntuple(2, "_pair") +_triple = _ntuple(3, "_triple") +_quadruple = _ntuple(4, "_quadruple") + + +def _reverse_repeat_tuple(t, n): + r"""Reverse the order of `t` and repeat each element for `n` times. + + This can be used to translate padding arg used by Conv and Pooling modules + to the ones used by `F.pad`. 
+ """ + return tuple(x for x in reversed(t) for _ in range(n)) + + +def _list_with_default(out_size: list[int], defaults: list[int]) -> list[int]: + import torch + + if isinstance(out_size, (int, torch.SymInt)): + # pyrefly: ignore [bad-return] + return out_size + if len(defaults) <= len(out_size): + raise ValueError(f"Input dimension should be at least {len(out_size) + 1}") + return [ + v if v is not None else d + for v, d in zip(out_size, defaults[-len(out_size) :], strict=False) + ] + + +def consume_prefix_in_state_dict_if_present( + state_dict: dict[str, Any], + prefix: str, +) -> None: + r"""Strip the prefix in state_dict in place, if any. + + .. note:: + Given a `state_dict` from a DP/DDP model, a local model can load it by applying + `consume_prefix_in_state_dict_if_present(state_dict, "module.")` before calling + :meth:`torch.nn.Module.load_state_dict`. + + Args: + state_dict (OrderedDict): a state-dict to be loaded to the model. + prefix (str): prefix. + """ + keys = list(state_dict.keys()) + for key in keys: + if key.startswith(prefix): + newkey = key[len(prefix) :] + state_dict[newkey] = state_dict.pop(key) + + # also strip the prefix in metadata if any. + if hasattr(state_dict, "_metadata"): + keys = list(state_dict._metadata.keys()) + for key in keys: + # for the metadata dict, the key can be: + # '': for the DDP module, which we want to remove. + # 'module': for the actual model. + # 'module.xx.xx': for the rest. + if len(key) == 0: + continue + # handling both, 'module' case and 'module.' 
cases + if key == prefix.replace(".", "") or key.startswith(prefix): + newkey = key[len(prefix) :] + state_dict._metadata[newkey] = state_dict._metadata.pop(key) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8648d10aadc8dec59ea7ebc54aa77cd60ee4f5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/__init__.py @@ -0,0 +1,27 @@ +from typing_extensions import deprecated + +from torch.nn.parallel.data_parallel import data_parallel, DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel +from torch.nn.parallel.parallel_apply import parallel_apply +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import gather, scatter + + +__all__ = [ + "replicate", + "scatter", + "parallel_apply", + "gather", + "data_parallel", + "DataParallel", + "DistributedDataParallel", +] + + +@deprecated( + "`torch.nn.parallel.DistributedDataParallelCPU` is deprecated, " + "please use `torch.nn.parallel.DistributedDataParallel` instead.", + category=FutureWarning, +) +class DistributedDataParallelCPU(DistributedDataParallel): + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..70a2eace9eff15b06df7958588afd8e1580bb8a7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py @@ -0,0 +1,131 @@ +import warnings +from itertools import chain + +import torch +from torch._utils import _get_device_index +from torch.autograd import Function +from torch.nn.parallel import comm + + +class Broadcast(Function): + 
@staticmethod + def forward(ctx, target_gpus, *inputs): + assert all(i.device.type != "cpu" for i in inputs), ( + "Broadcast function not implemented for CPU tensors" + ) + target_gpus = [_get_device_index(x, True) for x in target_gpus] + ctx.target_gpus = target_gpus + if len(inputs) == 0: + return () + ctx.num_inputs = len(inputs) + ctx.input_device = inputs[0].get_device() + outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus) + non_differentiables = [] + for idx, input_requires_grad in enumerate(ctx.needs_input_grad[1:]): + if not input_requires_grad: + non_differentiables.extend(output[idx] for output in outputs) + ctx.mark_non_differentiable(*non_differentiables) + return tuple(chain.from_iterable(outputs)) + + @staticmethod + def backward(ctx, *grad_outputs): + return (None,) + ReduceAddCoalesced.apply( + ctx.input_device, ctx.num_inputs, *grad_outputs + ) + + +class ReduceAddCoalesced(Function): + @staticmethod + def forward(ctx, destination, num_inputs, *grads): + ctx.target_gpus = [ + grads[i].get_device() for i in range(0, len(grads), num_inputs) + ] + + grads_ = [grads[i : i + num_inputs] for i in range(0, len(grads), num_inputs)] + return comm.reduce_add_coalesced(grads_, destination) + + @staticmethod + def backward(ctx, *grad_outputs): + return ( + None, + None, + ) + Broadcast.apply(ctx.target_gpus, *grad_outputs) + + +class Gather(Function): + @staticmethod + def forward(ctx, target_device, dim, *inputs): + assert all(i.device.type != "cpu" for i in inputs), ( + "Gather function not implemented for CPU tensors" + ) + if target_device == "cpu": + ctx.target_device = "cpu" + else: + target_device = _get_device_index(target_device, True) + ctx.target_device = target_device + ctx.dim = dim + ctx.input_gpus = tuple(i.get_device() for i in inputs) + if all(t.dim() == 0 for t in inputs) and dim == 0: + inputs = tuple(t.view(1) for t in inputs) + warnings.warn( + "Was asked to gather along dimension 0, but all " + "input tensors were scalars; will 
instead unsqueeze " + "and return a vector.", + stacklevel=2, + ) + ctx.unsqueezed_scalar = True + else: + ctx.unsqueezed_scalar = False + ctx.input_sizes = tuple(i.size(ctx.dim) for i in inputs) + return comm.gather(inputs, ctx.dim, ctx.target_device) + + @staticmethod + def backward(ctx, grad_output): + scattered_grads = Scatter.apply( + ctx.input_gpus, ctx.input_sizes, ctx.dim, grad_output + ) + if ctx.unsqueezed_scalar: + scattered_grads = tuple(g[0] for g in scattered_grads) + return (None, None) + scattered_grads + + +class Scatter(Function): + @staticmethod + def forward(ctx, target_gpus, chunk_sizes, dim, input): + target_gpus = [_get_device_index(x, True) for x in target_gpus] + ctx.dim = dim + ctx.input_device = input.get_device() if input.device.type != "cpu" else -1 + streams = None + if torch.accelerator.is_available() and ctx.input_device == -1: + # Perform CPU to GPU copies in a background stream + streams = [_get_stream(torch.device(device)) for device in target_gpus] + outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) + # Synchronize with the copy stream + if streams is not None: + for i, output in enumerate(outputs): + with torch.accelerator.device_index(target_gpus[i]): + main_stream = torch.accelerator.current_stream() + main_stream.wait_stream(streams[i]) + output.record_stream(main_stream) + return outputs + + @staticmethod + def backward(ctx, *grad_output): + return None, None, None, Gather.apply(ctx.input_device, ctx.dim, *grad_output) + + +# background streams used for copying +_streams: list[torch.Stream | None] | None = None + + +def _get_stream(device: torch.device): + """Get a background stream for copying between CPU and target device.""" + global _streams + if device.type == "cpu" or not torch.accelerator.is_available(): + return None + assert torch.accelerator.current_accelerator().type == device.type + if _streams is None: + _streams = [None] * torch.accelerator.device_count() + if _streams[device.index] is 
None: + _streams[device.index] = torch.Stream(device.index) + return _streams[device.index] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/comm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..255c0c4b332712a714610801f11c8e2b33df3671 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/comm.py @@ -0,0 +1,261 @@ +# mypy: allow-untyped-defs +import warnings + +import torch +from torch._utils import ( + _flatten_dense_tensors, + _get_device_index, + _handle_complex, + _reorder_tensors_as, + _take_tensors, + _unflatten_dense_tensors, +) +from torch.cuda import nccl + + +def broadcast(tensor, devices=None, *, out=None): + r"""Broadcasts a tensor to specified GPU devices. + + Args: + tensor (Tensor): tensor to broadcast. Can be on CPU or GPU. + devices (Iterable[torch.device, str or int], optional): an iterable of + GPU devices, among which to broadcast. + out (Sequence[Tensor], optional, keyword-only): the GPU tensors to + store output results. + + .. note:: + Exactly one of :attr:`devices` and :attr:`out` must be specified. + + Returns: + - If :attr:`devices` is specified, + a tuple containing copies of :attr:`tensor`, placed on + :attr:`devices`. + - If :attr:`out` is specified, + a tuple containing :attr:`out` tensors, each containing a copy of + :attr:`tensor`. 
+ """ + tensor = _handle_complex(tensor) + if not ((devices is None) ^ (out is None)): + raise RuntimeError( + f"Exactly one of 'devices' and 'out' must be specified, but got devices={devices} and out={out}" + ) + if devices is not None: + devices = [_get_device_index(d) for d in devices] + return torch._C._broadcast(tensor, devices) + else: + # pyrefly: ignore [bad-argument-type] + return torch._C._broadcast_out(tensor, out) + + +def broadcast_coalesced(tensors, devices, buffer_size=10485760): + """Broadcast a sequence of tensors to the specified GPUs. + + Small tensors are first coalesced into a buffer to reduce the number of synchronizations. + + Args: + tensors (sequence): tensors to broadcast. Must be on the same device, + either CPU or GPU. + devices (Iterable[torch.device, str or int]): an iterable of GPU + devices, among which to broadcast. + buffer_size (int): maximum size of the buffer used for coalescing + + Returns: + A tuple containing copies of :attr:`tensor`, placed on :attr:`devices`. + """ + devices = [_get_device_index(d) for d in devices] + tensors = [_handle_complex(t) for t in tensors] + return torch._C._broadcast_coalesced(tensors, devices, buffer_size) + + +def reduce_add(inputs, destination=None): + """Sum tensors from multiple GPUs. + + All inputs should have matching shapes, dtype, and layout. The output tensor + will be of the same shape, dtype, and layout. + + Args: + inputs (Iterable[Tensor]): an iterable of tensors to add. + destination (int, optional): a device on which the output will be + placed (default: current device). + + Returns: + A tensor containing an elementwise sum of all inputs, placed on the + :attr:`destination` device. 
+ """ + destination = _get_device_index(destination, optional=True) + input_size = inputs[0].size() + root_index = None # index of input tensor that already is on the correct device + for i, inp in enumerate(inputs): + assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs" + if inp.get_device() == destination: + root_index = i + if inp.size() != input_size: + got = "x".join(str(x) for x in inp.size()) + expected = "x".join(str(x) for x in input_size) + raise ValueError( + f"input {i} has invalid size: got {got}, but expected {expected}" + ) + if root_index is None: + raise RuntimeError( + "reduce_add expects destination to be on the same GPU with one of the tensors" + ) + + if len(inputs) == 1: + return inputs[0] + + if nccl.is_available(inputs): + result = torch.empty_like(inputs[root_index]) + nccl.reduce(inputs, output=result, root=root_index) + else: + destination_device = torch.device(inputs[root_index].device.type, destination) + nonroot = [t for i, t in enumerate(inputs) if i != root_index] + # make a new tensor w/o clone + result = inputs[root_index] + nonroot[0].to( + device=destination_device, non_blocking=True + ) + for other in nonroot[1:]: + result.add_(other.to(device=destination_device, non_blocking=True)) + return result + + +def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760): + """Sum tensors from multiple GPUs. + + Small tensors are first coalesced into a buffer to reduce the number + of synchronizations. + + Args: + inputs (Iterable[Iterable[Tensor]]): iterable of iterables that + contain tensors from a single device. + destination (int, optional): a device on which the output will be + placed (default: current device). + buffer_size (int): maximum size of the buffer used for coalescing + + Returns: + A tuple of tensors containing an elementwise sum of each group of + inputs, placed on the ``destination`` device. 
+ """ + # TODO: When `len(inputs) == 1` and all inputs are on `destination`, just + # return `inputs`. + dense_tensors: list[list] = [[] for _ in inputs] # shape (num_gpus, num_tensors) + output = [] + ref_order = [] + # process sparse ones first since they may have different sizes on different gpus + for tensor_at_gpus in zip(*inputs, strict=True): + if all(t.is_sparse for t in tensor_at_gpus): + result = reduce_add(tensor_at_gpus, destination) # this will be sparse too + output.append(result) + ref_order.append(tensor_at_gpus[0]) + else: + for coll, t in zip(dense_tensors, tensor_at_gpus, strict=True): + coll.append(t.to_dense() if t.is_sparse else t) + ref_order.append(dense_tensors[0][-1]) + itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors] + # now the dense ones, which have consistent sizes + for chunks in zip(*itrs, strict=True): + flat_tensors = [ + _flatten_dense_tensors(chunk) for chunk in chunks + ] # (num_gpus,) + flat_result = reduce_add(flat_tensors, destination) + for t in _unflatten_dense_tensors(flat_result, chunks[0]): + # The unflattened tensors do not share storage, and we don't expose + # base flat tensor anyways, so give them different version counters. + # See NOTE [ Version Counter in comm.*_coalesced ] + output.append(t.data) + return tuple(_reorder_tensors_as(output, ref_order)) + + +def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None): + """Scatters tensor across multiple GPUs. + + Args: + tensor (Tensor): tensor to scatter. Can be on CPU or GPU. + devices (Iterable[torch.device, str or int], optional): an iterable of + GPU devices, among which to scatter. + chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on + each device. It should match :attr:`devices` in length and sums to + ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided + into equal chunks. + dim (int, optional): A dimension along which to chunk :attr:`tensor`. + Default: ``0``. 
+ streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among + which to execute the scatter. If not specified, the default stream will + be utilized. + out (Sequence[Tensor], optional, keyword-only): the GPU tensors to + store output results. Sizes of these tensors must match that of + :attr:`tensor`, except for :attr:`dim`, where the total size must + sum to ``tensor.size(dim)``. + + .. note:: + Exactly one of :attr:`devices` and :attr:`out` must be specified. When + :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and + will be inferred from sizes of :attr:`out`. + + Returns: + - If :attr:`devices` is specified, + a tuple containing chunks of :attr:`tensor`, placed on + :attr:`devices`. + - If :attr:`out` is specified, + a tuple containing :attr:`out` tensors, each containing a chunk of + :attr:`tensor`. + """ + tensor = _handle_complex(tensor) + if out is None: + # pyrefly: ignore [not-iterable] + devices = [_get_device_index(d) for d in devices] + return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams)) + else: + if devices is not None: + raise RuntimeError( + f"'devices' must not be specified when 'out' is specified, but got devices={devices}" + ) + if chunk_sizes is not None: + raise RuntimeError( + f"'chunk_sizes' must not be specified when 'out' is specified, but got chunk_sizes={chunk_sizes}" + ) + return tuple(torch._C._scatter_out(tensor, out, dim, streams)) + + +def gather(tensors, dim=0, destination=None, *, out=None): + r"""Gathers tensors from multiple GPU devices. + + Args: + tensors (Iterable[Tensor]): an iterable of tensors to gather. + Tensor sizes in all dimensions other than :attr:`dim` have to match. + dim (int, optional): a dimension along which the tensors will be + concatenated. Default: ``0``. + destination (torch.device, str, or int, optional): the output device. + Can be CPU or CUDA. Default: the current CUDA device. 
+ out (Tensor, optional, keyword-only): the tensor to store gather result. + Its sizes must match those of :attr:`tensors`, except for :attr:`dim`, + where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``. + Can be on CPU or CUDA. + + .. note:: + :attr:`destination` must not be specified when :attr:`out` is specified. + + Returns: + - If :attr:`destination` is specified, + a tensor located on :attr:`destination` device, that is a result of + concatenating :attr:`tensors` along :attr:`dim`. + - If :attr:`out` is specified, + the :attr:`out` tensor, now containing results of concatenating + :attr:`tensors` along :attr:`dim`. + """ + tensors = [_handle_complex(t) for t in tensors] + if out is None: + if destination == -1: + warnings.warn( + "Using -1 to represent CPU tensor is deprecated. Please use a " + 'device object or string instead, e.g., "cpu".', + FutureWarning, + stacklevel=2, + ) + destination = _get_device_index(destination, allow_cpu=True, optional=True) + return torch._C._gather(tensors, dim, destination) + else: + if destination is not None: + raise RuntimeError( + f"'destination' must not be specified when 'out' is specified, but got destination={destination}" + ) + return torch._C._gather_out(tensors, out, dim) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/data_parallel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2319439f092bed9a4277838dcb3b794de64b97 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/data_parallel.py @@ -0,0 +1,289 @@ +# mypy: allow-untyped-defs +import operator +import warnings +from collections.abc import Sequence +from itertools import chain +from typing import Any, Generic, TypeVar + +import torch +from torch._utils import ( + _get_all_device_indices, + _get_available_device_type, + 
_get_device_index, + _get_devices_properties, +) +from torch.nn.modules import Module +from torch.nn.parallel.parallel_apply import parallel_apply +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import gather, scatter_kwargs + + +__all__ = ["DataParallel", "data_parallel"] + + +def _check_balance(device_ids: Sequence[int | torch.device]) -> None: + imbalance_warn = """ + There is an imbalance between your GPUs. You may want to exclude GPU {} which + has less than 75% of the memory or cores of GPU {}. You can do so by setting + the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES + environment variable.""" + device_ids = [_get_device_index(x, True) for x in device_ids] + dev_props = _get_devices_properties(device_ids) + + def warn_imbalance(get_prop) -> bool: + values = [get_prop(props) for props in dev_props] + min_pos, min_val = min(enumerate(values), key=operator.itemgetter(1)) + max_pos, max_val = max(enumerate(values), key=operator.itemgetter(1)) + if min_val / max_val < 0.75: + warnings.warn( + imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]), + stacklevel=2, + ) + return True + return False + + if warn_imbalance(lambda props: props.total_memory): + return + if warn_imbalance(lambda props: props.multi_processor_count): + return + + +T = TypeVar("T", bound=Module) + + +class DataParallel(Module, Generic[T]): + r"""Implements data parallelism at the module level. + + This container parallelizes the application of the given :attr:`module` by + splitting the input across the specified devices by chunking in the batch + dimension (other objects will be copied once per device). In the forward + pass, the module is replicated on each device, and each replica handles a + portion of the input. During the backwards pass, gradients from each replica + are summed into the original module. + + The batch size should be larger than the number of GPUs used. + + .. 
warning:: + It is recommended to use :class:`~torch.nn.parallel.DistributedDataParallel`, + instead of this class, to do multi-GPU training, even if there is only a single + node. See: :ref:`cuda-nn-ddp-instead` and :ref:`ddp`. + + Arbitrary positional and keyword inputs are allowed to be passed into + DataParallel but some types are specially handled. tensors will be + **scattered** on dim specified (default 0). tuple, list and dict types will + be shallow copied. The other types will be shared among different threads + and can be corrupted if written to in the model's forward pass. + + The parallelized :attr:`module` must have its parameters and buffers on + ``device_ids[0]`` before running this :class:`~torch.nn.DataParallel` + module. + + .. warning:: + In each forward, :attr:`module` is **replicated** on each device, so any + updates to the running module in ``forward`` will be lost. For example, + if :attr:`module` has a counter attribute that is incremented in each + ``forward``, it will always stay at the initial value because the update + is done on the replicas which are destroyed after ``forward``. However, + :class:`~torch.nn.DataParallel` guarantees that the replica on + ``device[0]`` will have its parameters and buffers sharing storage with + the base parallelized :attr:`module`. So **in-place** updates to the + parameters or buffers on ``device[0]`` will be recorded. E.g., + :class:`~torch.nn.BatchNorm2d` and :func:`~torch.nn.utils.spectral_norm` + rely on this behavior to update the buffers. + + .. warning:: + Forward and backward hooks defined on :attr:`module` and its submodules + will be invoked ``len(device_ids)`` times, each with inputs located on + a particular device. Particularly, the hooks are only guaranteed to be + executed in correct order with respect to operations on corresponding + devices. 
For example, it is not guaranteed that hooks set via + :meth:`~torch.nn.Module.register_forward_pre_hook` be executed before + `all` ``len(device_ids)`` :meth:`~torch.nn.Module.forward` calls, but + that each such hook be executed before the corresponding + :meth:`~torch.nn.Module.forward` call of that device. + + .. warning:: + When :attr:`module` returns a scalar (i.e., 0-dimensional tensor) in + :func:`forward`, this wrapper will return a vector of length equal to + number of devices used in data parallelism, containing the result from + each device. + + .. note:: + There is a subtlety in using the + ``pack sequence -> recurrent network -> unpack sequence`` pattern in a + :class:`~torch.nn.Module` wrapped in :class:`~torch.nn.DataParallel`. + See :ref:`pack-rnn-unpack-with-data-parallelism` section in FAQ for + details. + + + Args: + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices (default: all devices) + output_device (int or torch.device): device location of output (default: device_ids[0]) + + Attributes: + module (Module): the module to be parallelized + + Example:: + + >>> # xdoctest: +SKIP + >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) + >>> output = net(input_var) # input_var can be on any device, including CPU + """ + + # TODO: update notes/cuda.rst when this class handles 8+ GPUs well + + def __init__( + self, + module: T, + device_ids: Sequence[int | torch.device] | None = None, + output_device: int | torch.device | None = None, + dim: int = 0, + ) -> None: + super().__init__() + torch._C._log_api_usage_once("torch.nn.parallel.DataParallel") + device_type = _get_available_device_type() + if device_type is None or device_type == "mps": + self.module = module + self.device_ids = [] + return + + if device_ids is None: + device_ids = _get_all_device_indices() + + if device_ids is None: + raise RuntimeError("no available devices were found") + + if output_device is None: + output_device = 
device_ids[0] + + self.dim = dim + self.module = module + self.device_ids = [_get_device_index(x, True) for x in device_ids] + self.output_device = _get_device_index(output_device, True) + # pyrefly: ignore [read-only] + self.src_device_obj = torch.device(device_type, self.device_ids[0]) + + if device_type == "cuda": + _check_balance(self.device_ids) + + if len(self.device_ids) == 1: + self.module.to(self.src_device_obj) + + def forward(self, *inputs: Any, **kwargs: Any) -> Any: + with torch.autograd.profiler.record_function("DataParallel.forward"): + if not self.device_ids: + return self.module(*inputs, **kwargs) + + # pyrefly: ignore [bad-argument-type] + for t in chain(self.module.parameters(), self.module.buffers()): + if t.device != self.src_device_obj: + raise RuntimeError( + "module must have its parameters and buffers " + f"on device {self.src_device_obj} (device_ids[0]) but found one of " + f"them on device: {t.device}" + ) + + inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids) + # for forward function without any inputs, empty list and dict will be created + # so the module can be executed on one device which is the first one in device_ids + if not inputs and not module_kwargs: + inputs = ((),) + module_kwargs = ({},) + + if len(self.device_ids) == 1: + return self.module(*inputs[0], **module_kwargs[0]) + replicas = self.replicate(self.module, self.device_ids[: len(inputs)]) + outputs = self.parallel_apply(replicas, inputs, module_kwargs) + return self.gather(outputs, self.output_device) + + def replicate(self, module: T, device_ids: Sequence[int | torch.device]) -> list[T]: + return replicate(module, device_ids, not torch.is_grad_enabled()) + + def scatter( + self, + inputs: tuple[Any, ...], + kwargs: dict[str, Any] | None, + device_ids: Sequence[int | torch.device], + ) -> Any: + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def parallel_apply( + self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any + ) 
-> list[Any]: + return parallel_apply( + replicas, inputs, kwargs, self.device_ids[: len(replicas)] + ) + + def gather(self, outputs: Any, output_device: int | torch.device) -> Any: + return gather(outputs, output_device, dim=self.dim) + + +def data_parallel( + module: Module, + inputs: Any, + device_ids: Sequence[int | torch.device] | None = None, + output_device: int | torch.device | None = None, + dim: int = 0, + module_kwargs: Any | None = None, +) -> torch.Tensor: + r"""Evaluate module(input) in parallel across the GPUs given in device_ids. + + This is the functional version of the DataParallel module. + + Args: + module (Module): the module to evaluate in parallel + inputs (Tensor): inputs to the module + device_ids (list of int or torch.device): GPU ids on which to replicate module + output_device (list of int or torch.device): GPU location of the output Use -1 to indicate the CPU. + (default: device_ids[0]) + Returns: + a Tensor containing the result of module(input) located on + output_device + """ + if not isinstance(inputs, tuple): + inputs = (inputs,) if inputs is not None else () + + device_type = _get_available_device_type() + + if device_type is None: + raise RuntimeError("device type could not be determined") + + if device_ids is None: + device_ids = _get_all_device_indices() + + if device_ids is None: + raise RuntimeError("no available devices were found") + + if output_device is None: + output_device = device_ids[0] + + device_ids = [_get_device_index(x, True) for x in device_ids] + output_device = _get_device_index(output_device, True) + # pyrefly: ignore [no-matching-overload] + src_device_obj = torch.device(device_type, device_ids[0]) + + # pyrefly: ignore [bad-argument-type] + for t in chain(module.parameters(), module.buffers()): + if t.device != src_device_obj: + raise RuntimeError( + "module must have its parameters and buffers " + f"on device {src_device_obj} (device_ids[0]) but found one of " + f"them on device: {t.device}" + ) + + 
inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) + # for module without any inputs, empty list and dict will be created + # so the module can be executed on one device which is the first one in device_ids + if not inputs and not module_kwargs: + inputs = ((),) + module_kwargs = ({},) + + assert module_kwargs is not None + + if len(device_ids) == 1: + return module(*inputs[0], **module_kwargs[0]) + used_device_ids = device_ids[: len(inputs)] + replicas = replicate(module, used_device_ids) + outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) + return gather(outputs, output_device, dim) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..4899d123e80a124f31e45ed832bba195af32c353 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py @@ -0,0 +1,2434 @@ +# mypy: allow-untyped-defs +import copy +import functools +import inspect +import itertools +import logging +import os +import sys +import warnings +import weakref +from collections import defaultdict, deque +from collections.abc import Callable +from contextlib import contextmanager +from dataclasses import dataclass, fields, is_dataclass +from enum import auto, Enum +from typing import Any, Optional, TYPE_CHECKING + +import torch +import torch.distributed as dist +from torch._utils import _get_device_index +from torch.autograd import Function, Variable +from torch.distributed.algorithms.join import Join, Joinable, JoinHook +from torch.nn.modules import Module +from torch.nn.parallel.scatter_gather import gather, scatter_kwargs +from torch.utils._pytree import tree_flatten, tree_unflatten + + +RPC_AVAILABLE = False +if dist.is_available(): + from torch.distributed.distributed_c10d import ( + 
_get_default_group, + _rank_not_in_group, + ReduceOp, + ) + from torch.distributed.utils import ( + _alloc_storage, + _cast_forward_inputs, + _free_storage, + _sync_module_states, + _to_kwargs, + _verify_param_shape_across_processes, + ) +if dist.rpc.is_available(): + RPC_AVAILABLE = True + from torch.distributed.rpc import RRef + +if TYPE_CHECKING: + from torch.utils.hooks import RemovableHandle + + +__all__ = ["DistributedDataParallel"] + +logger = logging.getLogger(__name__) + + +@dataclass +class _MixedPrecision: + """ + This configures DDP-native mixed precision training. + + Attributes: + param_dtype (torch.dtype): This specifies the dtype for model + parameters, inputs (when ``cast_forward_inputs`` is set to + ``True``), and therefore the dtype for computation. + However, outside the forward and backward passes, parameters are in + full precision. Model checkpointing always happens in full + precision. + reduce_dtype (torch.dtype): This specifies the dtype for gradient + reduction, which is permitted to differ from ``param_dtype``. + buffer_dtype (torch.dtype): This specifies the dtype for buffers. + + .. note:: This API is experimental and subject to change. + + .. note:: Only floating point tensors are cast to their specified dtypes. + + .. note:: ``state_dict`` checkpoints parameters and buffers in full + precision. + + .. note:: Each low precision dtype must be specified explicitly. For + example, ``_MixedPrecision(reduce_dtype=torch.float16)`` only specifies + the reduction dtype to be low precision, and DDP will not cast + parameters or buffers. + + .. note:: If a ``reduce_dtype`` is not specified, then gradient reduction + happens in ``param_dtype`` if specified or the original parameter dtype + otherwise. For example, ``_MixedPrecision(param_dtype=torch.float16)`` + would result in communication occurring in fp16. 
+ """ + + param_dtype: torch.dtype | None = None + reduce_dtype: torch.dtype | None = None + buffer_dtype: torch.dtype | None = None + # TODO (rohan-varma): keep_low_precision_grads: bool = False + # TODO (rohan-varma): APIs to allow users to run batchnorm and layernorm + # in full precision. For DDP, this can be implemented by not performing the + # parameter cast for BN and LN units. + + +def _cast_buffers(mixed_precision_config, root_module): + """Casts buffers to the given ``buffer_dtype``.""" + for buf in root_module.buffers(): + if hasattr(buf, "_ddp_ignored") and buf._ddp_ignored: + continue + + buf.data = buf.to(dtype=mixed_precision_config.buffer_dtype) + + +def _setup_mixed_precision_params(mixed_precision_config, root_module): + """Create and free storage for the mixed precision parameters.""" + for param in root_module.parameters(): + # Do not setup mixed precision for DDP ignored parameters. + if hasattr(param, "_ddp_ignored") and param._ddp_ignored: + continue + + if not hasattr(param, "_mp_param"): + param._mp_param = torch.zeros_like( + param, + device=param.device, + dtype=mixed_precision_config.param_dtype, + requires_grad=param.requires_grad, + ) + _free_storage(param._mp_param) + # _fp_param will point to the full precision param so it can be switched + # back to at the end of forward / backward. + param._fp_param = param.data + + +def _tree_flatten_with_rref(output): + output_is_rref = RPC_AVAILABLE and isinstance(output, RRef) + if output_is_rref: + output_tensor_list, treespec = tree_flatten(output.local_value()) + else: + output_tensor_list, treespec = tree_flatten(output) + # Need to return flattened tensors, spec to re-pack them, as well + # as if the return type was actually an RRef to reconstruct. 
+ return output_tensor_list, treespec, output_is_rref + + +def _tree_unflatten_with_rref(output, treespec, output_is_rref): + output = tree_unflatten(output, treespec) + if output_is_rref: + output = RRef(output) + return output + + +def _find_tensors(obj): + r"""Recursively find all tensors contained in the specified object.""" + if RPC_AVAILABLE and isinstance(obj, RRef): + # If the current node is the owner of the RRef, unwrap it and try to + # find Tensors. + # TODO: Expand to remote RRefs. + if obj.is_owner(): + return _find_tensors(obj.local_value()) + if isinstance(obj, torch.Tensor): + return [obj] + if isinstance(obj, (list, tuple)): + return itertools.chain.from_iterable(map(_find_tensors, obj)) + if isinstance(obj, dict): + return itertools.chain.from_iterable(map(_find_tensors, obj.values())) + if is_dataclass(obj): + return itertools.chain.from_iterable( + map(_find_tensors, (getattr(obj, f.name) for f in fields(obj))) + ) + + return [] + + +def _dump_DDP_relevant_env_vars(): + relevant_env_vars = [ + "RANK", + "LOCAL_RANK", + "WORLD_SIZE", + "MASTER_PORT", + "MASTER_ADDR", + "CUDA_VISIBLE_DEVICES", + "GLOO_SOCKET_IFNAME", + "GLOO_DEVICE_TRANSPORT", + "NCCL_SOCKET_IFNAME", + "TORCH_NCCL_BLOCKING_WAIT", + "NCCL_DEBUG", + "NCCL_DEBUG_SUBSYS", + "NCCL_IB_DISABLE", + # More NCCL env vars: + "NCCL_P2P_DISABLE", + "NCCL_P2P_LEVEL", + "NCCL_SHM_DISABLE", + "NCCL_SOCKET_NTHREADS", + "NCCL_NSOCKS_PERTHREAD", + "NCCL_BUFFSIZE", + "NCCL_NTHREADS", + "NCCL_RINGS", + "NCCL_MAX_NCHANNELS", + "NCCL_MIN_NCHANNELS", + "NCCL_CHECKS_DISABLE", + "NCCL_CHECK_POINTERS", + "NCCL_LAUNCH_MODE", + "NCCL_IB_HCA", + "NCCL_IB_TIMEOUT", + "NCCL_IB_RETRY_CNT", + "NCCL_IB_GID_INDEX", + "NCCL_IB_SL", + "NCCL_IB_TC", + "NCCL_IB_AR_THRESHOLD", + "NCCL_IB_CUDA_SUPPORT", + "NCCL_NET_GDR_LEVEL", + "NCCL_NET_GDR_READ", + "NCCL_SINGLE_RING_THRESHOLD", + "NCCL_LL_THRESHOLD", + "NCCL_TREE_THRESHOLD", + "NCCL_ALGO", + "NCCL_PROTO", + "NCCL_IGNORE_CPU_AFFINITY", + "NCCL_DEBUG_FILE", + 
"NCCL_COLLNET_ENABLE", + "NCCL_TOPO_FILE", + "NCCL_TOPO_DUMP_FILE", + "TORCH_NCCL_ASYNC_ERROR_HANDLING", + ] + formatted_output = "" + for var in relevant_env_vars: + value = os.environ.get(var, "N/A") + formatted_output += f"env:{var}={value}\n" + print(formatted_output) + + +class _BufferCommHookLocation(Enum): + PRE_FORWARD = auto() + POST_FORWARD = auto() + + +@dataclass +class _BufferCommHook: + buffer_comm_hook: Callable + buffer_comm_hook_state: Any + buffer_comm_hook_location: _BufferCommHookLocation + + +# Add a DDPSink to run various functions when backwards starts, such as +# queueing call back of out-most backward/graph task, +# this helps call back is fired after all gradients' calculation +# is completed. +class _DDPSink(Function): + @staticmethod + # pyrefly: ignore [bad-override] + def forward(ctx, ddp_weakref, *inputs): + # set_materialize_grads(False) will ensure that None gradients stay as + # None and are not filled with zeros. + ctx.set_materialize_grads(False) + ctx.ddp_weakref = ddp_weakref + ret = inputs + if ddp_weakref()._ddp_sink_clone: + ret = tuple( + inp.clone() if isinstance(inp, torch.Tensor) else inp for inp in inputs + ) + return ret + + @staticmethod + def backward(ctx, *grad_outputs): + # Enqueue delay allreduce for static graph training on the first + # iteration. 
+ ddp_weakref = ctx.ddp_weakref() + reducer = ddp_weakref.reducer + static_graph = ddp_weakref.static_graph + delay_ar_enqueued = ( + static_graph and ddp_weakref._static_graph_delay_allreduce_enqueued + ) + if static_graph and not delay_ar_enqueued: + Variable._execution_engine.queue_callback( # type: ignore[call-arg,misc] + reducer._delay_all_reduce + ) + ddp_weakref._static_graph_delay_allreduce_enqueued = True + + return (None, *grad_outputs) + + +class _DDPJoinHook(JoinHook): + def __init__(self, ddp, divide_by_initial_world_size): + """Set config variables for internal usage.""" + assert isinstance(ddp, DistributedDataParallel), ( + "DDP join hook requires passing in a DistributedDataParallel " + "instance as the state" + ) + assert ddp.logger is not None + ddp.logger._set_uneven_input_join() + self.ddp = ddp + self.ddp._divide_by_initial_world_size = divide_by_initial_world_size + super().__init__() + + def main_hook(self): + """Shadow the DDP collective communication operations in the forward and backward passes.""" + ddp = self.ddp + # Buckets are rebuilt only once during a training period + ddp.reducer._rebuild_buckets() + + # Schedule a broadcast if we are syncing module buffers in the + # forward pass + # TODO: make DDP uneven inputs context manager support buffer + # comm hook (https://github.com/pytorch/pytorch/issues/65436) + ddp._check_and_sync_module_buffers() + + # Check if need to sync in the backward pass + should_sync_backwards = ddp._check_global_requires_backward_grad_sync( + is_joined_rank=True + ) + # Forward parameter sync is disabled in the next iteration if we + # are skipping gradient sync this iteration, so set + # `require_forward_param_sync` accordingly + ddp.require_forward_param_sync = should_sync_backwards + if not should_sync_backwards: + return + + # Schedule one allreduce per gradient bucket to match the backward + # pass allreduce + ddp._match_all_reduce_for_bwd_pass() + + # Check if we need to allreduce locally unused 
parameters + if ddp.find_unused_parameters: + ddp._match_unused_params_allreduce() + + # Rebuilt parameters are pushed only once during a training period + ddp.reducer._push_all_rebuilt_params() + + def post_hook(self, is_last_joiner: bool): + """Sync the final model to ensure that the model is the same across all processes.""" + self.ddp._sync_final_model(is_last_joiner) + + +class DistributedDataParallel(Module, Joinable): + r"""Implement distributed data parallelism based on ``torch.distributed`` at module level. + + This container provides data parallelism by synchronizing gradients + across each model replica. The devices to synchronize across are + specified by the input ``process_group``, which is the entire world + by default. Note that ``DistributedDataParallel`` does not chunk or + otherwise shard the input across participating GPUs; the user is + responsible for defining how to do so, for example through the use + of a :class:`DistributedSampler`. + + See also: :ref:`distributed-basics` and :ref:`cuda-nn-ddp-instead`. + The same constraints on input as in :class:`torch.nn.DataParallel` apply. + + Creation of this class requires that ``torch.distributed`` to be already + initialized, by calling :func:`torch.distributed.init_process_group`. + + ``DistributedDataParallel`` is proven to be significantly faster than + :class:`torch.nn.DataParallel` for single-node multi-GPU data + parallel training. + + To use ``DistributedDataParallel`` on a host with N GPUs, you should spawn + up ``N`` processes, ensuring that each process exclusively works on a single + GPU from 0 to N-1. This can be done by either setting + ``CUDA_VISIBLE_DEVICES`` for every process or by calling the following API for GPUs, + + >>> # xdoctest: +SKIP("undefined variables") + >>> torch.cuda.set_device(i) + + or calling the unified API for :ref:`accelerator`, + + >>> # xdoctest: +SKIP("undefined variables") + >>> torch.accelerator.set_device_index(i) + + where i is from 0 to N-1. 
In each process, you should refer the following + to construct this module: + + >>> # xdoctest: +SKIP("undefined variables") + >>> if torch.accelerator.is_available(): + >>> device_type = torch.accelerator.current_accelerator().type + >>> vendor_backend = torch.distributed.get_default_backend_for_device(device_type) + >>> + >>> torch.distributed.init_process_group( + >>> backend=vendor_backend, world_size=N, init_method='...' + >>> ) + >>> model = DistributedDataParallel(model, device_ids=[i], output_device=i) + + Or you can use the latest API for initialization: + + >>> torch.distributed.init_process_group(device_id=i) + + In order to spawn up multiple processes per node, you can use either + ``torch.distributed.launch`` or ``torch.multiprocessing.spawn``. + + .. note:: + Please refer to `PyTorch Distributed Overview `__ + for a brief introduction to all features related to distributed training. + + .. note:: + ``DistributedDataParallel`` can be used in conjunction with + :class:`torch.distributed.optim.ZeroRedundancyOptimizer` to reduce + per-rank optimizer states memory footprint. Please refer to + `ZeroRedundancyOptimizer recipe `__ + for more details. + + .. note:: ``nccl`` backend is currently the fastest and highly recommended + backend when using GPUs. This applies to both single-node and + multi-node distributed training. + + .. note:: This module also supports mixed-precision distributed training. + This means that your model can have different types of parameters such + as mixed types of ``fp16`` and ``fp32``, the gradient reduction on these + mixed types of parameters will just work fine. + + .. note:: If you use ``torch.save`` on one process to checkpoint the module, + and ``torch.load`` on some other processes to recover it, make sure that + ``map_location`` is configured properly for every process. Without + ``map_location``, ``torch.load`` would recover the module to devices + where the module was saved from. + + .. 
note:: When a model is trained on ``M`` nodes with ``batch=N``, the + gradient will be ``M`` times smaller when compared to the same model + trained on a single node with ``batch=M*N`` if the loss is summed (NOT + averaged as usual) across instances in a batch (because the gradients + between different nodes are averaged). You should take this into + consideration when you want to obtain a mathematically equivalent + training process compared to the local training counterpart. But in most + cases, you can just treat a DistributedDataParallel wrapped model, a + DataParallel wrapped model and an ordinary model on a single GPU as the + same (E.g. using the same learning rate for equivalent batch size). + + .. note:: + Parameters are never broadcast between processes. The module performs + an all-reduce step on gradients and assumes that they will be modified + by the optimizer in all processes in the same way. Buffers + (e.g. BatchNorm stats) are broadcast from the module in process of rank + 0, to all other replicas in the system in every iteration. + + .. note:: + If you are using DistributedDataParallel in conjunction with the + :ref:`distributed-rpc-framework`, you should always use + :meth:`torch.distributed.autograd.backward` to compute gradients and + :class:`torch.distributed.optim.DistributedOptimizer` for optimizing + parameters. 
+ + Example:: + + >>> # xdoctest: +SKIP("undefined variables") + >>> import torch.distributed.autograd as dist_autograd + >>> from torch.nn.parallel import DistributedDataParallel as DDP + >>> import torch + >>> from torch import optim + >>> from torch.distributed.optim import DistributedOptimizer + >>> import torch.distributed.rpc as rpc + >>> from torch.distributed.rpc import RRef + >>> + >>> t1 = torch.rand((3, 3), requires_grad=True) + >>> t2 = torch.rand((3, 3), requires_grad=True) + >>> rref = rpc.remote("worker1", torch.add, args=(t1, t2)) + >>> ddp_model = DDP(my_model) + >>> + >>> # Setup optimizer + >>> optimizer_params = [rref] + >>> for param in ddp_model.parameters(): + >>> optimizer_params.append(RRef(param)) + >>> + >>> dist_optim = DistributedOptimizer( + >>> optim.SGD, + >>> optimizer_params, + >>> lr=0.05, + >>> ) + >>> + >>> with dist_autograd.context() as context_id: + >>> pred = ddp_model(rref.to_here()) + >>> loss = loss_func(pred, target) + >>> dist_autograd.backward(context_id, [loss]) + >>> dist_optim.step(context_id) + + .. note:: + DistributedDataParallel currently offers limited support for gradient + checkpointing with :meth:`torch.utils.checkpoint`. + If the checkpoint is done with use_reentrant=False (recommended), DDP + will work as expected without any limitations. + If, however, the checkpoint is done with use_reentrant=True (the default), + DDP will work as expected when there are no unused parameters in the model + and each layer is checkpointed at most once (make sure you are not passing + `find_unused_parameters=True` to DDP). We currently do not support the + case where a layer is checkpointed multiple times, or when there unused + parameters in the checkpointed model. + + .. note:: + To let a non-DDP model load a state dict from a DDP model, + :meth:`~torch.nn.modules.utils.consume_prefix_in_state_dict_if_present` + needs to be applied to strip the prefix "module." in the DDP state dict before loading. + + .. 
warning:: + Constructor, forward method, and differentiation of the output (or a + function of the output of this module) are distributed synchronization + points. Take that into account in case different processes might be + executing different code. + + .. warning:: + This module assumes all parameters are registered in the model by the + time it is created. No parameters should be added nor removed later. + Same applies to buffers. + + .. warning:: + This module assumes all parameters are registered in the model of each + distributed processes are in the same order. The module itself will + conduct gradient ``allreduce`` following the reverse order of the + registered parameters of the model. In other words, it is users' + responsibility to ensure that each distributed process has the exact + same model and thus the exact same parameter registration order. + + .. warning:: + This module allows parameters with non-rowmajor-contiguous strides. + For example, your model may contain some parameters whose + :class:`torch.memory_format` is ``torch.contiguous_format`` + and others whose format is ``torch.channels_last``. However, + corresponding parameters in different processes must have the + same strides. + + .. warning:: + This module doesn't work with :func:`torch.autograd.grad` (i.e. it will + only work if gradients are to be accumulated in ``.grad`` attributes of + parameters). + + .. warning:: + If you plan on using this module with a ``nccl`` backend or a ``gloo`` + backend (that uses Infiniband), together with a DataLoader that uses + multiple workers, please change the multiprocessing start method to + ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately + Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will + likely experience deadlocks if you don't change this setting. + + .. warning:: + You should never try to change your model's parameters after wrapping + up your model with ``DistributedDataParallel``. 
Because, when + wrapping up your model with ``DistributedDataParallel``, the constructor + of ``DistributedDataParallel`` will register the additional gradient + reduction functions on all the parameters of the model itself at the + time of construction. If you change the model's parameters afterwards, + gradient reduction functions no longer match the correct set of + parameters. + + .. warning:: + Using ``DistributedDataParallel`` in conjunction with the + :ref:`distributed-rpc-framework` is experimental and subject to change. + + Args: + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices. + 1) For single-device modules, ``device_ids`` can + contain exactly one device id, which represents the only + CUDA device where the input module corresponding to this process resides. + Alternatively, ``device_ids`` can also be ``None``. + 2) For multi-device modules and CPU modules, + ``device_ids`` must be ``None``. + + When ``device_ids`` is ``None`` for both cases, + both the input data for the forward pass and the actual module + must be placed on the correct device. + (default: ``None``) + output_device (int or torch.device): Device location of output for + single-device CUDA modules. For multi-device modules and + CPU modules, it must be ``None``, and the module itself + dictates the output location. (default: ``device_ids[0]`` + for single-device modules) + broadcast_buffers (bool): Flag that enables syncing (broadcasting) + buffers of the module at beginning of the ``forward`` + function. (default: ``True``) + init_sync (bool): Whether to sync during initialization to verify param + shapes and broadcast parameters and buffers. + WARNING: if this is set to False the user is required + to ensure themselves that the weights are the same on + all ranks. + (default: ``True``) + process_group: The process group to be used for distributed data + all-reduction. 
If ``None``, the default process group, which + is created by :func:`torch.distributed.init_process_group`, + will be used. (default: ``None``) + bucket_cap_mb: ``DistributedDataParallel`` will bucket parameters into + multiple buckets so that gradient reduction of each + bucket can potentially overlap with backward computation. + :attr:`bucket_cap_mb` controls the bucket size in + MebiBytes (MiB). If ``None``, a default size of 25 MiB + will be used. (default: ``None``) + find_unused_parameters (bool): Traverse the autograd graph from all + tensors contained in the return value of the + wrapped module's ``forward`` function. Parameters + that don't receive gradients as part of this + graph are preemptively marked as being ready to + be reduced. In addition, parameters that may have + been used in the wrapped module's ``forward`` + function but were not part of loss computation and + thus would also not receive gradients are + preemptively marked as ready to be reduced. + (default: ``False``) + check_reduction: This argument is deprecated. + gradient_as_bucket_view (bool): When set to ``True``, gradients will be views + pointing to different offsets of ``allreduce`` communication + buckets. This can reduce peak memory usage, where the + saved memory size will be equal to the total gradients + size. Moreover, it avoids the overhead of copying between + gradients and ``allreduce`` communication buckets. When + gradients are views, ``detach_()`` cannot be called on the + gradients. If hitting such errors, please fix it by + referring to the :meth:`~torch.optim.Optimizer.zero_grad` + function in ``torch/optim/optimizer.py`` as a solution. + Note that gradients will be views after first iteration, so + the peak memory saving should be checked after first iteration. + static_graph (bool): When set to ``True``, DDP knows the trained graph is + static. 
Static graph means 1) The set of used and unused + parameters will not change during the whole training loop; in + this case, it does not matter whether users set + ``find_unused_parameters = True`` or not. 2) How the graph is trained + will not change during the whole training loop (meaning there is + no control flow depending on iterations). + When static_graph is set to be ``True``, DDP will support cases that + can not be supported in the past: + 1) Reentrant backwards. + 2) Activation checkpointing multiple times. + 3) Activation checkpointing when model has unused parameters. + 4) There are model parameters that are outside of forward function. + 5) Potentially improve performance when there are unused parameters, + as DDP will not search graph in each iteration to detect unused + parameters when static_graph is set to be ``True``. + To check whether you can set static_graph to be ``True``, one way is to + check ddp logging data at the end of your previous model training, + if ``ddp_logging_data.get("can_set_static_graph") == True``, mostly you + can set ``static_graph = True`` as well. + + Example:: + >>> # xdoctest: +SKIP("undefined variables") + >>> model_DDP = torch.nn.parallel.DistributedDataParallel(model) + >>> # Training loop + >>> ... + >>> ddp_logging_data = model_DDP._get_ddp_logging_data() + >>> static_graph = ddp_logging_data.get("can_set_static_graph") + delay_all_reduce_named_params (list of tuple of str and torch.nn.Parameter): a list + of named parameters whose all reduce will be delayed when the gradient of + the parameter specified in ``param_to_hook_all_reduce`` is ready. Other + arguments of DDP do not apply to named params specified in this argument + as these named params will be ignored by DDP reducer. + param_to_hook_all_reduce (torch.nn.Parameter): a parameter to hook delayed all reduce + of parameters specified in ``delay_all_reduce_named_params``. 
+ skip_all_reduce_unused_params: When set to True, DDP will skip reducing unused parameters. + This requires that unused parameters remain the same across all ranks throughout + the entire training process. If this condition is not met, it may cause + desynchronization and result in training hang. + + + Attributes: + module (Module): the module to be parallelized. + + Example:: + + >>> # xdoctest: +SKIP("undefined variables") + >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') + >>> net = torch.nn.parallel.DistributedDataParallel(model) + """ + + # used to track whether the given thread is inside ddp forward for torchdynamo purposes + _active_ddp_module: Optional["DistributedDataParallel"] = None + + def __init__( + self, + module, + device_ids=None, + output_device=None, + dim=0, + broadcast_buffers=True, + init_sync=True, + process_group=None, + bucket_cap_mb=None, + find_unused_parameters=False, + check_reduction=False, + gradient_as_bucket_view=False, + static_graph=False, + delay_all_reduce_named_params=None, + param_to_hook_all_reduce=None, + mixed_precision: _MixedPrecision | None = None, + device_mesh=None, + skip_all_reduce_unused_params=False, + ): + super().__init__() + Joinable.__init__(self) + self._use_python_reducer = ( + torch._dynamo.utils.get_optimize_ddp_mode() == "python_reducer" + ) + self.logger: dist.Logger | None = None + if bool(delay_all_reduce_named_params is not None) != bool( + param_to_hook_all_reduce is not None + ): + self._log_and_throw( + ValueError, + "delay_all_reduce_named_params and param_to_hook_all_reduce " + "need to be set at the same time.", + ) + + if process_group and device_mesh is not None: + raise RuntimeError( + "Cannot specify both process_group and device_mesh arguments." 
+ ) + elif process_group is None and device_mesh is None: + self.process_group = _get_default_group() + elif device_mesh is None: + # pyrefly: ignore [bad-assignment] + self.process_group = process_group + else: + if device_mesh.ndim != 1: + raise RuntimeError( + f"Only 1D device mesh is supported, but got {device_mesh}." + ) + self.device_mesh = device_mesh + self.process_group = device_mesh.get_group(mesh_dim=0) + + root_mesh = device_mesh._get_root_mesh() + # if a root mesh is not the same as device_mesh, + # meaning the device_mesh is sliced out from the root mesh. + if root_mesh != device_mesh: + # TODO: This is a temporary work around to enable DDP + TP. + # We should do the logic in DDP so that the 2D implementation is + # sound and the state_dict works out of the box. + # This has to be done before check UninitializedParameter. + from torch.distributed.tensor.parallel.ddp import ( + _pre_dp_module_transform, + ) + + _pre_dp_module_transform(module) + + self._delay_all_reduce_params = [] + if hasattr(module, "_ddp_params_and_buffers_to_ignore"): + self.parameters_to_ignore = set(module._ddp_params_and_buffers_to_ignore) + else: + self.parameters_to_ignore = set() + if delay_all_reduce_named_params is not None: + for name, param in delay_all_reduce_named_params: + self.parameters_to_ignore.add(name) + self._delay_all_reduce_params.append(param) + + self._module_parameters = [ + p + for n, p in module.named_parameters() + if n not in self.parameters_to_ignore + ] + if not any(p.requires_grad for p in self._module_parameters): + if len(self._delay_all_reduce_params): + logger.info("Delay the AllReduce of all parameters.") + else: + self._log_and_throw( + RuntimeError, + "DistributedDataParallel is not needed when a module " + "doesn't have any parameter that requires a gradient.", + ) + + if device_ids is not None and len(device_ids) > 1: + self._log_and_throw( + ValueError, + "device_ids can only be None or contain a single element.", + ) + + 
self.is_multi_device_module = ( + len({p.device for p in self._module_parameters}) > 1 + ) + distinct_device_types = { + p.device.type for p in self._module_parameters if p.device is not None + } + if len(distinct_device_types) != 1: + self._log_and_throw( + ValueError, + "DistributedDataParallel's input module must be on " + f"the same type of devices, but input module parameters locate in {distinct_device_types}.", + ) + + self.device_type = next(iter(distinct_device_types)) + + if ( + device_ids is None + or len(device_ids) == 0 # For backward compatibility. + or self.device_type == "cpu" + or self.is_multi_device_module + ): + if device_ids or output_device: + self._log_and_throw( + ValueError, + "DistributedDataParallel device_ids and output_device arguments " + "only work with single-device/multiple-device GPU modules or CPU modules, " + f"but got device_ids {device_ids}, output_device {output_device}, " + f"and module parameters { ({p.device for p in self._module_parameters}) }.", + ) + + self.device_ids = None + self.output_device = None + else: + # pyrefly: ignore [bad-assignment] + self.device_ids = [_get_device_index(x, True) for x in device_ids] + + if output_device is None: + output_device = device_ids[0] + + # pyrefly: ignore [bad-assignment] + self.output_device = _get_device_index(output_device, True) + + self.static_graph = False + self.dim = dim + self.module = module + self.device = next(iter(self._module_parameters)).device + self.broadcast_buffers = broadcast_buffers + self.find_unused_parameters = find_unused_parameters + self.require_backward_grad_sync = True + self.require_forward_param_sync = True + self.gradient_as_bucket_view = gradient_as_bucket_view + self.mixed_precision = mixed_precision + if self.mixed_precision is not None: + logger.warning("Received mixed precision config %s", self.mixed_precision) + + if check_reduction: + # This argument is no longer used since the reducer + # will ensure reduction completes even if some 
parameters + # do not receive gradients. + warnings.warn( + "The `check_reduction` argument in `DistributedDataParallel` " + "module is deprecated. Please avoid using it.", + FutureWarning, + stacklevel=2, + ) + + # Check that a module does not have Uninitialized parameters + for param in self._module_parameters: + if isinstance(param, torch.nn.parameter.UninitializedParameter): + self._log_and_throw( + RuntimeError, + "Modules with uninitialized parameters can't be used with `DistributedDataParallel`. " + "Run a dummy forward pass to correctly initialize the modules", + ) + # used for intra-node param sync and inter-node sync as well + self.broadcast_bucket_size = 250 * 1024 * 1024 + + # reduction bucket size + if bucket_cap_mb is None: + # default case (bucket cap is 25 MiB) + bucket_cap_mb = 25 + self.bucket_bytes_cap_default = True + else: + self.bucket_bytes_cap_default = False + self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024) + + # Whether to perform input tensor CPU to GPU copies on a side-stream + self.use_side_stream_for_tensor_copies = ( + os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1" + ) + + # Initialize gradient buffers and register all reduce hook + self._delay_grad_buffer: torch.Tensor | None = None + self._delay_grad_views: list[torch.Tensor] = [] + self._delay_all_reduce_all_params = False + if len(self._delay_all_reduce_params) != 0: + self._register_delay_all_reduce_hook( + bucket_cap_mb=bucket_cap_mb, + param_to_hook_all_reduce=param_to_hook_all_reduce, + device_ids=device_ids, + ) + if self._delay_all_reduce_all_params: + return + + self.skip_all_reduce_unused_params = skip_all_reduce_unused_params + + # Build parameters for reducer. + parameters, expect_sparse_gradient = self._build_params_for_reducer() + + # All collectives during initialization are gated by this flag. + if init_sync: + # Verify model equivalence. + _verify_param_shape_across_processes(self.process_group, parameters) + # Sync params and buffers. 
Ensures all DDP models start off at the same value. + _sync_module_states( + module=self.module, + process_group=self.process_group, + broadcast_bucket_size=self.broadcast_bucket_size, + src=0, + params_and_buffers_to_ignore=self.parameters_to_ignore, + broadcast_buffers=self.broadcast_buffers, + ) + + # In debug mode, build a mapping of parameter index -> parameter. + param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters) + + # Builds reducer. + self._ddp_init_helper( + parameters, + expect_sparse_gradient, + param_to_name_mapping, + static_graph, + ) + self._comm_hooks: list[tuple[Callable, object]] = [] + + if self.mixed_precision is not None: + _setup_mixed_precision_params(self.mixed_precision, self.module) + _cast_buffers(self.mixed_precision, self.module) + # Stream used for async low precision copies. + self._mp_stream = torch.Stream() + self._submodule_to_event = defaultdict(deque) # type: ignore[var-annotated] + # Add forward pre-hook to root module to kick off copies to lower + # precision. + self.module.register_forward_pre_hook( + self._root_copy_hook, prepend=False, with_kwargs=True + ) + # Add forward pre hook to all submodules to wait for copy events + # before running computation. + for module in self.module.modules(): + module.register_forward_pre_hook( + self._module_wait_for_copy_hook, + prepend=False, + with_kwargs=True, + ) + # Set up callbacks in backward to upcast and use full precision + # params. TODO (rohan-varma): Make this compose with general + # comm hooks and apply_optimizer_in_backward. Importing inline to + # avoid circular import issue. 
+ from torch.distributed.algorithms.ddp_comm_hooks.mixed_precision_hooks import ( + _AllreduceUpcastHookState, + _reducer_allreduce_and_upcast_hook, + ) + + upcast_hook_state = _AllreduceUpcastHookState( + ddp_weakref=weakref.ref(self), + upcast_stream=torch.Stream(), + ) + self.register_comm_hook( + upcast_hook_state, + _reducer_allreduce_and_upcast_hook, + ) + # Inform reducer of reduced precision param dtype for correctness + # of type checks between gradient and bucket. + self.reducer._set_mixed_precision_param_dtype( # type: ignore[attr-defined] + self.mixed_precision.param_dtype + ) + + self._has_rebuilt_buckets = False + + if static_graph: + self._set_static_graph() + + self._lazy_init_ran = False + + # Register the AccumulateGrad post hooks if optimize_ddp is + # True. The hooks will be deregistered if compiled_autograd is not + # enabled. + self._accum_grad_hooks: list[RemovableHandle] = [] + if self._use_python_reducer: + # pyrefly: ignore [bad-assignment] + torch._inductor.config._fuse_ddp_communication = True + torch._inductor.config._fuse_ddp_bucket_size = bucket_cap_mb + # Directly adding this to the trace rule will disturb the users + # who are using DDPOptimizer. + torch._dynamo.trace_rules.LEGACY_MOD_INLINELIST.add( + "torch.nn.parallel.distributed" + ) + torch._dynamo.trace_rules.get_legacy_mod_inlinelist.cache_clear() + # NOTE: we should init these lazily + self._register_accum_grad_hook() + + # Whether or not DDPSink performs a clone. 
+ self._ddp_sink_clone = True + + def _register_accum_grad_hook(self): + import torch.distributed._functional_collectives as fcol + + def compiled_accum_grad_hook( + param, + *, + param_index: int, + ): + if not self.require_backward_grad_sync: + return + + if param.grad is None: + return + + if self._comm_hooks: + for hook, state in self._comm_hooks: + hook(state, (param.grad, param)) + else: + gradient = param.grad / self.process_group.size() + gradient = fcol.all_reduce(gradient, "sum", self.process_group) + param.grad.copy_(gradient) + + for index, param in enumerate(self._module_parameters): + if not param.requires_grad: + continue + self._accum_grad_hooks.append( + param.register_post_accumulate_grad_hook( + functools.partial( + compiled_accum_grad_hook, + param_index=index, + ) + ) + ) + + def _delayed_all_reduce_hook(self, grad): + world_size = dist.get_world_size(self.process_group) + + self._delay_grad_buffer.div_(world_size) # type: ignore[union-attr] + _ = dist.all_reduce( + self._delay_grad_buffer, group=self.process_group, async_op=True + ) + return grad + + def _register_delay_all_reduce_hook( + self, + bucket_cap_mb, + param_to_hook_all_reduce, + device_ids, + ): + # 1. Create gradient buffer + device = torch.device("cpu") if device_ids is None else device_ids[0] + self._delay_grad_buffer = torch.zeros( + sum(p.numel() for p in self._delay_all_reduce_params), + device=device, + ) + + # 2. Broadcast the parameters + detached_params = [p.detach() for p in self._delay_all_reduce_params] + dist._broadcast_coalesced(self.process_group, detached_params, bucket_cap_mb, 0) + + # 3. Hook all reduce to the specified parameter + param_to_hook_all_reduce.register_hook(self._delayed_all_reduce_hook) + + # 4. 
Build tensor views for gradients + offset = 0 + for param in self._delay_all_reduce_params: + grad_view = self._delay_grad_buffer[offset : (offset + param.numel())].view( + param.shape + ) + self._delay_grad_views.append(grad_view) + offset = offset + param.numel() + + # 5. Check whether the all reduce of all params requiring grad is delayed. + for module_name, module in self.module.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + if param.requires_grad: + full_name = f"{module_name}.{param_name}" + if full_name not in self.parameters_to_ignore: + # There is at least a param whose all reduce will not be delayed. + # In this case, we should not set self._delay_all_reduce_all_params + # to True. + return + self._delay_all_reduce_all_params = True + + def _setup_in_backward_optimizers(self): + # Check if user has used apply_optim_in_backward to overlap optimizer + # step + DDP backward. Current constraints: + # 1. Only allreduce is supported at the moment, no custom communication. + # 2. For DDP-managed parameters that have their optimizer run in + # backward, their gradients are set to ``None``. If your use case + # requires DDP parameters grad not to be set to ``None`` after their + # in-backward optimizer runs, please ping + # https://github.com/pytorch/pytorch/issues/90052. + # NOTE: we use self._module_parameters instead of .parameters() since + # the former excludes ignored (non-DDP managed) parameters. + if any(hasattr(p, "_in_backward_optimizers") for p in self._module_parameters): + torch._C._log_api_usage_once("ddp.optimizer_in_backward") + # Remove hooks that apply_optim_in_backward had registered because + # DDP customizes how optimizer is overlapped with backward due to + # the allreduce. 
+ param_to_handle_map = ( + dist.optim.apply_optimizer_in_backward.param_to_optim_hook_handle_map + ) + for p in self._module_parameters: + for handle in param_to_handle_map.get(p, []): + handle.remove() + + # Need a weakref to DDP instance to run all_reduce (from reducer) + # and get managed DDP parameters. + ddp_weakref = weakref.ref(self) + # Note: importing in function, otherwise this will cause a circular + # import. + from torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks import ( + _apply_optim_in_backward_hook, + ) + + self.register_comm_hook( + ddp_weakref, + _apply_optim_in_backward_hook( + gradient_is_bucket_view=self.gradient_as_bucket_view + ), + ) + + self.reducer._set_optimizer_in_backward() # type: ignore[attr-defined] + + def _fire_reducer_autograd_hook(self, idx, *unused): + """ + Fire the reducer's autograd hook to allreduce params in a Reducer bucket. + + Note that this is only used during mixed precision training as the + Reducer's hooks installed during construction time would not be called + as we're working in the low precision parameter setting. + """ + self.reducer._autograd_hook(idx) # type: ignore[attr-defined] + + def _root_copy_hook(self, *args: Any, **kwargs: Any) -> None: + """ + For DDP mixed precision, put low precision copies on separate stream and create events to wait for them. + + When training with DDP mixed precision, this root pre-forward hook kicks + off low precision copies on a separate stream and creates respective + events to wait for them. + """ + # Clear out previous iteration submodule to event. This is because we + # may have populated some events for modules that didn't end up being + # used. + self._submodule_to_event = defaultdict(deque) # type: ignore[var-annotated] + with self._mp_stream: + for submodule in self.module.modules(): + for param in submodule.parameters(recurse=False): + # Do not cast DDP ignored parameters. 
+ if hasattr(param, "_ddp_ignored") and param._ddp_ignored: + continue + _alloc_storage(param._mp_param, param.size()) + # copy() implicitly casts to low precision + with torch.no_grad(): + param._mp_param.copy_(param.data) + # TODO: when zero_grad(set_to_none=False) or in grad + # accumulation case, accumulated grads can be in fp32 + # which can cause errors when running DDP backwards due + # to mismatched incoming and accumulated gradient types. + # So we manually cast the accumulated grad down for now, + # in the future we may shift to FSDP style gradient + # accumulation management where the accumulated gradient + # is saved and .grad field is set to None, bypassing + # this issue. + if param.grad is not None: + param.grad.data = param.grad.to( + self.mixed_precision.param_dtype # type: ignore[union-attr] + ) + param.data = param._mp_param + copy_event = torch.Event() + copy_event.record() + self._submodule_to_event[submodule].append(copy_event) + + def _module_wait_for_copy_hook( + self, + module, + *args: Any, + **kwargs: Any, + ) -> None: + """Before carrying out computation, wait on the appropriate event to ensure low precision copies have finished.""" + try: + event = self._submodule_to_event[module].popleft() + except IndexError: + # copy event has already been waited on + return + + event.wait(stream=torch.accelerator.current_stream()) + for p in module.parameters(recurse=False): + # Don't register hooks if param does not require grad + if not p.requires_grad or (hasattr(p, "_ddp_ignored") and p._ddp_ignored): + continue + # We need to register autograd hook here instead of DDP's ctor + # since we're working with the low precision param. Register them + # via obtaining the gradient accumulator. 
+ tmp = p.expand_as(p) + grad_acc = tmp.grad_fn.next_functions[0][0] + + hook = grad_acc.register_hook( + functools.partial(self._fire_reducer_autograd_hook, p._idx) + ) + p._ddp_mp_hook_state = (grad_acc, hook) + + def _log_and_throw(self, err_type, err_msg): + if self.logger is not None: + self.logger.set_error_and_log(f"{str(err_type)}: {err_msg}") + raise err_type(err_msg) + + def _ddp_init_helper( + self, + parameters, + expect_sparse_gradient, + param_to_name_mapping, + static_graph, + ): + """ + DDP init helper function to manage parameters, grad hooks, logging, and SyncBatchNorm. + + Initialization helper function that does the following: + (1) bucketing the parameters for reductions + (2) resetting the bucketing states + (3) registering the grad hooks + (4) Logging construction-time DDP logging data + (5) passing a handle of DDP to SyncBatchNorm Layer + """ + # Notice, the parameters order is not in the order in which they are used, + # especially in models with control flow. + # + # Alongside parameters are not presented in the real execution order, + # if a certain model happens to also + # 1) have other collectives comm ops in its backward graph. + # 2) have unused parameter in subset ranks of the whole world. + # bucketing could insert ALL-REDUCE comm op too early on the rank with unused parameter, + # matching up with other collectives comm ops on other ranks unexpectedly. + # + # In order to handle this corner case, when the parameters are not in the real execution order, + # we don't do bucketing, thus only one ALL-REDUCE is inserted after all the gradients + # of the whole graph are computed. + # + # Notice, here we only disable bucketing for the first iteration. + # After the first iteration, it's OK to rebuild buckets, + # because "bucket rebuild" bucketizes parameters based on its real execution order in backward graph. + + # Can remove this branching once #73732 is landed. 
+ if static_graph is True or self.find_unused_parameters is False: + bucket_size_limits = [sys.maxsize] + else: + if self.bucket_bytes_cap_default: + bucket_size_limits = [ + dist._DEFAULT_FIRST_BUCKET_BYTES, + self.bucket_bytes_cap, + ] + else: + bucket_size_limits = [self.bucket_bytes_cap] + ( + bucket_indices, + per_bucket_size_limits, + ) = dist._compute_bucket_assignment_by_size( + parameters, + bucket_size_limits, + expect_sparse_gradient, + ) + + # Remember index for parameters if we are in mixed precision, as we + # need to pass in index to Reducer's autograd hook via python. + if self.mixed_precision is not None: + for i, p in enumerate(parameters): + p._idx = i + + # Note: reverse list of buckets because we want to approximate the + # order in which their gradients are produced, and assume they + # are used in the forward pass in the order they are defined. + self.reducer = dist.Reducer( + parameters, + list(reversed(bucket_indices)), + list(reversed(per_bucket_size_limits)), + self.process_group, + expect_sparse_gradient, + # The bucket size limit is specified in the constructor. + # Additionally, we allow for a single small bucket for parameters + # that are defined first, such that their gradients don't spill into + # a much larger bucket, adding unnecessary latency after gradient + # computation finishes. Experiments showed 1MB is a reasonable value. + self.bucket_bytes_cap, + self.find_unused_parameters, + self.gradient_as_bucket_view, + param_to_name_mapping, + # User can set dist._DEFAULT_FIRST_BUCKET_BYTES to tune DDP first + # bucket. + ( + dist._DEFAULT_FIRST_BUCKET_BYTES + if self.bucket_bytes_cap_default + else self.bucket_bytes_cap + ), + self.skip_all_reduce_unused_params, + self._use_python_reducer, + ) + + self.logger = dist.Logger(self.reducer) + # Set as a weak reference to avoid reference cycle between + # logger and reducer. 
+ self.reducer.set_logger(self.logger) + + has_sync_bn = False + for submodule in self.module.modules(): + if isinstance(submodule, torch.nn.SyncBatchNorm): + has_sync_bn = True + break + + # Set logging data that can be got during construction time. + self.logger.set_construction_data_and_log( + self.module.__class__.__name__, + [] if self.device_ids is None else self.device_ids, + -1 if self.output_device is None else self.output_device, + self.broadcast_buffers, + has_sync_bn, + static_graph, + ) + + # passing a handle to torch.nn.SyncBatchNorm layer + self._passing_sync_batchnorm_handle(self.module) + + def __getstate__(self): + self._check_default_group() + attrs = copy.copy(self.__dict__) + del attrs["process_group"] + del attrs["reducer"] + del attrs["logger"] + return attrs + + def __setstate__(self, state): + # If serializable, then the process group should be the default one + self.process_group = _get_default_group() + super().__setstate__(state) + self.__dict__.setdefault("require_forward_param_sync", True) + self.__dict__.setdefault("require_backward_grad_sync", True) + parameters, expect_sparse_gradient = self._build_params_for_reducer() + # In debug mode, build a mapping of parameter index -> parameter. + param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters) + # Builds reducer. + self._ddp_init_helper( + parameters, + expect_sparse_gradient, + param_to_name_mapping, + self.static_graph, + ) + if self.static_graph: + self.reducer._set_static_graph() + assert self.logger is not None + self.logger._set_static_graph() + + def _build_params_for_reducer(self): + # Build tuple of (module, parameter) for all parameters that require grads. + modules_and_parameters = [ + (module, parameter) + for module_name, module in self.module.named_modules() + for parameter in [ + param + # Note that we access module.named_parameters instead of + # parameters(module). 
parameters(module) is only needed in the + # single-process multi device case, where it accesses replicated + # parameters through _former_parameters. + for param_name, param in module.named_parameters(recurse=False) + if param.requires_grad + and f"{module_name}.{param_name}" not in self.parameters_to_ignore + ] + ] + + # Deduplicate any parameters that might be shared across child modules. + memo = set() + modules_and_parameters = [ + # "p not in memo" is the deduplication check. + # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed. + (m, p) + for m, p in modules_and_parameters + if p not in memo and not memo.add(p) # type: ignore[func-returns-value] + ] + + # Build list of parameters. + parameters = [parameter for _, parameter in modules_and_parameters] + + # Checks if a module will produce a sparse gradient. + def produces_sparse_gradient(module): + if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)): + return module.sparse + return False + + # Build list of booleans indicating whether or not to expect sparse + # gradients for the corresponding parameters. + expect_sparse_gradient = [ + produces_sparse_gradient(module) for module, _ in modules_and_parameters + ] + + self._assign_modules_buffers() + + return parameters, expect_sparse_gradient + + def _assign_modules_buffers(self): + """ + Assign self.module.named_buffers to self.modules_buffers. + + Assigns module buffers to self.modules_buffers which are then used to + broadcast across ranks when broadcast_buffers=True. Note that this + must be called every time buffers need to be synced because buffers can + be reassigned by user module, + see https://github.com/pytorch/pytorch/issues/63916. + """ + # Collect buffers for modules, filtering out buffers that should be ignored. 
+ named_module_buffers = [ + (buffer, buffer_name) + for buffer_name, buffer in self.module.named_buffers() + if buffer_name not in self.parameters_to_ignore + ] + self.modules_buffers = [ + buffer for (buffer, buffer_name) in named_module_buffers + ] + # Dict[str, tensor] representing module buffers not ignored by DDP. + self.named_module_buffers = { + buffer_name: buffer for (buffer, buffer_name) in named_module_buffers + } + + def _build_debug_param_to_name_mapping(self, parameters): + param_to_param_index = {parameters[i]: i for i in range(len(parameters))} + param_set = set(parameters) + param_index_to_param_fqn = {} + for module_name, module in self.module.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + fqn = f"{module_name}.{param_name}" + # Bypass ignored parameters since those are not reduced by DDP + # to begin with. + if fqn not in self.parameters_to_ignore and param.requires_grad: + if param not in param_set: + self._log_and_throw( + ValueError, + f"Param with name {fqn} found in module parameters, but not DDP parameters." + " This indicates a bug in DDP, please report an issue to PyTorch.", + ) + param_index = param_to_param_index[param] + param_index_to_param_fqn[param_index] = fqn + + # Ensure we covered all parameters + if len(param_set) != len(param_index_to_param_fqn): + self._log_and_throw( + ValueError, + ( + "Expected param to name mapping to cover all parameters, but" + f" got conflicting lengths: {len(param_set)} vs " + f"{len(param_index_to_param_fqn)}. This indicates a bug in DDP" + ", please report an issue to PyTorch." 
+ ), + ) + + return param_index_to_param_fqn + + def _get_parameters(self, m, recurse=True): + """Return a generator of module parameters.""" + + def model_parameters(m): + ps = ( + m._former_parameters.values() + if hasattr(m, "_former_parameters") + else m.parameters(recurse=False) + ) + yield from ps + + for mod in m.modules() if recurse else [m]: + yield from model_parameters(mod) + + def _check_default_group(self): + pickle_not_supported = False + try: + if self.process_group != _get_default_group(): + pickle_not_supported = True + except RuntimeError: + pickle_not_supported = True + + if pickle_not_supported: + self._log_and_throw( + RuntimeError, + "DDP Pickling/Unpickling are only supported " + "when using DDP with the default process " + "group. That is, when you have called " + "init_process_group and have not passed " + "process_group argument to DDP constructor", + ) + + @contextmanager + def no_sync(self): + r""" + Context manager to disable gradient synchronizations across DDP processes. + + Within this context, gradients will be accumulated on module + variables, which will later be synchronized in the first + forward-backward pass exiting the context. + + Example:: + + >>> # xdoctest: +SKIP("undefined variables") + >>> ddp = torch.nn.parallel.DistributedDataParallel(model, pg) + >>> with ddp.no_sync(): + >>> for input in inputs: + >>> ddp(input).backward() # no synchronization, accumulate grads + >>> ddp(another_input).backward() # synchronize grads + + .. warning:: + The forward pass should be included inside the context manager, or + else gradients will still be synchronized. 
+ """ + old_require_backward_grad_sync = self.require_backward_grad_sync + self.require_backward_grad_sync = False + try: + yield + finally: + self.require_backward_grad_sync = old_require_backward_grad_sync + + @classmethod + def _get_active_ddp_module(cls): + """`TorchDynamo` requires DDP's status and module for cooperative optimization.""" + return cls._active_ddp_module + + # note, this ctxmgr function is marked 'skip' in torchdynamo, so dynamo only kicks in + # for the 'module_to_run' underneath + # see torch._dynamo/eval_frame.py TorchPatcher.patch for more details + @contextmanager + @torch._disable_dynamo(recursive=False) + def _inside_ddp_forward(self): + DistributedDataParallel._active_ddp_module = self + try: + yield + finally: + DistributedDataParallel._active_ddp_module = None + + def _run_ddp_forward(self, *inputs, **kwargs): + if self._use_python_reducer: + return self.module(*inputs, **kwargs) # type: ignore[index] + else: + with self._inside_ddp_forward(): + return self.module(*inputs, **kwargs) # type: ignore[index] + + def _clear_grad_buffer(self): + # Making param.grad points to the grad buffers before backward is based on the + # assumption that the grad accumulation is done in place in autograd engine, + # for some edge cases, if the grad accumulation in autograd engine is not in + # place, then the param.grad and grad buffers are detached. + if self._delay_grad_buffer is not None: + # We batch zero_grad for all params by resetting the whole grad + # buffer when the grad of all params is set to None. 
+ all_param_grad_none = all( + param.grad is None for param in self._delay_all_reduce_params + ) + + for index, param in enumerate(self._delay_all_reduce_params): + if param.grad is None: + param.grad = self._delay_grad_views[index] + if not all_param_grad_none: + param.grad.zero_() + + if all_param_grad_none: + self._delay_grad_buffer.zero_() + + def _lazy_init(self): + # Initialization for DDP that occurs after construction, but lazily + # before the first forward pass. + self._setup_in_backward_optimizers() + self._lazy_init_ran = True + + def _pre_forward(self, *inputs, **kwargs): + if self._use_python_reducer: + return inputs, kwargs + + if not self._lazy_init_ran and not torch.compiler.is_compiling(): + self._lazy_init() + + if self._delay_all_reduce_all_params: + return inputs, kwargs + + if torch.is_grad_enabled() and self.require_backward_grad_sync: + assert self.logger is not None + self.logger.set_runtime_stats_and_log() + self.reducer.prepare_for_forward() + + # Notify the join context that this process has not joined, if + # needed + work = Join.notify_join_context(self) + if work: + self.reducer._set_forward_pass_work_handle( + work, + self._divide_by_initial_world_size, # type: ignore[arg-type] + ) + + # Calling _rebuild_buckets before forward computation, + # It may allocate new buckets before deallocating old buckets + # inside _rebuild_buckets. To save peak memory usage, + # call _rebuild_buckets before the peak memory usage increases + # during forward computation. + # This should be called only once during whole training period. + if torch.is_grad_enabled() and self.reducer._rebuild_buckets(): + logger.info("Reducer buckets have been rebuilt in this iteration.") + self._has_rebuilt_buckets = True + + # sync params according to location (before/after forward) user + # specified as part of hook, if hook was specified. 
+ if self._check_sync_bufs_pre_fwd(): + self._sync_buffers() + + if self._join_config.enable: + # Notify joined ranks whether they should sync in backwards pass or not. + self._check_global_requires_backward_grad_sync(is_joined_rank=False) + + if self.device_ids: + moved_inputs, moved_kwargs = _to_kwargs( + inputs, + kwargs, + torch.device(self.device_type, self.device_ids[0]), + self.use_side_stream_for_tensor_copies, + ) + args, kwargs = moved_inputs[0], moved_kwargs[0] + # Cast inputs to reduced precision if needed. + if self.mixed_precision is not None: + args, kwargs = _cast_forward_inputs( + self.mixed_precision.param_dtype, + *args, + **kwargs, + ) + return args, kwargs + else: + # Cast inputs to reduced precision if needed. + # TODO (rohan-varma) test this codepath. + if self.mixed_precision is not None: + inputs, kwargs = _cast_forward_inputs( + self.mixed_precision.param_dtype, + *inputs, + **kwargs, + ) + return inputs, kwargs + + def _post_forward(self, output): + if self._use_python_reducer: + return output + + if self._delay_all_reduce_all_params: + self._clear_grad_buffer() + return output + + # sync params according to location (before/after forward) user + # specified as part of hook, if hook was specified. + if self._check_sync_bufs_post_fwd(): + self._sync_buffers() + + if torch.is_grad_enabled() and self.require_backward_grad_sync: + self.require_forward_param_sync = True + # We'll return the output object verbatim since it is a freeform + # object. We need to find any tensors in this object, though, + # because we need to figure out which parameters were used during + # this forward pass, to ensure we short circuit reduction for any + # unused parameters. Only if `find_unused_parameters` is set. + if self.find_unused_parameters and not self.static_graph: + # Do not need to populate this for static graph. 
+ self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + self.require_forward_param_sync = False + + # TODO: DDPSink is currently enabled for unused parameter detection and + # static graph training for first iteration. + if (self.find_unused_parameters and not self.static_graph) or ( + self.static_graph and not self._static_graph_delay_allreduce_enqueued + ): + ( + output_tensor_list, + treespec, + output_is_rref, + ) = _tree_flatten_with_rref(output) + output_placeholders: list[torch.Tensor | None] = [ + None for _ in range(len(output_tensor_list)) + ] + # Do not touch tensors that have no grad_fn, which can cause issues + # such as https://github.com/pytorch/pytorch/issues/60733 + for i, output in enumerate(output_tensor_list): + if torch.is_tensor(output) and output.grad_fn is None: + output_placeholders[i] = output + + # When find_unused_parameters=True, makes tensors which require grad + # run through the DDPSink backward pass. When not all outputs are + # used in loss, this makes those corresponding tensors receive + # undefined gradient which the reducer then handles to ensure + # param.grad field is not touched and we don't error out. + passthrough_tensor_list = _DDPSink.apply( + weakref.ref(self), + *output_tensor_list, + ) + for i in range(len(output_placeholders)): + if output_placeholders[i] is None: + output_placeholders[i] = passthrough_tensor_list[i] + + # Reconstruct output data structure. 
+ output = _tree_unflatten_with_rref( + output_placeholders, treespec, output_is_rref + ) + + # At the end of the forward pass, reset the grad buffer and grad views + self._clear_grad_buffer() + return output + + def forward(self, *inputs, **kwargs): + with torch.autograd.profiler.record_function("DistributedDataParallel.forward"): + inputs, kwargs = self._pre_forward(*inputs, **kwargs) + output = ( + self.module.forward(*inputs, **kwargs) + if self._delay_all_reduce_all_params + else self._run_ddp_forward(*inputs, **kwargs) + ) + return self._post_forward(output) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def to_kwargs(self, inputs, kwargs, device_id): + # Kept for BC + return _to_kwargs( + inputs, + kwargs, + torch.device(self.device_type, device_id), + self.use_side_stream_for_tensor_copies, + ) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + def train(self, mode=True): + super().train(mode) + return self + + # When running in join mode, schedules an allreduce to notify joined ranks + # of whether backwards pass synchronization will run this iteration or not. + def _check_global_requires_backward_grad_sync(self, is_joined_rank): + if not is_joined_rank and self.require_backward_grad_sync: + requires_sync_tensor = torch.ones(1, device=self.device) + else: + requires_sync_tensor = torch.zeros(1, device=self.device) + + work = dist.all_reduce( + requires_sync_tensor, group=self.process_group, async_op=True + ) + + # (kwen2501) This if condition is a plain translation of previous + # behavior, i.e. in the `is_joined_rank=False` case, `work.wait()` + # is not called and it doesn't care about the result. I am guessing + # that it just wants to fire a matching all-reduce and does not want + # the main stream to wait. 
+ if is_joined_rank: + work.wait() + should_sync_backwards = requires_sync_tensor.item() != 0 + return should_sync_backwards + else: + return None # Return value is not/should not be used. + + # When running in join mode, checks and performs sync of module buffers if + # the models have buffers that should be synchronized in the forward pass. + def _check_and_sync_module_buffers(self): + if self._check_sync_bufs_pre_fwd(): + authoritative_rank = self._find_common_rank(self._distributed_rank, False) + self._sync_module_buffers(authoritative_rank) + + # When running in join model, agrees upon a common rank and broadcast model + # parameters to all other ranks. + def _sync_final_model(self, is_last_joiner): + # Agree upon the process that will be the authoritative model copy. + # The current rank is a candidate for being the authoritative copy if + # is_last_joiner=True. We break ties via picking the larger rank. + self._authoritative_rank = self._find_common_rank( + self._distributed_rank, is_last_joiner + ) + _sync_module_states( + module=self.module, + process_group=self.process_group, + broadcast_bucket_size=self.broadcast_bucket_size, + src=self._authoritative_rank, + params_and_buffers_to_ignore=self.parameters_to_ignore, + broadcast_buffers=self.broadcast_buffers, + ) + + # Schedule comm ops to match those scheduled in the reducer's backward + # pass. + def _match_all_reduce_for_bwd_pass(self): + comm_work = [] + # Schedule comm in the same order as Reducer schedules them, i.e. + # the order of the buckets. Retrieving the bucket order from the reducer + # ensures that we keep the same order in join mode, such as when bucket + # order is rebuilt dynamically. + + # Returns grad_buckets in order, but real tensors are substituted with + # zero tensors of the same shape. + grad_buckets = self.reducer._get_zeros_like_grad_buckets() + for grad_bucket in grad_buckets: + # Joined processes contribute zero gradient. 
In the case that + # divide_by_initial_world_size=True, we divide grads by the static + # world size, if not, the dividing factor is reduced by the number + # of joined processes. + work = self.reducer._run_comm_hook(grad_bucket) + comm_work.append(work) + for work in comm_work: + work.wait() + + # Allreduces the used parameter mapping across ranks. + def _match_unused_params_allreduce(self): + locally_used_param_map = self.reducer._get_local_used_map() + self.process_group.allreduce(locally_used_param_map) + + def join( + self, + divide_by_initial_world_size: bool = True, + enable: bool = True, + throw_on_early_termination: bool = False, + ): + r""" + Context manager for training with uneven inputs across processes in DDP. + + This context manager will keep track of already-joined DDP processes, + and "shadow" the forward and backward passes by inserting collective + communication operations to match with the ones created by non-joined + DDP processes. This will ensure each collective call has a corresponding + call by already-joined DDP processes, preventing hangs or errors that + would otherwise happen when training with uneven inputs across + processes. Alternatively, if the flag ``throw_on_early_termination`` is + specified to be ``True``, all trainers will throw an error once one rank + runs out of inputs, allowing these errors to be caught and handled + according to application logic. + + Once all DDP processes have joined, the context manager will broadcast + the model corresponding to the last joined process to all processes to + ensure the model is the same across all processes + (which is guaranteed by DDP). + + To use this to enable training with uneven inputs across processes, + simply wrap this context manager around your training loop. No further + modifications to the model or data loading is required. + + .. 
warning:: + If the model or training loop this context manager is wrapped around + has additional distributed collective operations, such as + ``SyncBatchNorm`` in the model's forward pass, then the flag + ``throw_on_early_termination`` must be enabled. This is because this + context manager is not aware of non-DDP collective communication. + This flag will cause all ranks to throw when any one rank + exhausts inputs, allowing these errors to be caught and recovered + from across all ranks. + + Args: + divide_by_initial_world_size (bool): If ``True``, will divide + gradients by the initial ``world_size`` DDP training was launched + with. If ``False``, will compute the effective world size + (number of ranks that have not depleted their inputs yet) and + divide gradients by that during allreduce. Set + ``divide_by_initial_world_size=True`` to ensure every input + sample including the uneven inputs have equal weight in terms of + how much they contribute to the global gradient. This is + achieved by always dividing the gradient by the initial + ``world_size`` even when we encounter uneven inputs. If you set + this to ``False``, we divide the gradient by the remaining + number of nodes. This ensures parity with training on a smaller + ``world_size`` although it also means the uneven inputs would + contribute more towards the global gradient. Typically, you + would want to set this to ``True`` for cases where the last few + inputs of your training job are uneven. In extreme cases, where + there is a large discrepancy in the number of inputs, setting + this to ``False`` might provide better results. + enable (bool): Whether to enable uneven input detection or not. Pass + in ``enable=False`` to disable in cases where you know that + inputs are even across participating processes. Default is + ``True``. + throw_on_early_termination (bool): Whether to throw an error + or continue training when at least one rank has exhausted + inputs. 
If ``True``, will throw upon the first rank reaching end + of data. If ``False``, will continue training with a smaller + effective world size until all ranks are joined. Note that if + this flag is specified, then the flag + ``divide_by_initial_world_size`` would be ignored. Default + is ``False``. + + + Example:: + + >>> # xdoctest: +SKIP("Distributed") + >>> import torch + >>> import torch.distributed as dist + >>> import os + >>> import torch.multiprocessing as mp + >>> import torch.nn as nn + >>> # On each spawned worker + >>> def worker(rank): + >>> dist.init_process_group("nccl", rank=rank, world_size=2) + >>> torch.cuda.set_device(rank) + >>> model = nn.Linear(1, 1, bias=False).to(rank) + >>> model = torch.nn.parallel.DistributedDataParallel( + >>> model, device_ids=[rank], output_device=rank + >>> ) + >>> # Rank 1 gets one more input than rank 0. + >>> inputs = [torch.tensor([1]).float() for _ in range(10 + rank)] + >>> with model.join(): + >>> for _ in range(5): + >>> for inp in inputs: + >>> loss = model(inp).sum() + >>> loss.backward() + >>> # Without the join() API, the below synchronization will hang + >>> # blocking for rank 1's allreduce to complete. + >>> torch.cuda.synchronize(device=rank) + """ + return Join( + [self], + enable, + throw_on_early_termination, + divide_by_initial_world_size=divide_by_initial_world_size, + ) + + def join_hook( + self, + **kwargs, + ): + r""" + DDP join hook enables training on uneven inputs by mirroring communications in forward and backward passes. + + Arguments: + kwargs (dict): a :class:`dict` containing any keyword arguments + to modify the behavior of the join hook at run time; all + :class:`Joinable` instances sharing the same join context + manager are forwarded the same value for ``kwargs``. + + The hook supports the following keyword arguments: + divide_by_initial_world_size (bool, optional): + If ``True``, then gradients are divided by the initial world + size that DDP was launched with. 
+ If ``False``, then gradients are divided by the effective world + size (i.e. the number of non-joined processes), meaning that + the uneven inputs contribute more toward the global gradient. + Typically, this should be set to ``True`` if the degree of + unevenness is small but can be set to ``False`` in extreme + cases for possibly better results. + Default is ``True``. + """ + divide_by_initial_world_size = kwargs.get("divide_by_initial_world_size", True) + return _DDPJoinHook( + self, divide_by_initial_world_size=divide_by_initial_world_size + ) + + @property + def join_device(self): + return self.device + + @property + def join_process_group(self): + return self.process_group + + def _register_buffer_comm_hook( + self, + state, + hook: Callable, + comm_hook_location=_BufferCommHookLocation.POST_FORWARD, + ): + r""" + Allow custom registration of hooks that define how buffer are synchronized across ranks. + + The hook takes in an optional state and is passed in a Dict[str, Tensor] + corresponding to buffer names and the buffers, and can run arbitrary reductions + on buffers as opposed to DDP's default broadcast from rank 0. This is useful for + example if a counter needs to be summed or averaged across ranks every iteration. + + Args: + state (Any): Optional state that is passed to the hook. + hook (Callable): Callable with the following signature: + ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]`` + comm_hook_location (_BufferCommHookLocation): Enum value indicating + where to run the hook. + _BufferCommHookLocation.PRE_FORWARD means that the + hook will run _before_ the forward pass, and + _BufferCommHookLocation.POST_FORWARD means that the + hook will run _after_ the forward pass. + + NOTE: To maximize performance, users can return a + List[torch.futures.Future] from their hook, and DDP will + install and await these hooks appropriately at the end of + the backward pass. 
This will ensure all buffers are + synchronized by the end of the backward pass. If this + setting is used, it is recommended to pass + comm_hook_location=_BufferCommHookLocation.POST_FORWARD, + which will trigger the hook after the forward pass. + If _BufferCommHookLocation.PRE_FORWARD is used, users must + ensure appropriate synchronization when manipulating GPU + buffers in the forward pass. + """ + assert callable(hook) + self.buffer_hook = _BufferCommHook( + buffer_comm_hook=hook, + buffer_comm_hook_state=state, + buffer_comm_hook_location=comm_hook_location, + ) + + def register_comm_hook(self, state: object, hook: Callable): + r""" + Register communication hook for user-defined DDP aggregation of gradients across multiple workers. + + This hook would be very useful for researchers to try out new ideas. For + example, this hook can be used to implement several algorithms like GossipGrad + and gradient compression which involve different communication strategies for + parameter syncs while running Distributed DataParallel training. + + Args: + state (object): Passed to the hook to maintain any state information during the training process. + Examples include error feedback in gradient compression, + peers to communicate with next in GossipGrad, etc. + + It is locally stored by each worker + and shared by all the gradient tensors on the worker. + hook (Callable): Callable with the following signature: + ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``: + + This function is called once the bucket is ready. The + hook can perform whatever processing is needed and return + a Future indicating completion of any async work (ex: allreduce). + If the hook doesn't perform any communication, it still + must return a completed Future. The Future should hold the + new value of grad bucket's tensors. 
Once a bucket is ready, + c10d reducer would call this hook and use the tensors returned + by the Future and copy grads to individual parameters. + Note that the future's return type must be a single tensor. + + We also provide an API called ``get_future`` to retrieve a + Future associated with the completion of ``c10d.ProcessGroup.Work``. + ``get_future`` is currently supported for NCCL and also supported for most + operations on GLOO and MPI, except for peer to peer operations (send/recv). + + .. warning :: + Grad bucket's tensors will not be predivided by world_size. User is responsible + to divide by the world_size in case of operations like allreduce. + + .. warning :: + DDP communication hook can only be registered once and should be registered + before calling backward. + + .. warning :: + The Future object that hook returns should contain a single tensor + that has the same shape with the tensors inside grad bucket. + + .. warning :: + ``get_future`` API supports NCCL, and partially GLOO and MPI backends (no support + for peer-to-peer operations like send/recv) and will return a ``torch.futures.Future``. + + Example:: + Below is an example of a noop hook that returns the same tensor. + + >>> # xdoctest: +SKIP('undefined name') + >>> def noop(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]: + >>> fut = torch.futures.Future() + >>> fut.set_result(bucket.buffer()) + >>> return fut + >>> ddp.register_comm_hook(state=None, hook=noop) + + Example:: + Below is an example of a Parallel SGD algorithm where gradients are encoded before + allreduce, and then decoded after allreduce. + + >>> # xdoctest: +SKIP('undefined name') + >>> def encode_and_decode(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]: + >>> encoded_tensor = encode(bucket.buffer()) # encode gradients + >>> fut = torch.distributed.all_reduce(encoded_tensor).get_future() + >>> # Define the then callback to decode. 
+ >>> def decode(fut): + >>> decoded_tensor = decode(fut.value()[0]) # decode gradients + >>> return decoded_tensor + >>> return fut.then(decode) + >>> ddp.register_comm_hook(state=None, hook=encode_and_decode) + """ + self._check_comm_hook(hook) + assert self.logger is not None + self.logger._set_comm_hook_name(hook.__qualname__) + self._comm_hooks.append((hook, state)) + dist._register_comm_hook(self.reducer, state, hook) + + def _register_builtin_comm_hook(self, comm_hook_type): + r""" + Register a built-in communication hook that specifies how DDP aggregates gradients across multiple workers. + + The built-in hooks aim to provide efficient C++ implementations for certain hooks, + which might not be as efficient if implemented in Python using a Python communication hook. + + Args: + comm_hook_type (dist.BuiltinCommHookType): type of communication hook, such as ALLREDUCE, FP16_COMPRESS, etc. + + .. warning :: + DDP communication hook can only be registered once and should be registered + before calling backward. + + Example:: + Below is an example of a FP16 compression where gradients are + compressed into 16-bit floating-point numbers before allreduce, and + then decompressed after allreduce. + + >>> # xdoctest: +SKIP('undefined name') + >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS) + + """ + assert self.logger is not None + self.logger._set_comm_hook_name(str(comm_hook_type)) + dist._register_builtin_comm_hook(self.reducer, comm_hook_type) + + def _register_fused_optim(self, optim: type, *args, optim_params=None, **kwargs): + r""" + Register an optimizer in DDP to optimize parameter immediately after its gradient reduction. + + Registers an optimizer with DDP such that the optimization for a + parameter will run immediately when that parameter's gradient is + finished with reduction, instead of waiting for all parameters' + gradients to finish reduction. 
This can result in a training speedup + depending on your workload since the optimizer can run while gradient + reduction for other parameters are still ongoing. In addition, this has + the potential to reduce peak memory consumption during training, as it + only needs to load the per-parameter optimizer states of a single + parameter at a time, instead of loading all per-parameter optimizer + states at once. + + Args: + optim (Type): a ``torch.optim.Optimizer`` class to be registered + as a fused optimizer. + *args (Sequence[Any]): Arguments to forward to `optim`. + optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters + to optimize, similar to `params` argument of traditional `torch.optim` + Optimizers. If this is omitted, all DDP model parameters will be + optimized. + **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim`. + + .. warning :: + _register_fused_optim should only be called once on a DDP instance, + and registering multiple fused optimizers for the same DDP model + is not currently supported. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + .. warning :: + _register_fused_optim and register_comm_hook currently do not + compose together, meaning that custom DDP communication hooks are + not supported with overlapped optimizers. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + .. warning :: + Gradient accumulation and DDP `no_sync` are currently not supported + with overlapped optimizer. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. 
+ + Example:: + + >>> # xdoctest: +SKIP("No rendezvous handler") + >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') + >>> net = torch.nn.parallel.DistributedDataParallel(model, pg) + >>> lr = 1e-2 + >>> betas = (0.9, 0.99) + >>> eps = 1e-6 + >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps) + >>> # Example with subset of parameters + >>> params_to_opt = [list(net.parameters())[0]] + >>> net._register_fused_optim( + ... torch.optim.Adam, lr, optim_params=params_to_opt, betas=betas, eps=eps + ... ) + """ + # Note: importing in function, otherwise this will cause a circular + # import as optimizer_overlap module needs to import DistributedDataParallel. + from torch.distributed.algorithms._optimizer_overlap import _as_overlapped_optim + + overlapped_optim = _as_overlapped_optim(optim, optim_params, *args, **kwargs) + try: + overlapped_optim.register_ddp(self) + except NotImplementedError as e: + raise RuntimeError( + f"{optim} does not support overlapped DDP. Please file an issue to PyTorch or the respective owner of {optim}." 
+ ) from e + + def _distributed_broadcast_coalesced( + self, tensors, buffer_size, authoritative_rank=0 + ): + dist._broadcast_coalesced( + self.process_group, tensors, buffer_size, authoritative_rank + ) + + def _check_sync_bufs_post_fwd(self): + return ( + self.will_sync_module_buffers() + and hasattr(self, "buffer_hook") + and self.buffer_hook.buffer_comm_hook_location + == _BufferCommHookLocation.POST_FORWARD + ) + + def _check_sync_bufs_pre_fwd(self): + return self.will_sync_module_buffers() and ( + not hasattr(self, "buffer_hook") + or self.buffer_hook.buffer_comm_hook_location + == _BufferCommHookLocation.PRE_FORWARD + ) + + def will_sync_module_buffers(self): + return ( + self.require_forward_param_sync + and self.broadcast_buffers + and len(self.modules_buffers) > 0 + ) + + def _find_common_rank(self, input_rank, rank_cond): + # -1 indicates that this rank is not under consideration to be the + # common_rank + rank_to_use = torch.tensor( + [input_rank if rank_cond else -1], + device=self.device, + ) + dist.all_reduce(rank_to_use, op=ReduceOp.MAX, group=self.process_group) + if rank_to_use.item() == -1: + self._log_and_throw( + ValueError, + "BUG! Expected rank_cond to be true for at least one process." + " This indicates a bug in PyTorch, please report an issue.", + ) + return rank_to_use.item() + + def _sync_buffers(self): + with torch.no_grad(): + # module buffer sync + # Synchronize buffers across processes. + # If we are running DDP with the join manager, we have to agree + # upon a rank to sync module buffers from, since rank 0 may + # already have been joined and have stale module buffers. + if self._join_config.enable: + authoritative_rank = self._find_common_rank( + self._distributed_rank, True + ) + else: + # The process with rank 0 is considered the authoritative copy. + authoritative_rank = 0 + # Update self.modules_buffers in case any buffers were + # reassigned. 
+ self._assign_modules_buffers() + self._sync_module_buffers(authoritative_rank) + + def _sync_module_buffers(self, authoritative_rank): + if not hasattr(self, "buffer_hook"): + self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + else: + hook = self.buffer_hook.buffer_comm_hook + state = self.buffer_hook.buffer_comm_hook_state + futs = hook(state, self.named_module_buffers) + if futs is not None: + self.reducer._install_post_backward_futures(futs) + + def _default_broadcast_coalesced( + self, bufs=None, bucket_size=None, authoritative_rank=0 + ): + """ + Broadcasts buffers from rank 0 to rest of workers. + + If bufs, bucket_size are None, default values self.modules_buffers + and self.broadcast_bucket_size are used instead. + """ + if bufs is None: + bufs = self.modules_buffers + if bucket_size is None: + bucket_size = self.broadcast_bucket_size + + self._distributed_broadcast_coalesced(bufs, bucket_size, authoritative_rank) + + def _passing_sync_batchnorm_handle(self, module): + for layer in module.modules(): + if isinstance(layer, torch.nn.modules.SyncBatchNorm): + if self.device_type == "cpu": + self._log_and_throw( + ValueError, + "SyncBatchNorm layers only work with GPU modules", + ) + + def _check_comm_hook(self, hook): + if not callable(hook): + self._log_and_throw(TypeError, "Communication hook must be callable.") + + sig = inspect.signature(hook) + if ( + sig.parameters["bucket"].annotation != inspect._empty + and sig.parameters["bucket"].annotation != dist.GradBucket + ): + self._log_and_throw( + ValueError, + "Communication hook: bucket annotation should be dist.GradBucket.", + ) + + if ( + sig.return_annotation != inspect._empty + and sig.return_annotation != torch.futures.Future[torch.Tensor] + ): + self._log_and_throw( + ValueError, + "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].", + ) + + if hook.__name__ in ["bf16_compress_hook", "bf16_compress_wrapper_hook"]: + cuda_supported = ( + 
torch.version.cuda is not None + ) or torch.version.hip is not None + nccl_supported = ( + dist.is_available() + and dist.is_nccl_available() + and torch.cuda.nccl.version() >= (2, 10) + ) + xpu_xccl_supported = ( + dist.is_available() + and dist.is_xccl_available() + and torch.xpu.is_available() + ) + + if not ((cuda_supported and nccl_supported) or xpu_xccl_supported): + self._log_and_throw( + TypeError, + "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+ or XPU and XCCL", + ) + + @property + def _distributed_rank(self): + return dist.get_rank(self.process_group) + + @staticmethod + def _get_data_parallel_params(module, named_params=False): + """Return a generator of parameters managed by a given DDP unit.""" + for param in ( + module.parameters() if not named_params else module.named_parameters() + ): + if not hasattr(param, "_ddp_ignored"): + yield param + + @staticmethod + def _set_params_and_buffers_to_ignore_for_model( + module, params_and_buffers_to_ignore + ): + """ + Set parameters and buffers to be ignored by DDP. + + Expected format for parameters is the fully qualified name: {module_name}.{param_name}, and + similarly, {module_name}.{buffer_name} for buffers. For example: + params_to_ignore = [] + # NB: model here is vanilla PyTorch module, not yet wrapped with DDP. + for module_name, module in model.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + if should_ignore(param): + # Create expected format + fqn = f"{module_name}.{param_name}" + params_to_ignore.append(fqn) + torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model( + model, + params_to_ignore + ) + """ + # This is a workaround to set parameters and buffers DDP should ignore + # during synchronization. It will be removed when the API is finalized + # as part of addressing https://github.com/pytorch/pytorch/issues/43690. 
+ module._ddp_params_and_buffers_to_ignore = params_and_buffers_to_ignore + for name, param in module.named_parameters(): + if name in params_and_buffers_to_ignore: + param._ddp_ignored = True + for name, buffer in module.named_buffers(): + if name in params_and_buffers_to_ignore: + buffer._ddp_ignored = True + + def _get_ddp_logging_data(self): + r""" + Return a dictionary of logging data for debugging and analysis. + + This interface can be called after DistributedDataParallel() is + constructed. It returns a dictionary of logging data. It could help + for debugging and analysis. The logging data includes DistributedDataParallel + constructor input parameters, some internal states of DistributedDataParallel + and performance metrics. Simply print the dictionary and see what + these metrics are. + This is a prototype interface and subject to change in the future. + """ + assert self.logger is not None + ddp_logging_data = self.logger._get_ddp_logging_data() + return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map} + + def _set_ddp_runtime_logging_sample_rate(self, sample_rate): + r""" + Set sample_rate of collecting runtime stats. + + This interface allows users to set sample_rate of collecting + runtime stats. The runtime stats will be recorded for the + first 10 iterations, after 10 iterations runtime stats will be + recorded once every "sample_rate" training iterations. In + default, runtime stats are recorded for the first 10 iterations, + after 10 iterations runtime stats are recorded once every + "kDDPRuntimeLoggingSampleRate=100" training iterations. + This is a prototype interface and subject to change in the future. + """ + if sample_rate < 1: + self._log_and_throw( + ValueError, + "DDP runtime logging sample rate should be equal or greater than 1", + ) + self.reducer._set_ddp_runtime_logging_sample_rate(sample_rate) + + def _set_static_graph(self): + """ + Set static graph for DDP. 
+ + It is recommended to set static graph in the DDP constructor, which will + call this private API internally. + """ + # If self.static_graph has been set, no need to set it again + if self.static_graph: + warnings.warn( + "You've set static_graph to be True, no need to set it again.", + stacklevel=2, + ) + return + self.static_graph = True + self._static_graph_delay_allreduce_enqueued = False + self.reducer._set_static_graph() + assert self.logger is not None + self.logger._set_static_graph() + if self.find_unused_parameters: + warnings.warn( + "You passed find_unused_parameters=true to DistributedDataParallel, " + "`_set_static_graph` will detect unused parameters automatically, so " + "you do not need to set find_unused_parameters=true, just be sure these " + "unused parameters will not change during training loop while calling " + "`_set_static_graph`.", + stacklevel=2, + ) + + def _remove_autograd_hooks(self): + """Remove autograd hooks registered by the reducer on the model parameters.""" + self.reducer._remove_autograd_hooks() + + def _check_reducer_finalized(self): + """ + Check if the reducer has processed all buckets and finalized the backward appropriately. + + It is useful to call this method after calling .backward() in your training loop + in order to avoid subsequent hard to debug errors down the road due to the + reducer not finalizing backward. + """ + self.reducer._check_reducer_finalized() + + def _set_sparse_metadata(self, global_unique_ids): + self.reducer._set_sparse_metadata(global_unique_ids) + + def _update_process_group(self, new_process_group): + """ + Dynamically updates the process group for DDP so that we can shrink/expand DDP + world size without having to reinitialize DDP. + + NOTE: If you are using custom communications hooks via, register_comm_hook, + you need to update the process groups for those hooks separately. + """ + # Force a rebuild of buckets for a new process group. 
This ensures all ranks + # are synchronized in terms of when they will rebuild buckets and also + # re-evaluates previous assumptions of buckets given the world size might have + # changed. + self._has_rebuilt_buckets = False + self.reducer._reset_state() + + if not _rank_not_in_group(new_process_group): + self.process_group = new_process_group + self.reducer._update_process_group(new_process_group) + + def _set_ddp_sink_clone(self, val: bool): + """ + Sets whether or not DDPSink should clone the output tensors or not. + The default is True since if the loss is modified in place we run + into the view is modified in-place error. + + Although, cloning the tensors can add significant memory and + performance hit if the number and size of tensors are large. As + a result, this can be set to False if you are not modifying the + loss in place. + """ + self._ddp_sink_clone = val diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..6c26aaf5048e908ab72978b9d8562d4997c17928 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py @@ -0,0 +1,135 @@ +import threading +from collections.abc import Sequence +from typing import Any, cast + +import torch +from torch._utils import ExceptionWrapper +from torch.cuda._utils import _get_device_index +from torch.nn.modules import Module + + +__all__ = ["get_a_var", "parallel_apply"] + + +def get_a_var( + obj: torch.Tensor | list[Any] | tuple[Any, ...] 
| dict[Any, Any], +) -> torch.Tensor | None: + if isinstance(obj, torch.Tensor): + return obj + + if isinstance(obj, (list, tuple)): + for result in map(get_a_var, obj): + if isinstance(result, torch.Tensor): + return result + if isinstance(obj, dict): + for result in map(get_a_var, obj.items()): + if isinstance(result, torch.Tensor): + return result + return None + + +def parallel_apply( + modules: Sequence[Module], + inputs: Sequence[Any], + kwargs_tup: Sequence[dict[str, Any]] | None = None, + devices: Sequence[int | torch.device | None] | None = None, +) -> list[Any]: + r"""Apply each `module` in :attr:`modules` in parallel on each of :attr:`devices`. + + Args: + modules (Module): modules to be parallelized + inputs (tensor): inputs to the modules + devices (list of int or torch.device): CUDA devices + + :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and + :attr:`devices` (if given) should all have same length. Moreover, each + element of :attr:`inputs` can either be a single object as the only argument + to a module, or a collection of positional arguments. + """ + assert len(modules) == len(inputs), ( + f"The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}" + ) + if kwargs_tup is not None: + assert len(modules) == len(kwargs_tup) + else: + kwargs_tup = (cast(dict[str, Any], {}),) * len(modules) + if devices is not None: + assert len(modules) == len(devices) + else: + devices = [None] * len(modules) + devices = [_get_device_index(x, True) for x in devices] + streams = [torch.accelerator.current_stream(x) for x in devices] + assert torch.accelerator.is_available(), "No available accelerator found." 
+ device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] + lock = threading.Lock() + results = {} + grad_enabled, autocast_enabled = ( + torch.is_grad_enabled(), + torch.is_autocast_enabled(), + ) + + def _worker( + i: int, + module: Module, + input: Any, + kwargs: dict[str, Any], + device: int | torch.device | None = None, + stream: torch.Stream | None = None, + ) -> None: + torch.set_grad_enabled(grad_enabled) + if device is None: + t = get_a_var(input) + if t is None: + with lock: + results[i] = ExceptionWrapper( + where=f"in replica {i}, no device was provided and no tensor input was found; " + "device cannot be resolved" + ) + return + device = t.get_device() + if isinstance(device, torch.device): + device = device.index + if stream is None: + stream = torch.accelerator.current_stream(device) + try: + with ( + torch.accelerator.device_index(device), + stream, + torch.amp.autocast(device_type, enabled=autocast_enabled), + ): + # this also avoids accidental slicing of `input` if it is a Tensor + if not isinstance(input, (list, tuple)): + input = (input,) + output = module(*input, **kwargs) + with lock: + results[i] = output + except Exception: + with lock: + results[i] = ExceptionWrapper( + where=f"in replica {i} on device {device}" + ) + + if len(modules) > 1: + threads = [ + threading.Thread( + target=_worker, args=(i, module, input, kwargs, device, stream) + ) + for i, (module, input, kwargs, device, stream) in enumerate( + zip(modules, inputs, kwargs_tup, devices, streams, strict=True) + ) + ] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + else: + _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0]) + + outputs = [] + for i in range(len(inputs)): + output = results[i] + if isinstance(output, ExceptionWrapper): + output.reraise() + outputs.append(output) + return outputs diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/replicate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/replicate.py new file mode 100644 index 0000000000000000000000000000000000000000..7e7844ab4aba222055f726492df33d2a61aba880 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/replicate.py @@ -0,0 +1,203 @@ +from collections import OrderedDict +from collections.abc import Iterator, Sequence +from typing import cast, TYPE_CHECKING, TypeVar +from typing_extensions import TypeIs + +import torch +from torch._utils import _get_device_index +from torch.nn.modules import Module +from torch.nn.parallel import comm + + +if TYPE_CHECKING: + from torch._C import ScriptMethod + from torch.jit import ScriptModule + from torch.jit._state import EnabledProxy + + +__all__ = ["replicate"] + + +def _is_script_module(module: Module) -> TypeIs["ScriptModule"]: + import torch.jit + + return isinstance(module, torch.jit.ScriptModule) + + +def _is_script_method(module: object) -> TypeIs["ScriptMethod"]: + import torch.jit + + return isinstance(module, torch._C.ScriptMethod) + + +def _init_script_module() -> "ScriptModule": + import torch.jit + + return torch.jit.ScriptModule() + + +def _is_jit_enabled() -> "EnabledProxy": + import torch.jit._state + + return torch.jit._state._enabled + + +# Check if we can safely replicate the module. +# there are two types of module: +# 1. python modules +# 2. 
ScriptModule +# +# currently a module cannot be replicated properly if the descendants of +# any ScriptModule contains python module (type 1 above) +def _replicatable_module(module: Module, memo: set[Module] | None = None) -> bool: + # module.modules() contains module itself as the first element + def descendant_modules(module: Module) -> Iterator[Module]: + gen = module.modules() + next(gen) + return gen + + if not _is_jit_enabled(): + return True + if memo is None: + memo = set() + + # memoize visited modules + memo.add(module) + if _is_script_module(module): + memo.update(descendant_modules(module)) + return all( + _is_script_module(descendant) for descendant in descendant_modules(module) + ) + + for child in module.children(): + # since any unreplicatable module will cause the check to return + # False early, visited modules here can be safely ignored. + if child in memo: + continue + if not _replicatable_module(child, memo): + return False + + return True + + +def _broadcast_coalesced_reshape( + tensors: Sequence[torch.Tensor], + devices: Sequence[int | torch.device], + detach: bool = False, +) -> list[list[torch.Tensor]]: + from torch.nn.parallel._functions import Broadcast + + if detach: + return comm.broadcast_coalesced(tensors, devices) + else: + # Use the autograd function to broadcast if not detach + if len(tensors) > 0: + tensor_copies = Broadcast.apply(devices, *tensors) + return [ + tensor_copies[i : i + len(tensors)] + for i in range(0, len(tensor_copies), len(tensors)) + ] + else: + return [] + + +T = TypeVar("T", bound=Module) + + +def replicate( + network: T, + devices: Sequence[int | torch.device], + detach: bool = False, +) -> list[T]: + if not _replicatable_module(network): + raise RuntimeError( + "Cannot replicate network where python modules are children of ScriptModule" + ) + + if not devices: + return [] + + devices = [_get_device_index(x, True) for x in devices] + num_replicas = len(devices) + + params = list(network.parameters()) + 
param_indices = {param: idx for idx, param in enumerate(params)} + param_copies = _broadcast_coalesced_reshape(params, devices, detach) + + buffers = list(network.buffers()) + buffers_rg: list[torch.Tensor] = [] + buffers_not_rg: list[torch.Tensor] = [] + for buf in buffers: + if buf.requires_grad and not detach: + buffers_rg.append(buf) + else: + buffers_not_rg.append(buf) + + buffer_indices_rg = {buf: idx for idx, buf in enumerate(buffers_rg)} + buffer_indices_not_rg = {buf: idx for idx, buf in enumerate(buffers_not_rg)} + + buffer_copies_rg = _broadcast_coalesced_reshape(buffers_rg, devices, detach=detach) + buffer_copies_not_rg = _broadcast_coalesced_reshape( + buffers_not_rg, devices, detach=True + ) + + modules = list(network.modules()) + module_copies: list[list[Module]] = [[] for _ in devices] + module_indices: dict[Module, int] = {} + + for i, module in enumerate(modules): + module_indices[module] = i + for j in range(num_replicas): + replica = module._replicate_for_data_parallel() + # This is a temporary fix for DDP. DDP needs to access the + # replicated model parameters. It used to do so through + # `mode.parameters()`. The fix added in #33907 for DP stops the + # `parameters()` API from exposing the replicated parameters. + # Hence, we add a `_former_parameters` dict here to support DDP. 
+ replica._former_parameters = OrderedDict() + + module_copies[j].append(replica) + + for i, module in enumerate(modules): + for key, child in module._modules.items(): + if child is None: + for j in range(num_replicas): + replica = module_copies[j][i] + replica._modules[key] = None + else: + module_idx = module_indices[child] + for j in range(num_replicas): + replica = module_copies[j][i] + setattr(replica, key, module_copies[j][module_idx]) + for key, param in module._parameters.items(): + if param is None: + for j in range(num_replicas): + replica = module_copies[j][i] + replica._parameters[key] = None + else: + param_idx = param_indices[param] + for j in range(num_replicas): + replica = module_copies[j][i] + param_copy = param_copies[j][param_idx] + # parameters in replicas are no longer leaves, + # so setattr them as non-parameter attributes + setattr(replica, key, param_copy) + # expose the parameter for DDP + replica._former_parameters[key] = param_copy # type: ignore[operator, index] + for key, buf in module._buffers.items(): # type: ignore[assignment] + if buf is None: + for j in range(num_replicas): + replica = module_copies[j][i] + replica._buffers[key] = None + else: + if buf.requires_grad and not detach: + buffer_copies = buffer_copies_rg + buffer_idx = buffer_indices_rg[buf] + else: + buffer_copies = buffer_copies_not_rg + buffer_idx = buffer_indices_not_rg[buf] + for j in range(num_replicas): + replica = module_copies[j][i] + setattr(replica, key, buffer_copies[j][buffer_idx]) + + return [cast(T, module_copies[j][0]) for j in range(num_replicas)] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/scatter_gather.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/scatter_gather.py new file mode 100644 index 0000000000000000000000000000000000000000..27aeaf19944dcadab63b25d0c9789c31dff322da --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/scatter_gather.py @@ -0,0 +1,154 @@ +# mypy: allow-untyped-defs +from collections.abc import Sequence +from typing import Any, overload, TypeVar +from typing_extensions import deprecated + +import torch +from torch.nn.parallel._functions import Gather, Scatter + + +__all__ = ["scatter", "scatter_kwargs", "gather"] + + +@deprecated( + "`is_namedtuple` is deprecated, please use the python checks instead", + category=FutureWarning, +) +def is_namedtuple(obj: Any) -> bool: + # Check if type was created from collections.namedtuple or a typing.NamedTuple. + return _is_namedtuple(obj) + + +def _is_namedtuple(obj: Any) -> bool: + # Check if type was created from collections.namedtuple or a typing.NamedTuple. + return ( + isinstance(obj, tuple) and hasattr(obj, "_asdict") and hasattr(obj, "_fields") + ) + + +T = TypeVar("T", dict, list, tuple) + + +# For some reason, 'scatter' returns a tuple when given a single Tensor input but a list otherwise. +@overload +def scatter( + inputs: torch.Tensor, + target_gpus: Sequence[int | torch.device], + dim: int = ..., +) -> tuple[torch.Tensor, ...]: ... + + +@overload +def scatter( + inputs: T, + target_gpus: Sequence[int | torch.device], + dim: int = ..., +) -> list[T]: ... + + +def scatter(inputs, target_gpus, dim=0): + r"""Slice tensors into approximately equal chunks and distributes them across given GPUs. + + Duplicates references to objects that are not tensors. 
+ """ + + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + return Scatter.apply(target_gpus, None, dim, obj) + if _is_namedtuple(obj): + # pyrefly: ignore [no-matching-overload] + return [ + # pyrefly: ignore [no-matching-overload] + type(obj)(*args) + # pyrefly: ignore # no-matching-overload + for args in zip(*map(scatter_map, obj), strict=False) + ] + if isinstance(obj, tuple) and len(obj) > 0: + # pyrefly: ignore [no-matching-overload] + return list(zip(*map(scatter_map, obj), strict=False)) + if isinstance(obj, list) and len(obj) > 0: + # pyrefly: ignore [no-matching-overload] + return [list(i) for i in zip(*map(scatter_map, obj), strict=False)] + if isinstance(obj, dict) and len(obj) > 0: + # pyrefly: ignore [no-matching-overload] + return [ + # pyrefly: ignore [no-matching-overload] + type(obj)(i) + # pyrefly: ignore # no-matching-overload + for i in zip(*map(scatter_map, obj.items()), strict=False) + ] + return [obj for _ in target_gpus] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). 
To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + res = scatter_map(inputs) + finally: + scatter_map = None # type: ignore[assignment] + return res + + +def scatter_kwargs( + inputs: tuple[Any, ...], + kwargs: dict[str, Any] | None, + target_gpus: Sequence[int | torch.device], + dim: int = 0, +) -> tuple[tuple[Any, ...], tuple[dict[str, Any], ...]]: + r"""Scatter with support for kwargs dictionary.""" + scattered_inputs = scatter(inputs, target_gpus, dim) if inputs else [] + scattered_kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] + if len(scattered_inputs) < len(scattered_kwargs): + scattered_inputs.extend( + () for _ in range(len(scattered_kwargs) - len(scattered_inputs)) + ) + elif len(scattered_kwargs) < len(inputs): + scattered_kwargs.extend( + {} for _ in range(len(scattered_inputs) - len(scattered_kwargs)) + ) + return tuple(scattered_inputs), tuple(scattered_kwargs) + + +def gather(outputs: Any, target_device: int | torch.device, dim: int = 0) -> Any: + r"""Gather tensors from different GPUs on a specified device. + + This function is useful for gathering the results of a distributed computation. + It takes a sequence of objects, one for each GPU, and returns a single object + on the specified device. + + Args: + outputs (Any): A sequence of objects (potentially tensors) to gather. + target_device (Union[int, torch.device]): The device to gather the tensors to. + Use 'cpu' for CPU to avoid a deprecation warning. + dim (int, optional): The dimension along which to gather. Default: 0. + + Returns: + Any: A gathered object (potentially tensor) on the specified device. 
+ """ + + def gather_map(outputs): + out = outputs[0] + if isinstance(out, torch.Tensor): + return Gather.apply(target_device, dim, *outputs) + if out is None: + return None + if isinstance(out, dict): + if not all(len(out) == len(d) for d in outputs): + raise ValueError("All dicts must have the same number of keys") + # pyrefly: ignore [not-callable] + return type(out)((k, gather_map([d[k] for d in outputs])) for k in out) + if _is_namedtuple(out): + # pyrefly: ignore [no-matching-overload] + return type(out)._make(map(gather_map, zip(*outputs, strict=True))) + # pyrefly: ignore [no-matching-overload] + return type(out)(map(gather_map, zip(*outputs, strict=True))) + + # Recursive function calls like this create reference cycles. + # Setting the function to None clears the refcycle. + try: + res = gather_map(outputs) + finally: + gather_map = None # type: ignore[assignment] + return res diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantizable/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantizable/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7628c5c15992efa600ea5520aed955ba42c6146 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantizable/__init__.py @@ -0,0 +1 @@ +from torch.nn.quantizable.modules import * # noqa: F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e2bbbc13202db1cbddaad4b05241a62190adc46 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/__init__.py @@ -0,0 +1,39 @@ +from torch.nn.quantized import dynamic, functional, modules # noqa: F403 +from torch.nn.quantized.modules import * # noqa: F403 +from torch.nn.quantized.modules import 
MaxPool2d + + +__all__ = [ + "BatchNorm2d", + "BatchNorm3d", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "DeQuantize", + "Dropout", + "ELU", + "Embedding", + "EmbeddingBag", + "GroupNorm", + "Hardswish", + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", + "LayerNorm", + "LeakyReLU", + "Linear", + "LSTM", + "MultiheadAttention", + "PReLU", + "Quantize", + "ReLU6", + "Sigmoid", + "Softmax", + # Wrapper modules + "FloatFunctional", + "FXFloatFunctional", + "QFunctional", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/functional.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..d763e171fdb432c8ba2059cc2332e7ac6424854a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/functional.py @@ -0,0 +1,10 @@ +r"""nn.quantized.functional. + +Quantized equivalents of the `nn.functional`. + +Note:: + This location is in the process of being deprecated. + Please, use the `torch.ao.nn.quantized.functional` instead. +""" + +from torch.ao.nn.quantized.functional import * # noqa: F401,F403 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e9253264d1e0eaf7fef1ee4ada06d2bf0be5cda7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/__init__.py @@ -0,0 +1,48 @@ +from . 
import parametrizations, parametrize, rnn, stateless +from .clip_grad import ( # pyrefly: ignore # deprecated; pyrefly: ignore [deprecated] + _clip_grads_with_norm_ as clip_grads_with_norm_, + _get_total_norm as get_total_norm, + clip_grad_norm, + clip_grad_norm_, + clip_grad_value_, +) +from .convert_parameters import parameters_to_vector, vector_to_parameters +from .fusion import ( + fuse_conv_bn_eval, + fuse_conv_bn_weights, + fuse_linear_bn_eval, + fuse_linear_bn_weights, +) +from .init import skip_init +from .memory_format import ( + convert_conv2d_weight_memory_format, + convert_conv3d_weight_memory_format, +) +from .spectral_norm import remove_spectral_norm, spectral_norm +from .weight_norm import remove_weight_norm, weight_norm + + +__all__ = [ + "clip_grad_norm", + "clip_grad_norm_", + "clip_grads_with_norm_", + "clip_grad_value_", + "convert_conv2d_weight_memory_format", + "convert_conv3d_weight_memory_format", + "fuse_conv_bn_eval", + "fuse_conv_bn_weights", + "fuse_linear_bn_eval", + "fuse_linear_bn_weights", + "get_total_norm", + "parameters_to_vector", + "parametrizations", + "parametrize", + "remove_spectral_norm", + "remove_weight_norm", + "rnn", + "skip_init", + "spectral_norm", + "stateless", + "vector_to_parameters", + "weight_norm", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_deprecation_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_deprecation_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a25b647307900e42b11d1cdafc8d9f8785d1a620 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_deprecation_utils.py @@ -0,0 +1,53 @@ +import importlib +import warnings +from collections.abc import Callable + + +_MESSAGE_TEMPLATE = ( + r"Usage of '{old_location}' is deprecated; please use '{new_location}' instead." 
+) + + +def lazy_deprecated_import( + all: list[str], + old_module: str, + new_module: str, +) -> Callable: + r"""Import utility to lazily import deprecated packages / modules / functional. + + The old_module and new_module are also used in the deprecation warning defined + by the `_MESSAGE_TEMPLATE`. + + Args: + all: The list of the functions that are imported. Generally, the module's + __all__ list of the module. + old_module: Old module location + new_module: New module location / Migrated location + + Returns: + Callable to assign to the `__getattr__` + + Usage: + + # In the `torch/nn/quantized/functional.py` + from torch.nn.utils._deprecation_utils import lazy_deprecated_import + _MIGRATED_TO = "torch.ao.nn.quantized.functional" + __getattr__ = lazy_deprecated_import( + all=__all__, + old_module=__name__, + new_module=_MIGRATED_TO) + """ + warning_message = _MESSAGE_TEMPLATE.format( + old_location=old_module, new_location=new_module + ) + + def getattr_dunder(name: str) -> None: + if name in all: + # We are using the "RuntimeWarning" to make sure it is not + # ignored by default. + warnings.warn(warning_message, RuntimeWarning, stacklevel=2) + package = importlib.import_module(new_module) + return getattr(package, name) + raise AttributeError(f"Module {new_module!r} has no attribute {name!r}.") + + return getattr_dunder diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_named_member_accessor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_named_member_accessor.py new file mode 100644 index 0000000000000000000000000000000000000000..0935490856aebf3503aa126e51d342c3bac0b529 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_named_member_accessor.py @@ -0,0 +1,373 @@ +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from collections.abc import Iterable + +import torch + + +_MISSING: torch.Tensor = object() # type: ignore[assignment] + + +def set_tensor(module: "torch.nn.Module", name: str, tensor: torch.Tensor) -> None: + if not isinstance(module, torch.nn.Module): + raise TypeError(f"{module} is not an instance of torch.nn.Module") + if not isinstance(tensor, torch.Tensor) and tensor is not None: + raise TypeError(f"{tensor} is not an instance of torch.Tensor") + if "." in name: + raise KeyError('tensor name can\'t contain "."') + if name == "": + raise KeyError('tensor name can\'t be empty string ""') + if name in module._parameters: + module._parameters[name] = tensor # type: ignore[assignment] + elif name in module._buffers: + module._buffers[name] = tensor + else: + setattr(module, name, tensor) + + +def swap_tensor( + module: "torch.nn.Module", + name: str, + tensor: torch.Tensor, + allow_missing: bool = False, +) -> torch.Tensor: + if not isinstance(module, torch.nn.Module): + raise TypeError(f"{module} is not an instance of torch.nn.Module") + if ( + tensor is not _MISSING + and not isinstance(tensor, torch.Tensor) + and tensor is not None + ): + raise TypeError(f"{tensor} is not an instance of torch.Tensor") + if "." 
in name: + raise KeyError('tensor name can\'t contain "."') + if name == "": + raise KeyError('tensor name can\'t be empty string ""') + + orig_tensor: torch.Tensor + if name in module._parameters: + orig_tensor = module._parameters[name] # type: ignore[assignment] + if tensor is not _MISSING: + module._parameters[name] = tensor # type: ignore[assignment] + else: + del module._parameters[name] + elif name in module._buffers: + orig_tensor = module._buffers[name] # type: ignore[assignment] + if tensor is not _MISSING: + module._buffers[name] = tensor + else: + del module._buffers[name] + else: + if hasattr(module, name): + orig_tensor = getattr(module, name) + else: + if not allow_missing: + raise AttributeError(f"{module._get_name()} has no attribute `{name}`") + orig_tensor = _MISSING + if ( + orig_tensor is not _MISSING + and not isinstance(orig_tensor, torch.Tensor) + and orig_tensor is not None + ): + raise TypeError( + f"attribute `{name}`: {orig_tensor} is not an instance of torch.Tensor" + ) + if tensor is not _MISSING: + setattr(module, name, tensor) + elif hasattr(module, name): + delattr(module, name) + # pyrefly: ignore [bad-return] + return orig_tensor + + +def swap_submodule( + module: "torch.nn.Module", + name: str, + submodule: "torch.nn.Module", +) -> "torch.nn.Module": + if not isinstance(module, torch.nn.Module): + raise TypeError(f"{module} is not an instance of torch.nn.Module") + if not isinstance(submodule, torch.nn.Module): + raise TypeError(f"{submodule} is not an instance of torch.nn.Module") + if "." 
in name: + raise KeyError('submodule name can\'t contain "."') + if name == "": + raise KeyError('submodule name can\'t be empty string ""') + if name not in module._modules: + raise KeyError(f"submodule {name} does not exist") + + orig_submodule = module._modules[name] + if not isinstance(orig_submodule, torch.nn.Module): + raise TypeError(f"{name} attribute is not an instance of torch.nn.Module") + module._modules[name] = submodule + return orig_submodule + + +class NamedMemberAccessor: + """ + A class that provides a way to access the submodules and parameters/buffers of a module. + + It provides caching mechanism to speed up submodule lookups. + This is useful for functional programming to manipulate the module state. + """ + + def __init__(self, module: "torch.nn.Module") -> None: + self.module = module + self.memo: dict[str, torch.nn.Module] = {} + + # Nested attribute access + + def get_submodule(self, name: str) -> "torch.nn.Module": + """ + Return the submodule specified by the given path. + + For example, to get the submodule mod.layer1.conv1, + use accessor.get_submodule("layer1.conv1") + + Compare to mod.get_submodule("layer1.conv1"), this method will cache the + intermediate submodule access to speed up future lookups. + """ + if not name: + return self.module + + if name in self.memo: + return self.memo[name] + else: + prefix, dot, attr = name.rpartition(".") + if dot: + module = self.get_submodule(prefix) + else: + module = self.module + try: + submodule = getattr(module, attr) + except AttributeError as ex: + raise AttributeError( + f"{module._get_name()} has no attribute `{attr}`" + ) from ex + if not isinstance(submodule, torch.nn.Module): + raise TypeError( + f"submodule `{name}`: {submodule} is not an instance of torch.nn.Module" + ) + self.memo[name] = submodule + return submodule + + def swap_submodule(self, path: str, value: "torch.nn.Module") -> "torch.nn.Module": + """ + Swap the submodule specified by the given ``path`` to ``value``. 
+ + For example, to swap the attribute mod.layer1.conv1 use + ``accessor.swap_submodule("layer1.conv1", conv2)``. + """ + prefix, _, attr = path.rpartition(".") + return swap_submodule(self.get_submodule(prefix), attr, value) + + def get_tensor(self, name: str) -> torch.Tensor: + """ + Get the tensor specified by the given path to value. + + For example, to get the attribute mod.layer1.conv1.weight, + use accessor.get_tensor('layer1.conv1.weight') + + Compare to mod.get_parameter("layer1.conv1.weight"), this method will + cache the intermediate submodule access to speed up future lookups. + """ + prefix, _, attr = name.rpartition(".") + submodule = self.get_submodule(prefix) + try: + tensor = getattr(submodule, attr) + except AttributeError as ex: + raise AttributeError( + f"{submodule._get_name()} has no attribute `{name}`" + ) from ex + if not isinstance(tensor, torch.Tensor) and tensor is not None: + raise TypeError(f"{tensor} is not an instance of torch.Tensor") + return tensor # type: ignore[return-value] + + def set_tensor(self, name: str, value: torch.Tensor) -> None: + """ + Set the attribute specified by the given path to value. + + For example, to set the attribute mod.layer1.conv1.weight, + use accessor.set_tensor("layer1.conv1.weight", value) + """ + prefix, _, attr = name.rpartition(".") + set_tensor(self.get_submodule(prefix), attr, value) + + def del_tensor(self, name: str) -> None: + """ + Delete the attribute specified by the given path. 
+ + For example, to delete the attribute mod.layer1.conv1.weight, + use accessor.del_tensor("layer1.conv1.weight") + """ + prefix, _, attr = name.rpartition(".") + submodule = self.get_submodule(prefix) + try: + delattr(submodule, attr) + except AttributeError as ex: + raise AttributeError( + f"{submodule._get_name()} has no attribute `{name}`" + ) from ex + + def swap_tensor( + self, name: str, value: torch.Tensor, allow_missing: bool = False + ) -> torch.Tensor: + """ + Swap the attribute specified by the given path to value. + + For example, to swap the attribute mod.layer1.conv1.weight, + use accessor.swap_tensor("layer1.conv1.weight", value) + """ + prefix, _, attr = name.rpartition(".") + return swap_tensor( + self.get_submodule(prefix), attr, value, allow_missing=allow_missing + ) + + # Batched operations + + def get_tensors(self, names: Iterable[str]) -> list[torch.Tensor]: + """ + Get the tensors specified by the given paths. + + For example, to get the attributes mod.layer1.conv1.weight and + mod.layer1.conv1.bias, use accessor.get_tensors(["layer1.conv1.weight", + "layer1.conv1.bias"]) + """ + return [self.get_tensor(name) for name in names] + + def set_tensors(self, names: Iterable[str], values: Iterable[torch.Tensor]) -> None: + """ + Set the attributes specified by the given paths to values. + + For example, to set the attributes mod.layer1.conv1.weight and + mod.layer1.conv1.bias, use accessor.set_tensors(["layer1.conv1.weight", + "layer1.conv1.bias"], [weight, bias]) + """ + if not isinstance(names, (list, tuple)): + names = list(names) + if not isinstance(values, (list, tuple)): + values = list(values) + assert len(names) == len(values), "names and values must have the same length" + + for name, value in zip(names, values, strict=True): + self.set_tensor(name, value) + + def set_tensors_dict(self, named_tensors: dict[str, torch.Tensor]) -> None: + """ + Set the attributes specified by the given paths to values. 
+ + For example, to set the attributes mod.layer1.conv1.weight and + mod.layer1.conv1.bias, use accessor.set_tensors_dict({ + "layer1.conv1.weight": weight, + "layer1.conv1.bias": bias, + }) + """ + for name, value in named_tensors.items(): + self.set_tensor(name, value) + + def del_tensors(self, names: Iterable[str]) -> None: + """ + Delete the attributes specified by the given paths. + + For example, to delete the attributes mod.layer1.conv1.weight and + mod.layer1.conv1.bias, use accessor.del_tensors(["layer1.conv1.weight", + "layer1.conv1.bias"]) + """ + for name in names: + self.del_tensor(name) + + def swap_tensors( + self, + names: Iterable[str], + values: Iterable[torch.Tensor], + allow_missing: bool = False, + ) -> list[torch.Tensor]: + """ + Swap the attributes specified by the given paths to values. + + For example, to swap the attributes mod.layer1.conv1.weight and + mod.layer1.conv1.bias, use accessor.swap_tensors(["layer1.conv1.weight", + "layer1.conv1.bias"], [weight, bias]) + """ + if not isinstance(names, (list, tuple)): + names = list(names) + if not isinstance(values, (list, tuple)): + values = list(values) + assert len(names) == len(values), "names and values must have the same length" + + return [ + self.swap_tensor(name, value, allow_missing=allow_missing) + for name, value in zip(names, values, strict=True) + ] + + def swap_tensors_dict( + self, named_tensors: dict[str, torch.Tensor], allow_missing: bool = False + ) -> tuple[dict[str, torch.Tensor], list[str]]: + """ + Swap the attributes specified by the given paths to values. 
+ + For example, to swap the attributes mod.layer1.conv1.weight and + mod.layer1.conv1.bias, use accessor.swap_tensors_dict({ + "layer1.conv1.weight": weight, + "layer1.conv1.bias": bias, + }) + """ + orig_named_tensors = {} + missing_keys = [] + try: + for name, tensor in named_tensors.items(): + orig_tensor = self.swap_tensor(name, tensor, allow_missing=True) + if orig_tensor is _MISSING: + missing_keys.append(name) + orig_named_tensors[name] = orig_tensor + except Exception: + # Swap back if any exception occurs + for name, orig_tensor in orig_named_tensors.items(): + self.swap_tensor(name, orig_tensor, allow_missing=True) + raise + if missing_keys and not allow_missing: + # Swap back if any key is missing when allow_missing is False + for name, orig_tensor in orig_named_tensors.items(): + self.swap_tensor(name, orig_tensor, allow_missing=True) + raise RuntimeError(f"Missing key(s): {', '.join(map(repr, missing_keys))}.") + return orig_named_tensors, missing_keys + + def check_keys(self, keys: Iterable[str]) -> tuple[list[str], list[str]]: + """Check that the given keys are valid.""" + keys = set(keys) + valid_keys = {name for name, _ in self.named_tensors(remove_duplicate=False)} + missing_keys = valid_keys - keys + unexpected_keys = keys - valid_keys + return sorted(missing_keys), sorted(unexpected_keys) + + # Shortcut methods + + def named_parameters( + self, + remove_duplicate: bool = True, + ) -> Iterable[tuple[str, torch.Tensor]]: + """Iterate over all the parameters in the module.""" + yield from self.module.named_parameters(remove_duplicate=remove_duplicate) + + def named_buffers( + self, + remove_duplicate: bool = True, + ) -> Iterable[tuple[str, torch.Tensor]]: + """Iterate over all the buffers in the module.""" + yield from self.module.named_buffers(remove_duplicate=remove_duplicate) + + def named_tensors( + self, + remove_duplicate: bool = True, + ) -> Iterable[tuple[str, torch.Tensor]]: + """Iterate over all the tensors in the module.""" + yield 
from self.module.named_parameters(remove_duplicate=remove_duplicate) + yield from self.module.named_buffers(remove_duplicate=remove_duplicate) + + def named_modules( + self, + remove_duplicate: bool = True, + ) -> Iterable[tuple[str, "torch.nn.Module"]]: + """Iterate over all the modules in the module.""" + yield from self.module.named_modules(remove_duplicate=remove_duplicate) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_per_sample_grad.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_per_sample_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..2eae0865845eec9c426c5cc3b7bff1b11b5b1230 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_per_sample_grad.py @@ -0,0 +1,126 @@ +# mypy: allow-untyped-defs +import functools + +import torch +from torch.nn.utils._expanded_weights.expanded_weights_impl import ExpandedWeight +from torch.utils import _pytree as pytree + + +# dependency on `functional_call` means that this can't be exposed in utils +# without creating circular dependency +def call_for_per_sample_grads( + module, + *, + batch_size=None, + loss_reduction="sum", + batch_first=True, +): + r""" + Return a forward function for a module, populating grad_sample with per sample gradients on backward invocation. + + Args: + module: The ``nn.Module`` to get per sample gradients with respect to. All trainable + parameters will compute per sample gradients, located in a ``grad_sample`` + field when ``backward`` is invoked + batch_size: The batch size of the input. If None is passed, all tensor arguments in args and kwargs must have + the same batch size, which is the size of the first dimension. Otherwise, it must be passed manually. + Default: None + loss_reduction: Indicates if the loss reduction (for aggregating the gradients) is a sum or a mean operation. 
If + "mean", per sample gradients will be scaled by the batch size to offset the crossbatch interaction from + running mean across a batch. Must be "mean" or "sum". Default: "sum" + batch_first: Indicates if the batch dimension is the first dimension. If True, the batch dimension is the first + dimension. If False, it's the second dimension. Default: True. + + Examples:: + >>> # xdoctest: +SKIP + >>> model = nn.Linear(4, 3) + >>> batched_input = torch.randn(5, 4) # batch size of 5 + >>> res = call_for_per_sample_grads(model)(batched_input).sum() + >>> res.backward() + >>> assert model.weight.shape == (3, 4) + >>> assert model.weight.grad_sample.shape == (5, 3, 4) + >>> assert model.weight.grad is None + >>> assert model.bias.shape == (3,) + >>> assert model.bias.grad_sample.shape == (5, 3) + >>> assert model.bias.grad is None + + An example using "mean" loss reduction. The grad_sample fields will be scaled by batch_size from what they would be + if we ran the same code with loss_reduction="sum". This is because the mean at the end will scale all + grad_outputs by 1 / batch_size from cross batch interaction. + >>> model = nn.Linear(4, 3) + >>> batched_input = torch.randn(5, 4) # batch size of 5 + >>> res = call_for_per_sample_grads(model, 5, loss_reduction="mean")( + ... batched_input + ... ).mean() + >>> res.backward() + + Note:: + Does not work with any `nn.RNN`, including `nn.GRU` or `nn.LSTM`. Please use custom + rewrites that wrap an `nn.Linear` module. 
See Opacus for an example + """ + + def maybe_build_expanded_weight(og_tensor, batch_size): + if og_tensor.requires_grad: + return ExpandedWeight(og_tensor, batch_size, loss_reduction) + else: + return og_tensor + + def compute_batch_size(*args, **kwargs): + args_and_kwargs = pytree.arg_tree_leaves(*args, **kwargs) + batch_size = None + for arg in args_and_kwargs: + if not isinstance(arg, torch.Tensor): + continue + + arg_batch_size = arg.shape[0] if batch_first else arg.shape[1] + if batch_size is not None and batch_size != arg_batch_size: + raise RuntimeError( + "When computing batch size, found at least one input with batch size " + f"{batch_size} and one with batch size {arg_batch_size}. Please specify it " + "explicitly using the batch size kwarg in call_for_per_sample_grads" + ) + batch_size = arg_batch_size + if batch_size is None: + raise RuntimeError( + "Unable to find a tensor in the passed args and kwargs. They may not be pytree-able " + "and so ExpandedWeights cannot compute the batch size from the inputs. Please specify " + "it explicitly" + ) + return batch_size + + if loss_reduction not in ["sum", "mean"]: + raise RuntimeError( + f"Expected loss_reduction argument to be sum or mean, got {loss_reduction}" + ) + + if not isinstance(module, torch.nn.Module): + raise RuntimeError( + f"Module passed must be nn.Module, got {type(module).__name__}" + ) + if not (batch_size is None or isinstance(batch_size, int)): + raise RuntimeError( + f"Batch size passed must be None or an integer, got {type(batch_size).__name__}" + ) + if batch_size is not None and batch_size < 1: + raise RuntimeError(f"Batch size must be positive, got {batch_size}") + for weight in module.parameters(): + if hasattr(weight, "grad_sample") and weight.grad_sample is not None: # type: ignore[attr-defined] + raise RuntimeError( + "Current Expanded Weights accumulates the gradients, which will be incorrect for multiple " + f"calls without clearing gradients. 
Please clear out the grad_sample parameter of {weight} or " + "post an issue to pytorch/pytorch to prioritize correct behavior" + ) + + @functools.wraps(module.forward) + def wrapper(*args, **kwargs): + wrapper_batch_size = batch_size + if wrapper_batch_size is None: + wrapper_batch_size = compute_batch_size(*args, **kwargs) + + params = { + name: maybe_build_expanded_weight(value, wrapper_batch_size) + for (name, value) in module.named_parameters() + } + return torch.func.functional_call(module, params, args, kwargs) + + return wrapper diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/clip_grad.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/clip_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..30202708bfa38bb8437627152fb76061955e31f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/clip_grad.py @@ -0,0 +1,299 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +import functools +import types +import typing +import warnings +from collections.abc import Callable +from typing import cast, TypeAlias, TypeVar +from typing_extensions import deprecated, ParamSpec + +import torch +from torch import Tensor +from torch.utils._foreach_utils import ( + _device_has_foreach_support, + _group_tensors_by_device_and_dtype, + _has_foreach_support, +) + + +__all__: list[str] = [ + "clip_grad_norm", + "clip_grad_norm_", + "clip_grad_value_", +] + + +_tensor_or_tensors: TypeAlias = torch.Tensor | typing.Iterable[torch.Tensor] # noqa: PYI042 + +_P = ParamSpec("_P") +_R = TypeVar("_R") + + +def _no_grad(func: Callable[_P, _R]) -> Callable[_P, _R]: + """ + This wrapper is needed to avoid a circular import when using @torch.no_grad on the exposed functions + clip_grad_norm_ and clip_grad_value_ themselves. 
+ """ + + def _no_grad_wrapper(*args, **kwargs): + with torch.no_grad(): + # pyrefly: ignore [invalid-param-spec] + return func(*args, **kwargs) + + functools.update_wrapper(_no_grad_wrapper, func) + # pyrefly: ignore [bad-return] + return _no_grad_wrapper + + +@_no_grad +def _get_total_norm( + tensors: _tensor_or_tensors, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, + foreach: bool | None = None, +) -> torch.Tensor: + r"""Compute the norm of an iterable of tensors. + + The norm is computed over the norms of the individual tensors, as if the norms of + the individual tensors were concatenated into a single vector. + + Args: + tensors (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will be normalized + norm_type (float): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + error_if_nonfinite (bool): if True, an error is thrown if the total + norm of :attr:`tensors` is ``nan``, ``inf``, or ``-inf``. + Default: ``False`` + foreach (bool): use the faster foreach-based implementation. + If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently + fall back to the slow implementation for other device types. + Default: ``None`` + + Returns: + Total norm of the tensors (viewed as a single vector). 
+ """ + if isinstance(tensors, torch.Tensor): + tensors = [tensors] + else: + tensors = list(tensors) + norm_type = float(norm_type) + if len(tensors) == 0: + return torch.tensor(0.0) + first_device = tensors[0].device + grouped_tensors: dict[ + tuple[torch.device, torch.dtype], tuple[list[list[Tensor]], list[int]] + ] = _group_tensors_by_device_and_dtype( + [tensors] # type: ignore[list-item] + ) # type: ignore[assignment] + + norms: list[Tensor] = [] + for (device, _), ([device_tensors], _) in grouped_tensors.items(): + if (foreach is None and _has_foreach_support(device_tensors, device)) or ( + foreach and _device_has_foreach_support(device) + ): + norms.extend(torch._foreach_norm(device_tensors, norm_type)) + elif foreach: + raise RuntimeError( + f"foreach=True was passed, but can't use the foreach API on {device.type} tensors" + ) + else: + norms.extend( + [torch.linalg.vector_norm(g, norm_type) for g in device_tensors] + ) + + total_norm = torch.linalg.vector_norm( + torch.stack([norm.to(first_device) for norm in norms]), norm_type + ) + + if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()): + raise RuntimeError( + f"The total norm of order {norm_type} for gradients from " + "`parameters` is non-finite, so it cannot be clipped. To disable " + "this error and scale the gradients by the non-finite norm anyway, " + "set `error_if_nonfinite=False`" + ) + return total_norm + + +@_no_grad +def _clip_grads_with_norm_( + parameters: _tensor_or_tensors, + max_norm: float, + total_norm: torch.Tensor, + foreach: bool | None = None, +) -> None: + r"""Scale the gradients of an iterable of parameters given a pre-calculated total norm and desired max norm. + + The gradients will be scaled by the following calculation + + .. math:: + grad = grad * \min(\frac{max\_norm}{total\_norm + 1e-6}, 1) + + Gradients are modified in-place. + + Note: The scale coefficient is clamped to a maximum of 1.0 to prevent gradient amplification. 
+ This ensures that gradients are only scaled down when the total norm exceeds max_norm. + + This function is equivalent to :func:`torch.nn.utils.clip_grad_norm_` with a pre-calculated + total norm. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float): max norm of the gradients + total_norm (Tensor): total norm of the gradients to use for clipping + foreach (bool): use the faster foreach-based implementation. + If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently + fall back to the slow implementation for other device types. + Default: ``None`` + + Returns: + None + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + grads = [p.grad for p in parameters if p.grad is not None] + max_norm = float(max_norm) + if len(grads) == 0: + return + grouped_grads: dict[ + tuple[torch.device, torch.dtype], tuple[list[list[Tensor]], list[int]] + ] = _group_tensors_by_device_and_dtype([grads]) # type: ignore[assignment] + + clip_coef = max_norm / (total_norm + 1e-6) + # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so + # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization + # when the gradients do not reside in CPU memory. 
+ clip_coef_clamped = torch.clamp(clip_coef, max=1.0) + for (device, _), ([device_grads], _) in grouped_grads.items(): + if (foreach is None and _has_foreach_support(device_grads, device)) or ( + foreach and _device_has_foreach_support(device) + ): + torch._foreach_mul_(device_grads, clip_coef_clamped.to(device)) + elif foreach: + raise RuntimeError( + f"foreach=True was passed, but can't use the foreach API on {device.type} tensors" + ) + else: + clip_coef_clamped_device = clip_coef_clamped.to(device) + for g in device_grads: + g.mul_(clip_coef_clamped_device) + + +@_no_grad +def clip_grad_norm_( + parameters: _tensor_or_tensors, + max_norm: float, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, + foreach: bool | None = None, +) -> torch.Tensor: + r"""Clip the gradient norm of an iterable of parameters. + + The norm is computed over the norms of the individual gradients of all parameters, + as if the norms of the individual gradients were concatenated into a single vector. + Gradients are modified in-place. + + This function is equivalent to :func:`torch.nn.utils.get_total_norm` followed by + :func:`torch.nn.utils.clip_grads_with_norm_` with the ``total_norm`` returned by ``get_total_norm``. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float): max norm of the gradients + norm_type (float, optional): type of the used p-norm. Can be ``'inf'`` for + infinity norm. Default: 2.0 + error_if_nonfinite (bool, optional): if True, an error is thrown if the total + norm of the gradients from :attr:`parameters` is ``nan``, + ``inf``, or ``-inf``. Default: False + foreach (bool, optional): use the faster foreach-based implementation. + If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently + fall back to the slow implementation for other device types. 
+ Default: ``None`` + + Returns: + Total norm of the parameter gradients (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + else: + is_generator = isinstance(parameters, types.GeneratorType) + # prevent generators from being exhausted + parameters = list(parameters) + if is_generator and len(parameters) == 0: + warnings.warn( + "`parameters` is an empty generator, no gradient clipping will occur.", + stacklevel=3, + ) + grads = [p.grad for p in parameters if p.grad is not None] + total_norm = _get_total_norm(grads, norm_type, error_if_nonfinite, foreach) + _clip_grads_with_norm_(parameters, max_norm, total_norm, foreach) + return total_norm + + +@deprecated( + "`torch.nn.utils.clip_grad_norm` is now deprecated " + "in favor of `torch.nn.utils.clip_grad_norm_`.", + category=FutureWarning, +) +def clip_grad_norm( + parameters: _tensor_or_tensors, + max_norm: float, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, + foreach: bool | None = None, +) -> torch.Tensor: + r"""Clip the gradient norm of an iterable of parameters. + + .. warning:: + This method is now deprecated in favor of + :func:`torch.nn.utils.clip_grad_norm_`. + """ + return clip_grad_norm_(parameters, max_norm, norm_type, error_if_nonfinite, foreach) + + +@_no_grad +def clip_grad_value_( + parameters: _tensor_or_tensors, + clip_value: float, + foreach: bool | None = None, +) -> None: + r"""Clip the gradients of an iterable of parameters at specified value. + + Gradients are modified in-place. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + clip_value (float): maximum allowed value of the gradients. 
+ The gradients are clipped in the range + :math:`\left[\text{-clip\_value}, \text{clip\_value}\right]` + foreach (bool, optional): use the faster foreach-based implementation + If ``None``, use the foreach implementation for CUDA and CPU native tensors and + silently fall back to the slow implementation for other device types. + Default: ``None`` + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + clip_value = float(clip_value) + + grads = [p.grad for p in parameters if p.grad is not None] + # pyrefly: ignore [bad-argument-type] + grouped_grads = _group_tensors_by_device_and_dtype([grads]) + + for (device, _), ([grads], _) in grouped_grads.items(): + if ( + foreach is None + and _has_foreach_support(cast(list[Tensor], grads), device=device) + ) or (foreach and _device_has_foreach_support(device)): + torch._foreach_clamp_min_(cast(list[Tensor], grads), -clip_value) + torch._foreach_clamp_max_(cast(list[Tensor], grads), clip_value) + elif foreach: + raise RuntimeError( + f"foreach=True was passed, but can't use the foreach API on {device.type} tensors" + ) + else: + for grad in grads: + cast(Tensor, grad).clamp_(min=-clip_value, max=clip_value) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/convert_parameters.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/convert_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..6a56da711ecda3c6e3d5770783f100a8890bbf55 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/convert_parameters.py @@ -0,0 +1,90 @@ +from collections.abc import Iterable + +import torch + + +def parameters_to_vector(parameters: Iterable[torch.Tensor]) -> torch.Tensor: + r"""Flatten an iterable of parameters into a single vector. + + Args: + parameters (Iterable[Tensor]): an iterable of Tensors that are the + parameters of a model. 
+ + Returns: + The parameters represented by a single vector + """ + # Flag for the device where the parameter is located + param_device = None + + vec = [] + for param in parameters: + # Ensure the parameters are located in the same device + param_device = _check_param_device(param, param_device) + + vec.append(param.view(-1)) + return torch.cat(vec) + + +def vector_to_parameters(vec: torch.Tensor, parameters: Iterable[torch.Tensor]) -> None: + r"""Copy slices of a vector into an iterable of parameters. + + Args: + vec (Tensor): a single vector representing the parameters of a model. + parameters (Iterable[Tensor]): an iterable of Tensors that are the + parameters of a model. + """ + # Ensure vec of type Tensor + if not isinstance(vec, torch.Tensor): + raise TypeError(f"expected torch.Tensor, but got: {torch.typename(vec)}") + # Flag for the device where the parameter is located + param_device = None + + # Pointer for slicing the vector for each parameter + pointer = 0 + for param in parameters: + # Ensure the parameters are located in the same device + param_device = _check_param_device(param, param_device) + + # The length of the parameter + num_param = param.numel() + # Slice the vector, reshape it, and replace the old data of the parameter + param.data = vec[pointer : pointer + num_param].view_as(param).data + + # Increment the pointer + pointer += num_param + + +def _check_param_device(param: torch.Tensor, old_param_device: int | None) -> int: + r"""Check if the parameters are located on the same device. + + Currently, the conversion between model parameters and single vector form is not supported + for multiple allocations, e.g. parameters in different GPUs/PrivateUse1s, or mixture of CPU/GPU/PrivateUse1. + + Args: + param ([Tensor]): a Tensor of a parameter of a model + old_param_device (int): the device where the first parameter of a + model is allocated. 
+ + Returns: + old_param_device (int): report device for the first time + """ + # Meet the first parameter + support_device_types = ["cuda", torch._C._get_privateuse1_backend_name()] + if old_param_device is None: + old_param_device = ( + param.get_device() if param.device.type in support_device_types else -1 + ) + else: + warn = False + if ( + param.device.type in support_device_types + ): # Check if in same GPU/PrivateUse1 + warn = param.get_device() != old_param_device + else: # Check if in CPU + warn = old_param_device != -1 + if warn: + raise TypeError( + "Found two parameters on different devices, " + "this is currently not supported." + ) + return old_param_device diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/fusion.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..35406785305117f979479bc2baec0f65d6fdb7af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/fusion.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import copy +from typing import TypeVar + +import torch + + +__all__ = [ + "fuse_conv_bn_eval", + "fuse_conv_bn_weights", + "fuse_linear_bn_eval", + "fuse_linear_bn_weights", +] + +ConvT = TypeVar("ConvT", bound="torch.nn.modules.conv._ConvNd") +LinearT = TypeVar("LinearT", bound="torch.nn.Linear") + + +def fuse_conv_bn_eval( + conv: ConvT, + bn: torch.nn.modules.batchnorm._BatchNorm, + transpose: bool = False, +) -> ConvT: + r"""Fuse a convolutional module and a BatchNorm module into a single, new convolutional module. + + Args: + conv (torch.nn.modules.conv._ConvNd): A convolutional module. + bn (torch.nn.modules.batchnorm._BatchNorm): A BatchNorm module. + transpose (bool, optional): If True, transpose the convolutional weight. Defaults to False. + + Returns: + torch.nn.modules.conv._ConvNd: The fused convolutional module. + + .. 
note:: + Both ``conv`` and ``bn`` must be in eval mode, and ``bn`` must have its running buffers computed. + """ + assert not (conv.training or bn.training), "Fusion only for eval!" + fused_conv = copy.deepcopy(conv) + + assert bn.running_mean is not None and bn.running_var is not None + fused_conv.weight, fused_conv.bias = fuse_conv_bn_weights( + fused_conv.weight, + fused_conv.bias, + bn.running_mean, + bn.running_var, + bn.eps, + bn.weight, + bn.bias, + transpose, + ) + + return fused_conv + + +def fuse_conv_bn_weights( + conv_w: torch.Tensor, + conv_b: torch.Tensor | None, + bn_rm: torch.Tensor, + bn_rv: torch.Tensor, + bn_eps: float, + bn_w: torch.Tensor | None, + bn_b: torch.Tensor | None, + transpose: bool = False, +) -> tuple[torch.nn.Parameter, torch.nn.Parameter]: + r"""Fuse convolutional module parameters and BatchNorm module parameters into new convolutional module parameters. + + Args: + conv_w (torch.Tensor): Convolutional weight. + conv_b (Optional[torch.Tensor]): Convolutional bias. + bn_rm (torch.Tensor): BatchNorm running mean. + bn_rv (torch.Tensor): BatchNorm running variance. + bn_eps (float): BatchNorm epsilon. + bn_w (Optional[torch.Tensor]): BatchNorm weight. + bn_b (Optional[torch.Tensor]): BatchNorm bias. + transpose (bool, optional): If True, transpose the conv weight. Defaults to False. + + Returns: + Tuple[torch.nn.Parameter, torch.nn.Parameter]: Fused convolutional weight and bias. 
+ """ + conv_weight_dtype = conv_w.dtype + conv_bias_dtype = conv_b.dtype if conv_b is not None else conv_weight_dtype + if conv_b is None: + conv_b = torch.zeros_like(bn_rm) + if bn_w is None: + bn_w = torch.ones_like(bn_rm) + if bn_b is None: + bn_b = torch.zeros_like(bn_rm) + bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) + + if transpose: + shape = [1, -1] + [1] * (len(conv_w.shape) - 2) + else: + shape = [-1, 1] + [1] * (len(conv_w.shape) - 2) + + fused_conv_w = (conv_w * (bn_w * bn_var_rsqrt).reshape(shape)).to( + dtype=conv_weight_dtype + ) + fused_conv_b = ((conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b).to( + dtype=conv_bias_dtype + ) + + return ( + torch.nn.Parameter(fused_conv_w, conv_w.requires_grad), + torch.nn.Parameter(fused_conv_b, conv_b.requires_grad), + ) + + +def fuse_linear_bn_eval( + linear: LinearT, + bn: torch.nn.modules.batchnorm._BatchNorm, +) -> LinearT: + r"""Fuse a linear module and a BatchNorm module into a single, new linear module. + + Args: + linear (torch.nn.Linear): A Linear module. + bn (torch.nn.modules.batchnorm._BatchNorm): A BatchNorm module. + + Returns: + torch.nn.Linear: The fused linear module. + + .. note:: + Both ``linear`` and ``bn`` must be in eval mode, and ``bn`` must have its running buffers computed. + """ + assert not (linear.training or bn.training), "Fusion only for eval!" + fused_linear = copy.deepcopy(linear) + + """ + Linear-BN needs to be fused while preserving the shapes of linear weight/bias. + To preserve the shapes of linear weight/bias, the channel dim of bn needs to be broadcastable with the last dim of linear, + because bn operates over the channel dim, (N, C_in, H, W) while linear operates over the last dim, (*, H_in). + To be broadcastable, the number of features in bn and + the number of output features from linear must satisfy the following condition: + 1. they are equal, or + 2. 
the number of features in bn is 1 + Otherwise, skip the folding path + """ + assert linear.out_features == bn.num_features or bn.num_features == 1, ( + "To fuse, linear.out_features == bn.num_features or bn.num_features == 1" + ) + + assert bn.running_mean is not None and bn.running_var is not None + fused_linear.weight, fused_linear.bias = fuse_linear_bn_weights( + fused_linear.weight, + fused_linear.bias, + bn.running_mean, + bn.running_var, + bn.eps, + bn.weight, + bn.bias, + ) + + return fused_linear + + +def fuse_linear_bn_weights( + linear_w: torch.Tensor, + linear_b: torch.Tensor | None, + bn_rm: torch.Tensor, + bn_rv: torch.Tensor, + bn_eps: float, + bn_w: torch.Tensor, + bn_b: torch.Tensor, +) -> tuple[torch.nn.Parameter, torch.nn.Parameter]: + r"""Fuse linear module parameters and BatchNorm module parameters into new linear module parameters. + + Args: + linear_w (torch.Tensor): Linear weight. + linear_b (Optional[torch.Tensor]): Linear bias. + bn_rm (torch.Tensor): BatchNorm running mean. + bn_rv (torch.Tensor): BatchNorm running variance. + bn_eps (float): BatchNorm epsilon. + bn_w (torch.Tensor): BatchNorm weight. + bn_b (torch.Tensor): BatchNorm bias. + + Returns: + Tuple[torch.nn.Parameter, torch.nn.Parameter]: Fused linear weight and bias. 
+ """ + linear_weight_dtype = linear_w.dtype + linear_bias_dtype = linear_b.dtype if linear_b is not None else linear_weight_dtype + if linear_b is None: + linear_b = torch.zeros_like(bn_rm) + bn_scale = bn_w * torch.rsqrt(bn_rv + bn_eps) + + fused_w = linear_w * bn_scale.unsqueeze(-1).to(dtype=linear_weight_dtype) + fused_b = ((linear_b - bn_rm) * bn_scale + bn_b).to(dtype=linear_bias_dtype) + + return torch.nn.Parameter(fused_w, linear_w.requires_grad), torch.nn.Parameter( + fused_b, linear_b.requires_grad + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/init.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/init.py new file mode 100644 index 0000000000000000000000000000000000000000..10fa03b7c01c2eac7e474ef55f433e4704e6c778 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/init.py @@ -0,0 +1,55 @@ +# mypy: allow-untyped-defs +import inspect + +import torch + + +def skip_init(module_cls, *args, **kwargs): + r""" + Given a module class object and args / kwargs, instantiate the module without initializing parameters / buffers. + + This can be useful if initialization is slow or if custom initialization will + be performed, making the default initialization unnecessary. There are some caveats to this, due to + the way this function is implemented: + + 1. The module must accept a `device` arg in its constructor that is passed to any parameters + or buffers created during construction. + + 2. The module must not perform any computation on parameters in its constructor except + initialization (i.e. functions from :mod:`torch.nn.init`). + + If these conditions are satisfied, the module can be instantiated with parameter / buffer values + uninitialized, as if having been created using :func:`torch.empty`. 
+ + Args: + module_cls: Class object; should be a subclass of :class:`torch.nn.Module` + args: args to pass to the module's constructor + kwargs: kwargs to pass to the module's constructor + + Returns: + Instantiated module with uninitialized parameters / buffers + + Example:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> import torch + >>> m = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1) + >>> m.weight + Parameter containing: + tensor([[0.0000e+00, 1.5846e+29, 7.8307e+00, 2.5250e-29, 1.1210e-44]], + requires_grad=True) + >>> m2 = torch.nn.utils.skip_init(torch.nn.Linear, in_features=6, out_features=1) + >>> m2.weight + Parameter containing: + tensor([[-1.4677e+24, 4.5915e-41, 1.4013e-45, 0.0000e+00, -1.4677e+24, + 4.5915e-41]], requires_grad=True) + + """ + if not issubclass(module_cls, torch.nn.Module): + raise RuntimeError(f"Expected a Module; got {module_cls}") + if "device" not in inspect.signature(module_cls).parameters: + raise RuntimeError("Module must support a 'device' arg to skip initialization") + + final_device = kwargs.pop("device", "cpu") + kwargs["device"] = "meta" + return module_cls(*args, **kwargs).to_empty(device=final_device) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/memory_format.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/memory_format.py new file mode 100644 index 0000000000000000000000000000000000000000..06eb55a02572d79b6f254624aaea90d86e5430a1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/memory_format.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +from typing import TypeVar + +import torch + + +_M = TypeVar("_M", bound="torch.nn.Module") + + +def convert_conv2d_weight_memory_format( + module: _M, memory_format: torch.memory_format +) -> _M: + r"""Convert ``memory_format`` of ``nn.Conv2d.weight`` to ``memory_format``. 
+ + The conversion recursively applies to nested ``nn.Module``, including ``module``. + Note that it only changes the memory_format, but not the semantics of each dimensions. + This function is used to facilitate the computation to adopt NHWC kernels, which + provides considerable speed up for fp16 data on CUDA devices with compute capability >= 7.0 + + .. note:: + Calling ``model.to(memory_format=torch.channels_last)`` is more aggressive + than the utility function ``convert_conv2d_weight_memory_format``. Any + layer with 4d weight will be affected by ``model.to``, which does not + necessarily benefit from conversion to specified ``memory_format``. + One place we are confident in is that NHWC(channels_last) conversion for + convolution in cuDNN, as it is beneficial to run convolution in NHWC, + even in cases where we have to apply permutation to input tensors. + + Hence our strategy here is to convert only the weight of convolution to + channels_last. This ensures that; + 1. Fast convolution kernels will be used, the benefit of which could + outweigh overhead of permutation (if input is not in the same format). + 2. No unnecessary permutations are applied on layers that do not benefit + from memory_format conversion. + + The optimal case is that, layers between convolution layers are channels + last compatible. Input tensor would be permuted to channels last when it + encounters the first convolution layer and stay in that memory format. + Hence following convolutions will not need to permute its input tensor. + + In case where a channels last incompatible layer is between convolution + layers, we need to permute the input tensor back to contiguous format + for that layer. The input tensor will go through the remaining layers in + contiguous format and be permuted to channels last when it encounters + another convolution layer. There's no point in propagating that + permutation to an earlier layer, as most layers are quite agnostic to + ``memory_format``. 
+ + This claim might change when PyTorch supports fusion of permutation, as + there might have been a better spot to fuse the permutation other than + immediately before a convolution. + + Args: + module (nn.Module): ``nn.Conv2d`` & ``nn.ConvTranspose2d`` or container + ``nn.Module`` + memory_format: user specified ``memory_format``, + e.g. ``torch.channels_last`` or ``torch.contiguous_format`` + + Returns: + The original module with updated ``nn.Conv2d`` + + Example: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> # xdoctest: +REQUIRES(env:CUBLAS_WORKSPACE_CONFIG) + >>> input = torch.randint( + ... 1, 10, (2, 8, 4, 4), dtype=torch.float16, device="cuda" + ... ) + >>> model = nn.Sequential( + >>> nn.Conv2d(8, 4, 3)).cuda().half() + >>> # This is identical to: + >>> # nn.utils.convert_conv2d_weight_memory_format(model, torch.channels_last) + >>> model = nn.utils.convert_conv2d_weight_memory_format( + ... model, torch.channels_last + ... ) + >>> out = model(input) + """ + # TODO: expand this to `_ConvNd` when channels_last support is extended + # beyond only 4d tensors. + if isinstance(module, (torch.nn.Conv2d, torch.nn.ConvTranspose2d)): + weight_data = module.weight.detach().clone(memory_format=memory_format) + module.weight.data = weight_data.resize_( + weight_data.size(), memory_format=memory_format + ) + for child in module.children(): + convert_conv2d_weight_memory_format(child, memory_format) + # pyrefly: ignore [bad-return] + return module + + +def convert_conv3d_weight_memory_format( + module: _M, memory_format: torch.memory_format +) -> _M: + r"""Convert ``memory_format`` of ``nn.Conv3d.weight`` to ``memory_format`` + The conversion recursively applies to nested ``nn.Module``, including ``module``. + Note that it only changes the memory_format, but not the semantics of each dimensions. 
+ This function is used to facilitate the computation to adopt NHWC kernels, which + provides considerable speed up for fp16 data on CUDA devices with compute capability >= 7.0 + + .. note:: + Calling ``model.to(memory_format=torch.channels_last_3d)`` is more aggressive + than the utility function ``convert_conv3d_weight_memory_format``. Any + layer with 4d weight will be affected by ``model.to``, which does not + necessarily benefit from conversion to specified ``memory_format``. + One place we are confident in is that NDHWC(channels_last_3d) conversion for + convolution in cuDNN, as it is beneficial to run convolution in NDHWC, + even in cases where we have to apply permutation to input tensors. + + Hence our strategy here is to convert only the weight of convolution to + channels_last_3d. This ensures that; + 1. Fast convolution kernels will be used, the benefit of which could + outweigh overhead of permutation (if input is not in the same format). + 2. No unnecessary permutations are applied on layers that do not benefit + from memory_format conversion. + + The optimal case is that, layers between convolution layers are channels + last compatible. Input tensor would be permuted to channels last when it + encounters the first convolution layer and stay in that memory format. + Hence following convolutions will not need to permute its input tensor. + + In case where a channels last incompatible layer is between convolution + layers, we need to permute the input tensor back to contiguous format + for that layer. The input tensor will go through the remaining layers in + contiguous format and be permuted to channels last when it encounters + another convolution layer. There's no point in propagating that + permutation to an earlier layer, as most layers are quite agnostic to + ``memory_format``. 
+ + This claim might change when PyTorch supports fusion of permutation, as + there might have been a better spot to fuse the permutation other than + immediately before a convolution. + + Args: + module (nn.Module): ``nn.Conv3d`` & ``nn.ConvTranspose3d`` or container + ``nn.Module`` + memory_format: user specified ``memory_format``, + e.g. ``torch.channels_last`` or ``torch.contiguous_format`` + + Returns: + The original module with updated ``nn.Conv3d`` + + Example: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> # xdoctest: +REQUIRES(env:CUBLAS_WORKSPACE_CONFIG) + >>> input = torch.randint( + ... 1, 10, (2, 8, 4, 4, 4), dtype=torch.float16, device="cuda" + ... ) + >>> model = nn.Sequential( + >>> nn.Conv3d(8, 4, 3)).cuda().half() + >>> # This is identical to: + >>> # nn.utils.convert_conv3d_weight_memory_format(model, torch.channels_last_3d) + >>> model = nn.utils.convert_conv3d_weight_memory_format( + ... model, torch.channels_last_3d + ... ) + >>> out = model(input) + """ + + # TODO: expand this to `_ConvNd` when channels_last support is extended + # beyond only 4d tensors. 
+ if isinstance(module, (torch.nn.Conv3d, torch.nn.ConvTranspose3d)): + weight_data = module.weight.detach().clone(memory_format=memory_format) + module.weight.data = weight_data.resize_( + weight_data.size(), memory_format=memory_format + ) + for child in module.children(): + convert_conv3d_weight_memory_format(child, memory_format) + # pyrefly: ignore [bad-return] + return module + + +__all__ = [ + "convert_conv2d_weight_memory_format", + "convert_conv3d_weight_memory_format", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrizations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrizations.py new file mode 100644 index 0000000000000000000000000000000000000000..3a51bbc15c5969bc742bf954243bd8b1b9333bbe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrizations.py @@ -0,0 +1,630 @@ +# mypy: allow-untyped-defs +from enum import auto, Enum + +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.nn.modules import Module +from torch.nn.utils import parametrize + + +__all__ = ["orthogonal", "spectral_norm", "weight_norm"] + + +def _is_orthogonal(Q, eps=None): + n, k = Q.size(-2), Q.size(-1) + Id = torch.eye(k, dtype=Q.dtype, device=Q.device) + # A reasonable eps, but not too large + eps = 10.0 * n * torch.finfo(Q.dtype).eps + return torch.allclose(Q.mH @ Q, Id, atol=eps) + + +def _make_orthogonal(A): + """Assume that A is a tall matrix. + + Compute the Q factor s.t. A = QR (A may be complex) and diag(R) is real and non-negative. 
+ """ + X, tau = torch.geqrf(A) + Q = torch.linalg.householder_product(X, tau) + # The diagonal of X is the diagonal of R (which is always real) so we normalise by its signs + Q *= X.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2) + return Q + + +class _OrthMaps(Enum): + matrix_exp = auto() + cayley = auto() + householder = auto() + + +class _Orthogonal(Module): + base: Tensor + + def __init__( + self, weight, orthogonal_map: _OrthMaps, *, use_trivialization=True + ) -> None: + super().__init__() + + # Note [Householder complex] + # For complex tensors, it is not possible to compute the tensor `tau` necessary for + # linalg.householder_product from the reflectors. + # To see this, note that the reflectors have a shape like: + # 0 0 0 + # * 0 0 + # * * 0 + # which, for complex matrices, give n(n-1) (real) parameters. Now, you need n^2 parameters + # to parametrize the unitary matrices. Saving tau on its own does not work either, because + # not every combination of `(A, tau)` gives a unitary matrix, meaning that if we optimise + # them as independent tensors we would not maintain the constraint + # An equivalent reasoning holds for rectangular matrices + if weight.is_complex() and orthogonal_map == _OrthMaps.householder: + raise ValueError( + "The householder parametrization does not support complex tensors." 
+ ) + + self.shape = weight.shape + self.orthogonal_map = orthogonal_map + if use_trivialization: + self.register_buffer("base", None) + + def forward(self, X: torch.Tensor) -> torch.Tensor: + n, k = X.size(-2), X.size(-1) + transposed = n < k + if transposed: + X = X.mT + n, k = k, n + # Here n > k and X is a tall matrix + if ( + self.orthogonal_map == _OrthMaps.matrix_exp + or self.orthogonal_map == _OrthMaps.cayley + ): + # We just need n x k - k(k-1)/2 parameters + X = X.tril() + if n != k: + # Embed into a square matrix + X = torch.cat( + [X, X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1 + ) + A = X - X.mH + # A is skew-symmetric (or skew-hermitian) + if self.orthogonal_map == _OrthMaps.matrix_exp: + Q = torch.matrix_exp(A) + elif self.orthogonal_map == _OrthMaps.cayley: + # Computes the Cayley retraction (I+A/2)(I-A/2)^{-1} + Id = torch.eye(n, dtype=A.dtype, device=A.device) + Q = torch.linalg.solve( + torch.add(Id, A, alpha=-0.5), torch.add(Id, A, alpha=0.5) + ) + # Q is now orthogonal (or unitary) of size (..., n, n) + if n != k: + # pyrefly: ignore [unbound-name] + Q = Q[..., :k] + # Q is now the size of the X (albeit perhaps transposed) + else: + # X is real here, as we do not support householder with complex numbers + A = X.tril(diagonal=-1) + tau = 2.0 / (1.0 + (A * A).sum(dim=-2)) + Q = torch.linalg.householder_product(A, tau) + # The diagonal of X is 1's and -1's + # We do not want to differentiate through this or update the diagonal of X hence the casting + Q = Q * X.diagonal(dim1=-2, dim2=-1).int().unsqueeze(-2) + + if hasattr(self, "base"): + # pyrefly: ignore [unbound-name] + Q = self.base @ Q + if transposed: + # pyrefly: ignore [unbound-name] + Q = Q.mT + return Q # type: ignore[possibly-undefined] + + @torch.autograd.no_grad() + def right_inverse(self, Q: torch.Tensor) -> torch.Tensor: + if Q.shape != self.shape: + raise ValueError( + f"Expected a matrix or batch of matrices of shape {self.shape}. 
" + f"Got a tensor of shape {Q.shape}." + ) + + Q_init = Q + n, k = Q.size(-2), Q.size(-1) + transpose = n < k + if transpose: + Q = Q.mT + n, k = k, n + + # We always make sure to always copy Q in every path + if not hasattr(self, "base"): + # Note [right_inverse expm cayley] + # If we do not have use_trivialization=True, we just implement the inverse of the forward + # map for the Householder. To see why, think that for the Cayley map, + # we would need to find the matrix X \in R^{n x k} such that: + # Y = torch.cat([X.tril(), X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1) + # A = Y - Y.mH + # cayley(A)[:, :k] + # gives the original tensor. It is not clear how to do this. + # Perhaps via some algebraic manipulation involving the QR like that of + # Corollary 2.2 in Edelman, Arias and Smith? + if ( + self.orthogonal_map == _OrthMaps.cayley + or self.orthogonal_map == _OrthMaps.matrix_exp + ): + raise NotImplementedError( + "It is not possible to assign to the matrix exponential " + "or the Cayley parametrizations when use_trivialization=False." + ) + + # If parametrization == _OrthMaps.householder, make Q orthogonal via the QR decomposition. + # Here Q is always real because we do not support householder and complex matrices. 
+ # See note [Householder complex] + A, tau = torch.geqrf(Q) + # We want to have a decomposition X = QR with diag(R) > 0, as otherwise we could + # decompose an orthogonal matrix Q as Q = (-Q)@(-Id), which is a valid QR decomposition + # The diagonal of Q is the diagonal of R from the qr decomposition + A.diagonal(dim1=-2, dim2=-1).sign_() + # Equality with zero is ok because LAPACK returns exactly zero when it does not want + # to use a particular reflection + A.diagonal(dim1=-2, dim2=-1)[tau == 0.0] *= -1 + return A.mT if transpose else A + else: + if n == k: + # We check whether Q is orthogonal + if not _is_orthogonal(Q): + Q = _make_orthogonal(Q) + else: # Is orthogonal + Q = Q.clone() + else: + # Complete Q into a full n x n orthogonal matrix + N = torch.randn( + *(Q.size()[:-2] + (n, n - k)), dtype=Q.dtype, device=Q.device + ) + Q = torch.cat([Q, N], dim=-1) + Q = _make_orthogonal(Q) + self.base = Q + + # It is necessary to return the -Id, as we use the diagonal for the + # Householder parametrization. Using -Id makes: + # householder(torch.zeros(m,n)) == torch.eye(m,n) + # Poor man's version of eye_like + neg_Id = torch.zeros_like(Q_init) + neg_Id.diagonal(dim1=-2, dim2=-1).fill_(-1.0) + return neg_Id + + +def orthogonal( + module: Module, + name: str = "weight", + orthogonal_map: str | None = None, + *, + use_trivialization: bool = True, +) -> Module: + r"""Apply an orthogonal or unitary parametrization to a matrix or a batch of matrices. + + Letting :math:`\mathbb{K}` be :math:`\mathbb{R}` or :math:`\mathbb{C}`, the parametrized + matrix :math:`Q \in \mathbb{K}^{m \times n}` is **orthogonal** as + + .. 
math:: + + \begin{align*} + Q^{\text{H}}Q &= \mathrm{I}_n \mathrlap{\qquad \text{if }m \geq n}\\ + QQ^{\text{H}} &= \mathrm{I}_m \mathrlap{\qquad \text{if }m < n} + \end{align*} + + where :math:`Q^{\text{H}}` is the conjugate transpose when :math:`Q` is complex + and the transpose when :math:`Q` is real-valued, and + :math:`\mathrm{I}_n` is the `n`-dimensional identity matrix. + In plain words, :math:`Q` will have orthonormal columns whenever :math:`m \geq n` + and orthonormal rows otherwise. + + If the tensor has more than two dimensions, we consider it as a batch of matrices of shape `(..., m, n)`. + + The matrix :math:`Q` may be parametrized via three different ``orthogonal_map`` in terms of the original tensor: + + - ``"matrix_exp"``/``"cayley"``: + the :func:`~torch.matrix_exp` :math:`Q = \exp(A)` and the `Cayley map`_ + :math:`Q = (\mathrm{I}_n + A/2)(\mathrm{I}_n - A/2)^{-1}` are applied to a skew-symmetric + :math:`A` to give an orthogonal matrix. + - ``"householder"``: computes a product of Householder reflectors + (:func:`~torch.linalg.householder_product`). + + ``"matrix_exp"``/``"cayley"`` often make the parametrized weight converge faster than + ``"householder"``, but they are slower to compute for very thin or very wide matrices. + + If ``use_trivialization=True`` (default), the parametrization implements the "Dynamic Trivialization Framework", + where an extra matrix :math:`B \in \mathbb{K}^{n \times n}` is stored under + ``module.parametrizations.weight[0].base``. This helps the + convergence of the parametrized layer at the expense of some extra memory use. + See `Trivializations for Gradient-Based Optimization on Manifolds`_ . 
+ + Initial value of :math:`Q`: + If the original tensor is not parametrized and ``use_trivialization=True`` (default), the initial value + of :math:`Q` is that of the original tensor if it is orthogonal (or unitary in the complex case) + and it is orthogonalized via the QR decomposition otherwise (see :func:`torch.linalg.qr`). + Same happens when it is not parametrized and ``orthogonal_map="householder"`` even when ``use_trivialization=False``. + Otherwise, the initial value is the result of the composition of all the registered + parametrizations applied to the original tensor. + + .. note:: + This function is implemented using the parametrization functionality + in :func:`~torch.nn.utils.parametrize.register_parametrization`. + + + .. _`Cayley map`: https://en.wikipedia.org/wiki/Cayley_transform#Matrix_map + .. _`Trivializations for Gradient-Based Optimization on Manifolds`: https://arxiv.org/abs/1909.09501 + + Args: + module (nn.Module): module on which to register the parametrization. + name (str, optional): name of the tensor to make orthogonal. Default: ``"weight"``. + orthogonal_map (str, optional): One of the following: ``"matrix_exp"``, ``"cayley"``, ``"householder"``. + Default: ``"matrix_exp"`` if the matrix is square or complex, ``"householder"`` otherwise. + use_trivialization (bool, optional): whether to use the dynamic trivialization framework. + Default: ``True``. 
+ + Returns: + The original module with an orthogonal parametrization registered to the specified + weight + + Example:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK) + >>> orth_linear = orthogonal(nn.Linear(20, 40)) + >>> orth_linear + ParametrizedLinear( + in_features=20, out_features=40, bias=True + (parametrizations): ModuleDict( + (weight): ParametrizationList( + (0): _Orthogonal() + ) + ) + ) + >>> # xdoctest: +IGNORE_WANT + >>> Q = orth_linear.weight + >>> torch.dist(Q.T @ Q, torch.eye(20)) + tensor(4.9332e-07) + """ + weight = getattr(module, name, None) + if not isinstance(weight, Tensor): + raise ValueError( + f"Module '{module}' has no parameter or buffer with name '{name}'" + ) + + # We could implement this for 1-dim tensors as the maps on the sphere + # but I believe it'd bite more people than it'd help + if weight.ndim < 2: + raise ValueError( + "Expected a matrix or batch of matrices. " + f"Got a tensor of {weight.ndim} dimensions." + ) + + if orthogonal_map is None: + orthogonal_map = ( + "matrix_exp" + if weight.size(-2) == weight.size(-1) or weight.is_complex() + else "householder" + ) + + orth_enum = getattr(_OrthMaps, orthogonal_map, None) + if orth_enum is None: + raise ValueError( + 'orthogonal_map has to be one of "matrix_exp", "cayley", "householder". 
' + f"Got: {orthogonal_map}" + ) + orth = _Orthogonal(weight, orth_enum, use_trivialization=use_trivialization) + parametrize.register_parametrization(module, name, orth, unsafe=True) + return module + + +class _WeightNorm(Module): + def __init__( + self, + dim: int | None = 0, + ) -> None: + super().__init__() + if dim is None: + dim = -1 + self.dim = dim + + def forward(self, weight_g, weight_v): + return torch._weight_norm(weight_v, weight_g, self.dim) + + def right_inverse(self, weight): + weight_g = torch.norm_except_dim(weight, 2, self.dim) + weight_v = weight + + return weight_g, weight_v + + +def weight_norm(module: Module, name: str = "weight", dim: int = 0): + r"""Apply weight normalization to a parameter in the given module. + + .. math:: + \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} + + Weight normalization is a reparameterization that decouples the magnitude + of a weight tensor from its direction. This replaces the parameter specified + by :attr:`name` with two parameters: one specifying the magnitude + and one specifying the direction. + + By default, with ``dim=0``, the norm is computed independently per output + channel/plane. To compute a norm over the entire weight tensor, use + ``dim=None``. 
+ + See https://arxiv.org/abs/1602.07868 + + Args: + module (Module): containing module + name (str, optional): name of weight parameter + dim (int, optional): dimension over which to compute the norm + + Returns: + The original module with the weight norm hook + + Example:: + + >>> m = weight_norm(nn.Linear(20, 40), name='weight') + >>> m + ParametrizedLinear( + in_features=20, out_features=40, bias=True + (parametrizations): ModuleDict( + (weight): ParametrizationList( + (0): _WeightNorm() + ) + ) + ) + >>> m.parametrizations.weight.original0.size() + torch.Size([40, 1]) + >>> m.parametrizations.weight.original1.size() + torch.Size([40, 20]) + + """ + _weight_norm = _WeightNorm(dim) + parametrize.register_parametrization(module, name, _weight_norm, unsafe=True) + + def _weight_norm_compat_hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) -> None: + g_key = f"{prefix}{name}_g" + v_key = f"{prefix}{name}_v" + if g_key in state_dict and v_key in state_dict: + original0 = state_dict.pop(g_key) + original1 = state_dict.pop(v_key) + state_dict[f"{prefix}parametrizations.{name}.original0"] = original0 + state_dict[f"{prefix}parametrizations.{name}.original1"] = original1 + + module._register_load_state_dict_pre_hook(_weight_norm_compat_hook) + return module + + +class _SpectralNorm(Module): + def __init__( + self, + weight: torch.Tensor, + n_power_iterations: int = 1, + dim: int = 0, + eps: float = 1e-12, + ) -> None: + super().__init__() + ndim = weight.ndim + if dim >= ndim or dim < -ndim: + raise IndexError( + "Dimension out of range (expected to be in range of " + f"[-{ndim}, {ndim - 1}] but got {dim})" + ) + + if n_power_iterations <= 0: + raise ValueError( + "Expected n_power_iterations to be positive, but " + f"got n_power_iterations={n_power_iterations}" + ) + self.dim = dim if dim >= 0 else dim + ndim + self.eps = eps + if ndim > 1: + # For ndim == 1 we do not need to approximate anything (see 
_SpectralNorm.forward) + self.n_power_iterations = n_power_iterations + weight_mat = self._reshape_weight_to_matrix(weight) + h, w = weight_mat.size() + + u = weight_mat.new_empty(h).normal_(0, 1) + v = weight_mat.new_empty(w).normal_(0, 1) + self.register_buffer("_u", F.normalize(u, dim=0, eps=self.eps)) + self.register_buffer("_v", F.normalize(v, dim=0, eps=self.eps)) + + # Start with u, v initialized to some reasonable values by performing a number + # of iterations of the power method + self._power_method(weight_mat, 15) + + def _reshape_weight_to_matrix(self, weight: torch.Tensor) -> torch.Tensor: + # Precondition + assert weight.ndim > 1 + + if self.dim != 0: + # permute dim to front + weight = weight.permute( + self.dim, *(d for d in range(weight.dim()) if d != self.dim) + ) + + return weight.flatten(1) + + @torch.autograd.no_grad() + def _power_method(self, weight_mat: torch.Tensor, n_power_iterations: int) -> None: + # See original note at torch/nn/utils/spectral_norm.py + # NB: If `do_power_iteration` is set, the `u` and `v` vectors are + # updated in power iteration **in-place**. This is very important + # because in `DataParallel` forward, the vectors (being buffers) are + # broadcast from the parallelized module to each module replica, + # which is a new module object created on the fly. And each replica + # runs its own spectral norm power iteration. So simply assigning + # the updated vectors to the module this function runs on will cause + # the update to be lost forever. And the next time the parallelized + # module is replicated, the same randomly initialized vectors are + # broadcast and used! + # + # Therefore, to make the change propagate back, we rely on two + # important behaviors (also enforced via tests): + # 1. `DataParallel` doesn't clone storage if the broadcast tensor + # is already on correct device; and it makes sure that the + # parallelized module is already on `device[0]`. + # 2. 
If the out tensor in `out=` kwarg has correct shape, it will + # just fill in the values. + # Therefore, since the same power iteration is performed on all + # devices, simply updating the tensors in-place will make sure that + # the module replica on `device[0]` will update the _u vector on the + # parallelized module (by shared storage). + # + # However, after we update `u` and `v` in-place, we need to **clone** + # them before using them to normalize the weight. This is to support + # backproping through two forward passes, e.g., the common pattern in + # GAN training: loss = D(real) - D(fake). Otherwise, engine will + # complain that variables needed to do backward for the first forward + # (i.e., the `u` and `v` vectors) are changed in the second forward. + + # Precondition + assert weight_mat.ndim > 1 + + for _ in range(n_power_iterations): + # Spectral norm of weight equals to `u^T W v`, where `u` and `v` + # are the first left and right singular vectors. + # This power iteration produces approximations of `u` and `v`. 
+ self._u = F.normalize( + torch.mv(weight_mat, self._v), # type: ignore[has-type] + dim=0, + eps=self.eps, + out=self._u, # type: ignore[has-type] + ) + self._v = F.normalize( + torch.mv(weight_mat.H, self._u), # type: ignore[has-type] + dim=0, + eps=self.eps, + out=self._v, # type: ignore[has-type] + ) + + def forward(self, weight: torch.Tensor) -> torch.Tensor: + if weight.ndim == 1: + # Faster and more exact path, no need to approximate anything + return F.normalize(weight, dim=0, eps=self.eps) + else: + weight_mat = self._reshape_weight_to_matrix(weight) + if self.training: + self._power_method(weight_mat, self.n_power_iterations) + # See above on why we need to clone + u = self._u.clone(memory_format=torch.contiguous_format) + v = self._v.clone(memory_format=torch.contiguous_format) + # The proper way of computing this should be through F.bilinear, but + # it seems to have some efficiency issues: + # https://github.com/pytorch/pytorch/issues/58093 + sigma = torch.vdot(u, torch.mv(weight_mat, v)) + return weight / sigma + + def right_inverse(self, value: torch.Tensor) -> torch.Tensor: + # we may want to assert here that the passed value already + # satisfies constraints + return value + + +def spectral_norm( + module: Module, + name: str = "weight", + n_power_iterations: int = 1, + eps: float = 1e-12, + dim: int | None = None, +) -> Module: + r"""Apply spectral normalization to a parameter in the given module. + + .. math:: + \mathbf{W}_{SN} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})}, + \sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2} + + When applied on a vector, it simplifies to + + .. math:: + \mathbf{x}_{SN} = \dfrac{\mathbf{x}}{\|\mathbf{x}\|_2} + + Spectral normalization stabilizes the training of discriminators (critics) + in Generative Adversarial Networks (GANs) by reducing the Lipschitz constant + of the model. 
:math:`\sigma` is approximated performing one iteration of the + `power method`_ every time the weight is accessed. If the dimension of the + weight tensor is greater than 2, it is reshaped to 2D in power iteration + method to get spectral norm. + + + See `Spectral Normalization for Generative Adversarial Networks`_ . + + .. _`power method`: https://en.wikipedia.org/wiki/Power_iteration + .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957 + + .. note:: + This function is implemented using the parametrization functionality + in :func:`~torch.nn.utils.parametrize.register_parametrization`. It is a + reimplementation of :func:`torch.nn.utils.spectral_norm`. + + .. note:: + When this constraint is registered, the singular vectors associated to the largest + singular value are estimated rather than sampled at random. These are then updated + performing :attr:`n_power_iterations` of the `power method`_ whenever the tensor + is accessed with the module on `training` mode. + + .. note:: + If the `_SpectralNorm` module, i.e., `module.parametrization.weight[idx]`, + is in training mode on removal, it will perform another power iteration. + If you'd like to avoid this iteration, set the module to eval mode + before its removal. + + Args: + module (nn.Module): containing module + name (str, optional): name of weight parameter. Default: ``"weight"``. + n_power_iterations (int, optional): number of power iterations to + calculate spectral norm. Default: ``1``. + eps (float, optional): epsilon for numerical stability in + calculating norms. Default: ``1e-12``. + dim (int, optional): dimension corresponding to number of outputs. 
+ Default: ``0``, except for modules that are instances of + ConvTranspose{1,2,3}d, when it is ``1`` + + Returns: + The original module with a new parametrization registered to the specified + weight + + Example:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> snm = spectral_norm(nn.Linear(20, 40)) + >>> snm + ParametrizedLinear( + in_features=20, out_features=40, bias=True + (parametrizations): ModuleDict( + (weight): ParametrizationList( + (0): _SpectralNorm() + ) + ) + ) + >>> torch.linalg.matrix_norm(snm.weight, 2) + tensor(1.0081, grad_fn=) + """ + weight = getattr(module, name, None) + if not isinstance(weight, Tensor): + raise ValueError( + f"Module '{module}' has no parameter or buffer with name '{name}'" + ) + + if dim is None: + if isinstance( + module, + ( + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, + ), + ): + dim = 1 + else: + dim = 0 + parametrize.register_parametrization( + module, name, _SpectralNorm(weight, n_power_iterations, dim, eps) + ) + return module diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrize.py new file mode 100644 index 0000000000000000000000000000000000000000..28599db7bdf116f7e3af1bcd7d8576fc2fe51f9b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrize.py @@ -0,0 +1,838 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +import collections +import copyreg +from collections.abc import Sequence +from contextlib import contextmanager +from copy import deepcopy + +import torch +from torch import Tensor +from torch.__future__ import get_swap_module_params_on_conversion +from torch.nn.modules.container import Module, ModuleDict, ModuleList +from torch.nn.parameter import Parameter +from torch.utils._python_dispatch import 
is_traceable_wrapper_subclass + + +__all__ = [ + "cached", + "ParametrizationList", + "register_parametrization", + "is_parametrized", + "remove_parametrizations", + "type_before_parametrizations", + "transfer_parametrizations_and_params", +] + +_cache_enabled = 0 +_cache: dict[tuple[int, str], Tensor | None] = {} + + +@contextmanager +def cached(): + r"""Context manager that enables the caching system within parametrizations registered with :func:`register_parametrization`. + + The value of the parametrized objects is computed and cached the first time + they are required when this context manager is active. The cached values are + discarded when leaving the context manager. + + This is useful when using a parametrized parameter more than once in the forward pass. + An example of this is when parametrizing the recurrent kernel of an RNN or when + sharing weights. + + The simplest way to activate the cache is by wrapping the forward pass of the neural network + + .. code-block:: python + + import torch.nn.utils.parametrize as P + + ... + with P.cached(): + output = model(inputs) + + in training and evaluation. One may also wrap the parts of the modules that use + several times the parametrized tensors. For example, the loop of an RNN with a + parametrized recurrent kernel: + + .. 
code-block:: python + + with P.cached(): + for x in xs: + out_rnn = self.rnn_cell(x, out_rnn) + """ + global _cache + global _cache_enabled + _cache_enabled += 1 + try: + yield + finally: + _cache_enabled -= 1 + if not _cache_enabled: + _cache = {} + + +def _register_parameter_or_buffer(module, name, X) -> None: + if isinstance(X, Parameter): + module.register_parameter(name, X) + else: + module.register_buffer(name, X) + + +def _maybe_set(dest: Tensor, src: Tensor) -> None: + should_swap = ( + get_swap_module_params_on_conversion() or is_traceable_wrapper_subclass(dest) + ) + if should_swap: + if isinstance(dest, Parameter) and not isinstance(src, Parameter): + src = Parameter(src, requires_grad=dest.requires_grad) + torch.utils.swap_tensors(dest, src) + else: + dest.set_(src) # type: ignore[call-overload] + + +class ParametrizationList(ModuleList): + r"""A sequential container that holds and manages the original parameters or buffers of a parametrized :class:`torch.nn.Module`. + + It is the type of ``module.parametrizations[tensor_name]`` when ``module[tensor_name]`` + has been parametrized with :func:`register_parametrization`. + + If the first registered parametrization has a ``right_inverse`` that returns one tensor or + does not have a ``right_inverse`` (in which case we assume that ``right_inverse`` is the identity), + it will hold the tensor under the name ``original``. + If it has a ``right_inverse`` that returns more than one tensor, these will be registered as + ``original0``, ``original1``, ... + + .. warning:: + This class is used internally by :func:`register_parametrization`. It is documented + here for completeness. It shall not be instantiated by the user. + + Args: + modules (sequence): sequence of modules representing the parametrizations + original (Parameter or Tensor): parameter or buffer that is parametrized + unsafe (bool): a boolean flag that denotes whether the parametrization + may change the dtype and shape of the tensor. 
Default: `False` + Warning: the parametrization is not checked for consistency upon registration. + Enable this flag at your own risk. + """ + + original: Tensor + unsafe: bool + + def __init__( + self, + modules: Sequence[Module], + original: Tensor | Parameter, + unsafe: bool = False, + ) -> None: + # We require this because we need to treat differently the first parametrization + # This should never throw, unless this class is used from the outside + if len(modules) == 0: + raise ValueError("ParametrizationList requires one or more modules.") + + super().__init__(modules) + self.unsafe = unsafe + + # In plain words: + # module.weight must keep its dtype and shape. + # Furthermore, if there is no right_inverse or the right_inverse returns a tensor, + # this should be of the same dtype as the original tensor + # + # We check that the following invariants hold: + # X = module.weight + # Y = param.right_inverse(X) + # assert isinstance(Y, Tensor) or + # (isinstance(Y, collections.abc.Sequence) and all(isinstance(t, Tensor) for t in Y)) + # Z = param(Y) if isinstance(Y, Tensor) else param(*Y) + # # Consistency checks + # assert X.dtype == Z.dtype and X.shape == Z.shape + # # If it has one input, this allows to be able to use set_ to be able to + # # move data to/from the original tensor without changing its id (which is what the + # # optimizer uses to track parameters) + # if isinstance(Y, Tensor) + # assert X.dtype == Y.dtype + # Below we use original = X, new = Y + + original_shape = original.shape + original_dtype = original.dtype + + # Compute new + with torch.no_grad(): + new = original + for module in reversed(self): # type: ignore[call-overload] + if hasattr(module, "right_inverse"): + try: + new = module.right_inverse(new) # type: ignore[operator] + except NotImplementedError: + pass + # else, or if it throws, we assume that right_inverse is the identity + + if not isinstance(new, Tensor) and not isinstance(new, Sequence): + raise ValueError( + 
"'right_inverse' must return a Tensor or a Sequence of tensors (list, tuple...). " + f"Got {type(new).__name__}" + ) + + # Set the number of original tensors + self.is_tensor = isinstance(new, Tensor) + self.ntensors = 1 if self.is_tensor else len(new) + + # Register the tensor(s) + if self.is_tensor: + # pyrefly: ignore [missing-attribute] + if original.dtype != new.dtype: + raise ValueError( + "When `right_inverse` outputs one tensor, it may not change the dtype.\n" + f"original.dtype: {original.dtype}\n" + # pyrefly: ignore [missing-attribute] + f"right_inverse(original).dtype: {new.dtype}" + ) + + # pyrefly: ignore [missing-attribute] + if original.device != new.device: + raise ValueError( + "When `right_inverse` outputs one tensor, it may not change the device.\n" + f"original.device: {original.device}\n" + # pyrefly: ignore [missing-attribute] + f"right_inverse(original).device: {new.device}" + ) + + # Set the original to original so that the user does not need to re-register the parameter + # manually in the optimiser + with torch.no_grad(): + # pyrefly: ignore [bad-argument-type] + _maybe_set(original, new) + _register_parameter_or_buffer(self, "original", original) + else: + for i, originali in enumerate(new): + if not isinstance(originali, Tensor): + raise ValueError( + "'right_inverse' must return a Tensor or a Sequence of tensors " + "(list, tuple...). " + f"Got element {i} of the sequence with type {type(originali).__name__}." 
+ ) + + # If the original tensor was a Parameter that required grad, we expect the user to + # add the new parameters to the optimizer after registering the parametrization + # (this is documented) + if isinstance(original, Parameter): + originali = Parameter(originali, original.requires_grad) + originali.requires_grad_(original.requires_grad) + _register_parameter_or_buffer(self, f"original{i}", originali) + + if not self.unsafe: + # Consistency checks: + # Since f : A -> B, right_inverse : B -> A, Z and original should live in B + # Z = forward(right_inverse(original)) + Z = self() + if not isinstance(Z, Tensor): + raise ValueError( + f"A parametrization must return a tensor. Got {type(Z).__name__}." + ) + if Z.dtype != original_dtype: + raise ValueError( + "Registering a parametrization may not change the dtype of the tensor, unless `unsafe` flag is enabled.\n" + f"unparametrized dtype: {original_dtype}\n" + f"parametrized dtype: {Z.dtype}" + ) + if Z.shape != original_shape: + raise ValueError( + "Registering a parametrization may not change the shape of the tensor, unless `unsafe` flag is enabled.\n" + f"unparametrized shape: {original_shape}\n" + f"parametrized shape: {Z.shape}" + ) + + def right_inverse(self, value: Tensor) -> None: + r"""Call the ``right_inverse`` methods of the parametrizations in the inverse registration order. + + Then, it stores the result in ``self.original`` if ``right_inverse`` outputs one tensor + or in ``self.original0``, ``self.original1``, ... if it outputs several. + + Args: + value (Tensor): Value to which initialize the module + """ + # All the exceptions in this function should almost never throw. 
+ # They could throw if, for example, right_inverse function returns a different + # dtype when given a different input, which should most likely be caused by a + # bug in the user's code + + with torch.no_grad(): + # See https://github.com/pytorch/pytorch/issues/53103 + for module in reversed(self): # type: ignore[call-overload] + if hasattr(module, "right_inverse"): + value = module.right_inverse(value) # type: ignore[operator] + else: + raise RuntimeError( + f"parametrization {type(module).__name__} does not implement " + "right_inverse." + ) + if self.is_tensor: + # These exceptions should only throw when a right_inverse function does not + # return the same dtype for every input, which should most likely be caused by a bug + if not isinstance(value, Tensor): + raise ValueError( + f"`right_inverse` should return a tensor. Got {type(value).__name__}" + ) + if value.dtype != self.original.dtype: + raise ValueError( + f"The tensor returned by `right_inverse` has dtype {value.dtype} " + f"while `original` has dtype {self.original.dtype}" + ) + # We know that the result is going to have the same dtype + _maybe_set(self.original, value) + else: + if not isinstance(value, collections.abc.Sequence): + raise ValueError( + "'right_inverse' must return a sequence of tensors. " + f"Got {type(value).__name__}." + ) + if len(value) != self.ntensors: + raise ValueError( + "'right_inverse' must return a sequence of tensors of length " + f"{self.ntensors}. Got a sequence of length {len(value)}." + ) + for i, tensor in enumerate(value): + original_i = getattr(self, f"original{i}") + if not isinstance(tensor, Tensor): + raise ValueError( + f"`right_inverse` must return a sequence of tensors. 
" + f"Got element {i} of type {type(tensor).__name__}" + ) + if original_i.dtype != tensor.dtype: + raise ValueError( + f"Tensor {i} returned by `right_inverse` has dtype {tensor.dtype} " + f"while `original{i}` has dtype {original_i.dtype}" + ) + _maybe_set(original_i, tensor) + + def forward(self) -> Tensor: + if torch.jit.is_scripting(): + raise RuntimeError("Parametrization is not working with scripting.") + # Unpack the originals for the first parametrization + if self.is_tensor: + x = self[0](self.original) + else: + originals = (getattr(self, f"original{i}") for i in range(self.ntensors)) + x = self[0](*originals) + # It's not possible to call self[1:] here, so we have to be a bit more cryptic + # Also we want to skip all non-integer keys + curr_idx = 1 + while hasattr(self, str(curr_idx)): + x = self[curr_idx](x) + curr_idx += 1 + return x + + +def _inject_new_class(module: Module) -> None: + r"""Set up a module to be parametrized. + + This works by substituting the class of the module by a class + that extends it to be able to inject a property + + Args: + module (nn.Module): module into which to inject the property + """ + cls = module.__class__ + + def default_deepcopy(self, memo): + # Just emulate a standard deepcopy procedure when __deepcopy__ doesn't exist in the current class. + obj = memo.get(id(self), None) + if obj is not None: + return obj + replica = self.__new__(self.__class__) + memo[id(self)] = replica + replica.__dict__ = deepcopy(self.__dict__, memo) + # Also save all slots if they exist. + slots_to_save = copyreg._slotnames(self.__class__) # type: ignore[attr-defined] + for slot in slots_to_save: + if hasattr(self, slot): + setattr(replica, slot, deepcopy(getattr(self, slot), memo)) + return replica + + def getstate(self): + raise RuntimeError( + "Serialization of parametrized modules is only " + "supported through state_dict(). 
See:\n" + "https://pytorch.org/tutorials/beginner/saving_loading_models.html" + "#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training" + ) + + dct = {"__getstate__": getstate} + # We don't allow serialization of parametrized modules but should still allow deepcopying. + # Default 'deepcopy' function invokes __deepcopy__ method instead of __getstate__ when it exists. + if not hasattr(cls, "__deepcopy__"): + dct["__deepcopy__"] = default_deepcopy # type: ignore[assignment] + + param_cls = type( + f"Parametrized{cls.__name__}", + (cls,), + dct, + ) + + module.__class__ = param_cls + + +def _inject_property(module: Module, tensor_name: str) -> None: + r"""Injects a property into module[tensor_name]. + + It assumes that the class in the module has already been modified from its + original one using _inject_new_class and that the tensor under :attr:`tensor_name` + has already been moved out + + Args: + module (nn.Module): module into which to inject the property + tensor_name (str): name of the name of the property to create + """ + # We check the precondition. + # This should never fire if register_parametrization is correctly implemented + assert not hasattr(module, tensor_name) + + @torch.jit.unused + def get_cached_parametrization(parametrization) -> Tensor: + global _cache + key = (id(module), tensor_name) + tensor = _cache.get(key) + if tensor is None: + tensor = parametrization() + _cache[key] = tensor + return tensor + + def get_parametrized(self) -> Tensor: + if torch.jit.is_scripting(): + raise RuntimeError("Parametrization is not working with scripting.") + parametrization = self.parametrizations[tensor_name] + # pyrefly: ignore [redundant-condition] + if _cache_enabled: + if torch.jit.is_scripting(): + # Scripting + raise RuntimeError( + "Caching is not implemented for scripting. " + "Either disable caching or avoid scripting." 
+ ) + elif torch._C._get_tracing_state() is not None: + # Tracing + raise RuntimeError( + "Cannot trace a model while caching parametrizations." + ) + else: + return get_cached_parametrization(parametrization) + else: + # If caching is not active, this function just evaluates the parametrization + return parametrization() + + def set_original(self, value: Tensor) -> None: + if torch.jit.is_scripting(): + raise RuntimeError("Parametrization is not working with scripting.") + self.parametrizations[tensor_name].right_inverse(value) + + setattr(module.__class__, tensor_name, property(get_parametrized, set_original)) + + +def register_parametrization( + module: Module, + tensor_name: str, + parametrization: Module, + *, + unsafe: bool = False, +) -> Module: + r"""Register a parametrization to a tensor in a module. + + Assume that ``tensor_name="weight"`` for simplicity. When accessing ``module.weight``, + the module will return the parametrized version ``parametrization(module.weight)``. + If the original tensor requires a gradient, the backward pass will differentiate + through :attr:`parametrization`, and the optimizer will update the tensor accordingly. + + The first time that a module registers a parametrization, this function will add an attribute + ``parametrizations`` to the module of type :class:`~ParametrizationList`. + + The list of parametrizations on the tensor ``weight`` will be accessible under + ``module.parametrizations.weight``. + + The original tensor will be accessible under + ``module.parametrizations.weight.original``. + + Parametrizations may be concatenated by registering several parametrizations + on the same attribute. + + The training mode of a registered parametrization is updated on registration + to match the training mode of the host module + + Parametrized parameters and buffers have an inbuilt caching system that can be activated + using the context manager :func:`cached`. 
+ + A :attr:`parametrization` may optionally implement a method with signature + + .. code-block:: python + + def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]] + + This method is called on the unparametrized tensor when the first parametrization + is registered to compute the initial value of the original tensor. + If this method is not implemented, the original tensor will be just the unparametrized tensor. + + If all the parametrizations registered on a tensor implement `right_inverse` it is possible + to initialize a parametrized tensor by assigning to it, as shown in the example below. + + It is possible for the first parametrization to depend on several inputs. + This may be implemented returning a tuple of tensors from ``right_inverse`` + (see the example implementation of a ``RankOne`` parametrization below). + + In this case, the unconstrained tensors are also located under ``module.parametrizations.weight`` + with names ``original0``, ``original1``,... + + .. note:: + + If unsafe=False (default) both the forward and right_inverse methods will be called + once to perform a number of consistency checks. + If unsafe=True, then right_inverse will be called if the tensor is not parametrized, + and nothing will be called otherwise. + + .. note:: + + In most situations, ``right_inverse`` will be a function such that + ``forward(right_inverse(X)) == X`` (see + `right inverse `_). + Sometimes, when the parametrization is not surjective, it may be reasonable + to relax this. + + .. warning:: + + If a parametrization depends on several inputs, :func:`~register_parametrization` + will register a number of new parameters. If such parametrization is registered + after the optimizer is created, these new parameters will need to be added manually + to the optimizer. See :meth:`torch.Optimizer.add_param_group`. 
+ + Args: + module (nn.Module): module on which to register the parametrization + tensor_name (str): name of the parameter or buffer on which to register + the parametrization + parametrization (nn.Module): the parametrization to register + Keyword args: + unsafe (bool): a boolean flag that denotes whether the parametrization + may change the dtype and shape of the tensor. Default: `False` + Warning: the parametrization is not checked for consistency upon registration. + Enable this flag at your own risk. + + Raises: + ValueError: if the module does not have a parameter or a buffer named :attr:`tensor_name` + + Examples: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK) + >>> import torch + >>> import torch.nn as nn + >>> import torch.nn.utils.parametrize as P + >>> + >>> class Symmetric(nn.Module): + >>> def forward(self, X): + >>> return X.triu() + X.triu(1).T # Return a symmetric matrix + >>> + >>> def right_inverse(self, A): + >>> return A.triu() + >>> + >>> m = nn.Linear(5, 5) + >>> P.register_parametrization(m, "weight", Symmetric()) + >>> print(torch.allclose(m.weight, m.weight.T)) # m.weight is now symmetric + True + >>> A = torch.rand(5, 5) + >>> A = A + A.T # A is now symmetric + >>> m.weight = A # Initialize the weight to be the symmetric matrix A + >>> print(torch.allclose(m.weight, A)) + True + + >>> class RankOne(nn.Module): + >>> def forward(self, x, y): + >>> # Form a rank 1 matrix multiplying two vectors + >>> return x.unsqueeze(-1) @ y.unsqueeze(-2) + >>> + >>> def right_inverse(self, Z): + >>> # Project Z onto the rank 1 matrices + >>> U, S, Vh = torch.linalg.svd(Z, full_matrices=False) + >>> # Return rescaled singular vectors + >>> s0_sqrt = S[0].sqrt().unsqueeze(-1) + >>> return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt + >>> + >>> linear_rank_one = P.register_parametrization( + ... nn.Linear(4, 4), "weight", RankOne() + ... 
) + >>> print(torch.linalg.matrix_rank(linear_rank_one.weight).item()) + 1 + + """ + parametrization.train(module.training) + if is_parametrized(module, tensor_name): + # Correctness checks. + # If A is the space of tensors with shape and dtype equal to module.weight + # we check that parametrization.forward and parametrization.right_inverse are + # functions from A to A + if not unsafe: + Y = getattr(module, tensor_name) + X = parametrization(Y) + if not isinstance(X, Tensor): + raise ValueError( + f"A parametrization must return a tensor. Got {type(X).__name__}." + ) + if X.dtype != Y.dtype: + raise ValueError( + "Registering a parametrization may not change the dtype of the tensor, unless the `unsafe` flag is enabled.\n" + f"module.{tensor_name}.dtype: {Y.dtype}\n" + f"parametrization(module.{tensor_name}).dtype: {X.dtype}" + ) + if X.shape != Y.shape: + raise ValueError( + "Registering a parametrization may not change the shape of the tensor, unless the `unsafe` flag is enabled.\n" + f"module.{tensor_name}.shape: {Y.shape}\n" + f"parametrization(module.{tensor_name}).shape: {X.shape}" + ) + if hasattr(parametrization, "right_inverse"): + try: + Z = parametrization.right_inverse(X) # type: ignore[operator] + except NotImplementedError: + pass + else: + if not isinstance(Z, Tensor): + raise ValueError( + f"parametrization.right_inverse must return a tensor. 
Got: {type(Z).__name__}" + ) + if Z.dtype != Y.dtype: + raise ValueError( + "The tensor returned by parametrization.right_inverse must have the same dtype " + f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n" + f"module.{tensor_name}.dtype: {Y.dtype}\n" + f"returned dtype: {Z.dtype}" + ) + if Z.shape != Y.shape: + raise ValueError( + "The tensor returned by parametrization.right_inverse must have the same shape " + f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n" + f"module.{tensor_name}.shape: {Y.shape}\n" + f"returned shape: {Z.shape}" + ) + # else right_inverse is assumed to be the identity + + # add the new parametrization to the parametrization list + assert isinstance(module.parametrizations, ModuleDict) # Make mypy happy + module.parametrizations[tensor_name].append(parametrization) # type: ignore[operator] + # If unsafe was True in previous parametrization, keep it enabled + module.parametrizations[tensor_name].unsafe |= unsafe # type: ignore[index, union-attr, operator] + elif tensor_name in module._buffers or tensor_name in module._parameters: + # Set the parametrization mechanism + # Fetch the original buffer or parameter + original = getattr(module, tensor_name) + # We create this early to check for possible errors + parametrizations = ParametrizationList( + [parametrization], original, unsafe=unsafe + ) + # Delete the previous parameter or buffer + delattr(module, tensor_name) + # If this is the first parametrization registered on the module, + # we prepare the module to inject the property + if not is_parametrized(module): + # Change the class + _inject_new_class(module) + # Inject a ``ModuleDict`` into the instance under module.parametrizations + module.parametrizations = ModuleDict() + # Add a property into the class + _inject_property(module, tensor_name) + # Add a ParametrizationList + assert isinstance(module.parametrizations, ModuleDict) # Make mypy happy + module.parametrizations[tensor_name] = 
parametrizations + else: + raise ValueError( + f"Module '{module}' does not have a parameter, a buffer, or a " + f"parametrized element with name '{tensor_name}'" + ) + return module + + +def is_parametrized(module: Module, tensor_name: str | None = None) -> bool: + r"""Determine if a module has a parametrization. + + Args: + module (nn.Module): module to query + tensor_name (str, optional): name of the parameter in the module + Default: ``None`` + Returns: + ``True`` if :attr:`module` has a parametrization for the parameter named :attr:`tensor_name`, + or if it has any parametrization when :attr:`tensor_name` is ``None``; + otherwise ``False`` + """ + parametrizations = getattr(module, "parametrizations", None) + if parametrizations is None or not isinstance(parametrizations, ModuleDict): + return False + if tensor_name is None: + # Check that there is at least one parametrized buffer or Parameter + return len(parametrizations) > 0 + else: + return tensor_name in parametrizations + + +def remove_parametrizations( + module: Module, + tensor_name: str, + leave_parametrized: bool = True, +) -> Module: + r"""Remove the parametrizations on a tensor in a module. + + - If ``leave_parametrized=True``, ``module[tensor_name]`` will be set to + its current output. In this case, the parametrization shall not change the ``dtype`` + of the tensor. + - If ``leave_parametrized=False``, ``module[tensor_name]`` will be set to + the unparametrised tensor in ``module.parametrizations[tensor_name].original``. + This is only possible when the parametrization depends on just one tensor. + + Args: + module (nn.Module): module from which remove the parametrization + tensor_name (str): name of the parametrization to be removed + leave_parametrized (bool, optional): leave the attribute :attr:`tensor_name` parametrized. 
+ Default: ``True`` + + Returns: + Module: module + + Raises: + ValueError: if ``module[tensor_name]`` is not parametrized + ValueError: if ``leave_parametrized=False`` and the parametrization depends on several tensors + """ + if not is_parametrized(module, tensor_name): + raise ValueError( + f"Module {module} does not have a parametrization on {tensor_name}" + ) + + # Fetch the original tensor + assert isinstance(module.parametrizations, ModuleDict) # Make mypy happy + parametrizations = module.parametrizations[tensor_name] + # pyrefly: ignore [invalid-argument] + if parametrizations.is_tensor: + original = parametrizations.original + assert isinstance(original, torch.Tensor), "is_tensor promised us a Tensor" + if leave_parametrized: + with torch.no_grad(): + t = getattr(module, tensor_name) + # We know they have the same dtype because we have checked this when registering the + # parametrizations. As such, we can use set_ + # We do this so that the parameter does not to change the id() + # This way the user does not need to update the optimizer + with torch.no_grad(): + if type(original) is torch.Tensor: + _maybe_set(original, t) + else: + try: + _maybe_set(original, t) + except RuntimeError as e: + # TODO: Fix this for tensor subclasses that are parameters: + # RuntimeError: set_storage is not allowed on a Tensor created from .data or .detach(). + raise RuntimeError( + "Calling remove_parametrizations() with leave_parametrized=True " + "for a parameter that is an instance of a tensor subclass requires " + "set_() to be implemented correctly for the tensor subclass." + "Alternatively, one can opt into the swap_tensors path" + "Either set leave_parametrized=False or provide a working implementation" + "for set_() in the tensor subclass or set " + "torch.__future__.set_swap_module_params_on_conversion(True)." 
+ ) from e + else: + if leave_parametrized: + # We cannot use no_grad because we need to know whether one or more + # original tensors required grad + t = getattr(module, tensor_name) + # We'll have to trust the user to add it to the optimizer + original = Parameter(t) if t.requires_grad else t + else: + raise ValueError( + "Cannot leave unparametrized (`leave_parametrized=False`) a tensor " + "that is parametrized in terms of a sequence of tensors." + ) + + # Delete the property that manages the parametrization + delattr(module.__class__, tensor_name) + # Delete the ParametrizationList + del module.parametrizations[tensor_name] + + # Restore the parameter / buffer into the main class + _register_parameter_or_buffer(module, tensor_name, original) + + # Roll back the parametrized class if no other buffer or parameter + # is currently parametrized in this class + if not is_parametrized(module): + delattr(module, "parametrizations") + # Restore class + orig_cls = module.__class__.__bases__[0] + module.__class__ = orig_cls + return module + + +def type_before_parametrizations(module: Module) -> type: + r"""Return the module type before parametrizations were applied and if not, then it returns the module type. + + Args: + module (nn.Module): module to get type of + """ + if is_parametrized(module): + return module.__class__.__bases__[0] + else: + return type(module) + + +def transfer_parametrizations_and_params( + from_module: Module, + to_module: Module, + tensor_name: str | None = None, +) -> Module: + r"""Transfer parametrizations and the parameters they parametrize from :attr:`from_module` to :attr:`to_module`. + + If :attr:`tensor_name` is specified, only transfers the specified parameter, otherwise + transfers all parametrized parameters. If those parameters do not exist in to_module, it will create them. + Does nothing if from_module is not parametrized. 
+ + Args: + from_module (nn.Module): module to transfer from + to_module (nn.Module): module to transfer to + tensor_name (str, optional): parameter to transfer + + Returns: + Module: to_module + """ + if is_parametrized(from_module): + assert isinstance(from_module.parametrizations, ModuleDict) # for mypy + + # get list of all params or the single param to transfer + parameters_to_transfer: list | ModuleDict = ( + from_module.parametrizations if tensor_name is None else [tensor_name] + ) + + assert hasattr(parameters_to_transfer, "__iter__") # for mypy + for parameter_name in parameters_to_transfer: + # initialize the to-be-transferred param in to_module if it doesn't exist already + if not hasattr(to_module, parameter_name): + setattr( + to_module, + parameter_name, + Parameter(getattr(from_module, parameter_name)), + ) + + # apply the params's parametrizations to to_module + for param_func in from_module.parametrizations[ # type: ignore[attr-defined] + parameter_name + ]: + register_parametrization(to_module, parameter_name, param_func) + assert isinstance(to_module.parametrizations, ModuleDict) # for mypy + + # make values match, original values can be stored in either original or + # original0, original1..., need to check both cases + if hasattr(from_module.parametrizations[parameter_name], "original"): + to_module.parametrizations[ + parameter_name + ].original = from_module.parametrizations[parameter_name].original + else: + num = 0 + orig_num = "original" + str(num) + # loop through each original# until all values have been set + while hasattr(from_module.parametrizations[parameter_name], orig_num): + setattr( + to_module.parametrizations[parameter_name], + orig_num, + getattr(from_module.parametrizations[parameter_name], orig_num), + ) + num = num + 1 + orig_num = "original" + str(num) + + return to_module diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/prune.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/prune.py new file mode 100644 index 0000000000000000000000000000000000000000..827bf19ed4bea00723e38d2ca60dcf14cc3abbc2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/prune.py @@ -0,0 +1,1385 @@ +# mypy: allow-untyped-defs +r"""Pruning methods.""" + +import numbers +from abc import ABC, abstractmethod +from collections.abc import Iterable + +import torch + + +class BasePruningMethod(ABC): + r"""Abstract base class for creation of new pruning techniques. + + Provides a skeleton for customization requiring the overriding of methods + such as :meth:`compute_mask` and :meth:`apply`. + """ + + _tensor_name: str + + def __call__(self, module, inputs): + r"""Multiply the mask into original tensor and store the result. + + Multiplies the mask (stored in ``module[name + '_mask']``) + into the original tensor (stored in ``module[name + '_orig']``) + and stores the result into ``module[name]`` by using :meth:`apply_mask`. + + Args: + module (nn.Module): module containing the tensor to prune + inputs: not used. + """ + setattr(module, self._tensor_name, self.apply_mask(module)) + + @abstractmethod + def compute_mask(self, t, default_mask): + r"""Compute and returns a mask for the input tensor ``t``. + + Starting from a base ``default_mask`` (which should be a mask of ones + if the tensor has not been pruned yet), generate a random mask to + apply on top of the ``default_mask`` according to the specific pruning + method recipe. + + Args: + t (torch.Tensor): tensor representing the importance scores of the + parameter to prune. + default_mask (torch.Tensor): Base mask from previous pruning + iterations, that need to be respected after the new mask is + applied. Same dims as ``t``. 
+ + Returns: + mask (torch.Tensor): mask to apply to ``t``, of same dims as ``t`` + """ + + def apply_mask(self, module): + r"""Simply handles the multiplication between the parameter being pruned and the generated mask. + + Fetches the mask and the original tensor from the module + and returns the pruned version of the tensor. + + Args: + module (nn.Module): module containing the tensor to prune + + Returns: + pruned_tensor (torch.Tensor): pruned version of the input tensor + """ + # to carry out the multiplication, the mask needs to have been computed, + # so the pruning method must know what tensor it's operating on + assert self._tensor_name is not None, ( + f"Module {module} has to be pruned" + ) # this gets set in apply() + mask = getattr(module, self._tensor_name + "_mask") + orig = getattr(module, self._tensor_name + "_orig") + pruned_tensor = mask.to(dtype=orig.dtype) * orig + return pruned_tensor + + @classmethod + def apply(cls, module, name, *args, importance_scores=None, **kwargs): + r"""Add pruning on the fly and reparametrization of a tensor. + + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + args: arguments passed on to a subclass of + :class:`BasePruningMethod` + importance_scores (torch.Tensor): tensor of importance scores (of + same shape as module parameter) used to compute mask for pruning. + The values in this tensor indicate the importance of the + corresponding elements in the parameter being pruned. + If unspecified or None, the parameter will be used in its place. 
+ kwargs: keyword arguments passed on to a subclass of a + :class:`BasePruningMethod` + """ + + def _get_composite_method(cls, module, name, *args, **kwargs): + # Check if a pruning method has already been applied to + # `module[name]`. If so, store that in `old_method`. + old_method = None + found = 0 + # there should technically be only 1 hook with hook.name == name + # assert this using `found` + hooks_to_remove = [] + for k, hook in module._forward_pre_hooks.items(): + # if it exists, take existing thing, remove hook, then + # go through normal thing + if isinstance(hook, BasePruningMethod) and hook._tensor_name == name: + old_method = hook + hooks_to_remove.append(k) + found += 1 + assert found <= 1, ( + f"Avoid adding multiple pruning hooks to the\ + same tensor {name} of module {module}. Use a PruningContainer." + ) + + for k in hooks_to_remove: + del module._forward_pre_hooks[k] + + # Apply the new pruning method, either from scratch or on top of + # the previous one. + method = cls(*args, **kwargs) # new pruning + # Have the pruning method remember what tensor it's been applied to + method._tensor_name = name + + # combine `methods` with `old_method`, if `old_method` exists + if old_method is not None: # meaning that there was a hook + # if the hook is already a pruning container, just add the + # new pruning method to the container + if isinstance(old_method, PruningContainer): + old_method.add_pruning_method(method) + method = old_method # rename old_method --> method + + # if the hook is simply a single pruning method, create a + # container, add the old pruning method and the new one + elif isinstance(old_method, BasePruningMethod): + container = PruningContainer(old_method) + # Have the pruning method remember the name of its tensor + # setattr(container, '_tensor_name', name) + container.add_pruning_method(method) + method = container # rename container --> method + return method + + method = _get_composite_method(cls, module, name, *args, **kwargs) 
+ # at this point we have no forward_pre_hooks but we could have an + # active reparameterization of the tensor if another pruning method + # had been applied (in which case `method` would be a PruningContainer + # and not a simple pruning method). + + # Pruning is to be applied to the module's tensor named `name`, + # starting from the state it is found in prior to this iteration of + # pruning. The pruning mask is calculated based on importances scores. + + orig = getattr(module, name) + if importance_scores is not None: + assert importance_scores.shape == orig.shape, ( + f"importance_scores should have the same shape as parameter {name} of {module}" + ) + else: + importance_scores = orig + + # If this is the first time pruning is applied, take care of moving + # the original tensor to a new parameter called name + '_orig' and + # and deleting the original parameter + if not isinstance(method, PruningContainer): + # copy `module[name]` to `module[name + '_orig']` + module.register_parameter(name + "_orig", orig) + # temporarily delete `module[name]` + del module._parameters[name] + default_mask = torch.ones_like(orig) # temp + # If this is not the first time pruning is applied, all of the above + # has been done before in a previous pruning iteration, so we're good + # to go + else: + default_mask = ( + getattr(module, name + "_mask") + .detach() + .clone(memory_format=torch.contiguous_format) + ) + + # Use try/except because if anything goes wrong with the mask + # computation etc., you'd want to roll back. + try: + # get the final mask, computed according to the specific method + mask = method.compute_mask(importance_scores, default_mask=default_mask) + # reparameterize by saving mask to `module[name + '_mask']`... + module.register_buffer(name + "_mask", mask) + # ... 
and the new pruned tensor to `module[name]` + setattr(module, name, method.apply_mask(module)) + # associate the pruning method to the module via a hook to + # compute the function before every forward() (compile by run) + module.register_forward_pre_hook(method) + + except Exception as e: + if not isinstance(method, PruningContainer): + orig = getattr(module, name + "_orig") + module.register_parameter(name, orig) + del module._parameters[name + "_orig"] + raise e + + return method + + def prune(self, t, default_mask=None, importance_scores=None): + r"""Compute and returns a pruned version of input tensor ``t``. + + According to the pruning rule specified in :meth:`compute_mask`. + + Args: + t (torch.Tensor): tensor to prune (of same dimensions as + ``default_mask``). + importance_scores (torch.Tensor): tensor of importance scores (of + same shape as ``t``) used to compute mask for pruning ``t``. + The values in this tensor indicate the importance of the + corresponding elements in the ``t`` that is being pruned. + If unspecified or None, the tensor ``t`` will be used in its place. + default_mask (torch.Tensor, optional): mask from previous pruning + iteration, if any. To be considered when determining what + portion of the tensor that pruning should act on. If None, + default to a mask of ones. + + Returns: + pruned version of tensor ``t``. + """ + if importance_scores is not None: + assert importance_scores.shape == t.shape, ( + "importance_scores should have the same shape as tensor t" + ) + else: + importance_scores = t + default_mask = default_mask if default_mask is not None else torch.ones_like(t) + return t * self.compute_mask(importance_scores, default_mask=default_mask) + + def remove(self, module) -> None: + r"""Remove the pruning reparameterization from a module. + + The pruned parameter named ``name`` remains permanently pruned, + and the parameter named ``name+'_orig'`` is removed from the parameter list. 
+ Similarly, the buffer named ``name+'_mask'`` is removed from the buffers. + + Note: + Pruning itself is NOT undone or reversed! + """ + # before removing pruning from a tensor, it has to have been applied + assert self._tensor_name is not None, ( + f"Module {module} has to be pruned before pruning can be removed" + ) # this gets set in apply() + + # to update module[name] to latest trained weights + weight = self.apply_mask(module) # masked weights + + # delete and reset + if hasattr(module, self._tensor_name): + delattr(module, self._tensor_name) + orig = module._parameters[self._tensor_name + "_orig"] + orig.data = weight.data + del module._parameters[self._tensor_name + "_orig"] + del module._buffers[self._tensor_name + "_mask"] + setattr(module, self._tensor_name, orig) + + +class PruningContainer(BasePruningMethod): + """Container holding a sequence of pruning methods for iterative pruning. + + Keeps track of the order in which pruning methods are applied and handles + combining successive pruning calls. + + Accepts as argument an instance of a BasePruningMethod or an iterable of + them. + """ + + def __init__(self, *args) -> None: + self._pruning_methods: tuple[BasePruningMethod, ...] = () + if not isinstance(args, Iterable): # only 1 item + self._tensor_name = args._tensor_name + self.add_pruning_method(args) + # pyrefly: ignore [bad-argument-type] + elif len(args) == 1: # only 1 item in a tuple + # pyrefly: ignore [index-error] + self._tensor_name = args[0]._tensor_name + # pyrefly: ignore [index-error] + self.add_pruning_method(args[0]) + else: # manual construction from list or other iterable (or no args) + for method in args: + self.add_pruning_method(method) + + def add_pruning_method(self, method) -> None: + r"""Add a child pruning ``method`` to the container. + + Args: + method (subclass of BasePruningMethod): child pruning method + to be added to the container. 
+ """ + # check that we're adding a pruning method to the container + if not isinstance(method, BasePruningMethod) and method is not None: + raise TypeError(f"{type(method)} is not a BasePruningMethod subclass") + elif method is not None and self._tensor_name != method._tensor_name: + raise ValueError( + "Can only add pruning methods acting on " + f"the parameter named '{self._tensor_name}' to PruningContainer {self}." + + f" Found '{method._tensor_name}'" + ) + # if all checks passed, add to _pruning_methods tuple + self._pruning_methods += (method,) # type: ignore[operator] + + def __len__(self) -> int: + return len(self._pruning_methods) + + def __iter__(self): + return iter(self._pruning_methods) + + def __getitem__(self, idx): + return self._pruning_methods[idx] + + def compute_mask(self, t, default_mask): + r"""Apply the latest ``method`` by computing the new partial masks and returning its combination with the ``default_mask``. + + The new partial mask should be computed on the entries or channels + that were not zeroed out by the ``default_mask``. + Which portions of the tensor ``t`` the new mask will be calculated from + depends on the ``PRUNING_TYPE`` (handled by the type handler): + + * for 'unstructured', the mask will be computed from the raveled + list of nonmasked entries; + + * for 'structured', the mask will be computed from the nonmasked + channels in the tensor; + + * for 'global', the mask will be computed across all entries. + + Args: + t (torch.Tensor): tensor representing the parameter to prune + (of same dimensions as ``default_mask``). + default_mask (torch.Tensor): mask from previous pruning iteration. + + Returns: + mask (torch.Tensor): new mask that combines the effects + of the ``default_mask`` and the new mask from the current + pruning ``method`` (of same dimensions as ``default_mask`` and + ``t``). + """ + + def _combine_masks(method, t, mask): + r"""Combine the masks from all pruning methods and returns a new mask. 
+ + Args: + method (a BasePruningMethod subclass): pruning method + currently being applied. + t (torch.Tensor): tensor representing the parameter to prune + (of same dimensions as mask). + mask (torch.Tensor): mask from previous pruning iteration + + Returns: + new_mask (torch.Tensor): new mask that combines the effects + of the old mask and the new mask from the current + pruning method (of same dimensions as mask and t). + """ + new_mask = mask # start off from existing mask + new_mask = new_mask.to(dtype=t.dtype) + + # compute a slice of t onto which the new pruning method will operate + if method.PRUNING_TYPE == "unstructured": + # prune entries of t where the mask is 1 + slc = mask == 1 + + # for struct pruning, exclude channels that have already been + # entirely pruned + elif method.PRUNING_TYPE == "structured": + if not hasattr(method, "dim"): + raise AttributeError( + "Pruning methods of PRUNING_TYPE " + '"structured" need to have the attribute `dim` defined.' + ) + + # find the channels to keep by removing the ones that have been + # zeroed out already (i.e. where sum(entries) == 0) + n_dims = t.dim() # "is this a 2D tensor? 3D? ..." + dim = method.dim + # convert negative indexing + if dim < 0: + dim = n_dims + dim + # if dim is still negative after subtracting it from n_dims + if dim < 0: + raise IndexError( + f"Index is out of bounds for tensor with dimensions {n_dims}" + ) + # find channels along dim = dim that aren't already tots 0ed out + keep_channel = mask.sum(dim=[d for d in range(n_dims) if d != dim]) != 0 + # create slice to identify what to prune + slc = [slice(None)] * n_dims + slc[dim] = keep_channel + + elif method.PRUNING_TYPE == "global": + n_dims = len(t.shape) # "is this a 2D tensor? 3D? ..." 
+ slc = [slice(None)] * n_dims + + else: + raise ValueError(f"Unrecognized PRUNING_TYPE {method.PRUNING_TYPE}") + + # compute the new mask on the unpruned slice of the tensor t + if isinstance(slc, list): + slc = tuple(slc) + partial_mask = method.compute_mask(t[slc], default_mask=mask[slc]) + new_mask[slc] = partial_mask.to(dtype=new_mask.dtype) + + return new_mask + + method = self._pruning_methods[-1] + mask = _combine_masks(method, t, default_mask) + return mask + + +class Identity(BasePruningMethod): + r"""Utility pruning method that does not prune any units but generates the pruning parametrization with a mask of ones.""" + + PRUNING_TYPE = "unstructured" + + def compute_mask(self, t, default_mask): + mask = default_mask + return mask + + @classmethod + def apply(cls, module, name): # type: ignore[override] + r"""Add pruning on the fly and reparametrization of a tensor. + + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + """ + return super().apply(module, name) + + +class RandomUnstructured(BasePruningMethod): + r"""Prune (currently unpruned) units in a tensor at random. + + Args: + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. 
+ """ + + PRUNING_TYPE = "unstructured" + + def __init__(self, amount) -> None: + # Check range of validity of pruning amount + _validate_pruning_amount_init(amount) + self.amount = amount + + def compute_mask(self, t, default_mask): + # Check that the amount of units to prune is not > than the number of + # parameters in t + tensor_size = t.nelement() + # Compute number of units to prune: amount if int, + # else amount * tensor_size + nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) + # This should raise an error if the number of units to prune is larger + # than the number of units in the tensor + _validate_pruning_amount(nparams_toprune, tensor_size) + + mask = default_mask.clone(memory_format=torch.contiguous_format) + + if nparams_toprune != 0: # k=0 not supported by torch.kthvalue + prob = torch.rand_like(t) + topk = torch.topk(prob.view(-1), k=nparams_toprune) + mask.view(-1)[topk.indices] = 0 + + return mask + + @classmethod + def apply(cls, module, name, amount): # type: ignore[override] + r"""Add pruning on the fly and reparametrization of a tensor. + + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + """ + return super().apply(module, name, amount=amount) + + +class L1Unstructured(BasePruningMethod): + r"""Prune (currently unpruned) units in a tensor by zeroing out the ones with the lowest L1-norm. + + Args: + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. 
If ``int``, it represents the + absolute number of parameters to prune. + """ + + PRUNING_TYPE = "unstructured" + + def __init__(self, amount) -> None: + # Check range of validity of pruning amount + _validate_pruning_amount_init(amount) + self.amount = amount + + def compute_mask(self, t, default_mask): + # Check that the amount of units to prune is not > than the number of + # parameters in t + tensor_size = t.nelement() + # Compute number of units to prune: amount if int, + # else amount * tensor_size + nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) + # This should raise an error if the number of units to prune is larger + # than the number of units in the tensor + _validate_pruning_amount(nparams_toprune, tensor_size) + + mask = default_mask.clone(memory_format=torch.contiguous_format) + + if nparams_toprune != 0: # k=0 not supported by torch.kthvalue + # largest=True --> top k; largest=False --> bottom k + # Prune the smallest k + topk = torch.topk(torch.abs(t).view(-1), k=nparams_toprune, largest=False) + # topk will have .indices and .values + mask.view(-1)[topk.indices] = 0 + + return mask + + @classmethod + def apply(cls, module, name, amount, importance_scores=None): # type: ignore[override] + r"""Add pruning on the fly and reparametrization of a tensor. + + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + importance_scores (torch.Tensor): tensor of importance scores (of same + shape as module parameter) used to compute mask for pruning. 
+ The values in this tensor indicate the importance of the corresponding + elements in the parameter being pruned. + If unspecified or None, the module parameter will be used in its place. + """ + return super().apply( + module, name, amount=amount, importance_scores=importance_scores + ) + + +class RandomStructured(BasePruningMethod): + r"""Prune entire (currently unpruned) channels in a tensor at random. + + Args: + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + dim (int, optional): index of the dim along which we define + channels to prune. Default: -1. + """ + + PRUNING_TYPE = "structured" + + def __init__(self, amount, dim=-1) -> None: + # Check range of validity of amount + _validate_pruning_amount_init(amount) + self.amount = amount + self.dim = dim + + def compute_mask(self, t, default_mask): + r"""Compute and returns a mask for the input tensor ``t``. + + Starting from a base ``default_mask`` (which should be a mask of ones + if the tensor has not been pruned yet), generate a random mask to + apply on top of the ``default_mask`` by randomly zeroing out channels + along the specified dim of the tensor. + + Args: + t (torch.Tensor): tensor representing the parameter to prune + default_mask (torch.Tensor): Base mask from previous pruning + iterations, that need to be respected after the new mask is + applied. Same dims as ``t``. + + Returns: + mask (torch.Tensor): mask to apply to ``t``, of same dims as ``t`` + + Raises: + IndexError: if ``self.dim >= len(t.shape)`` + """ + # Check that tensor has structure (i.e. 
more than 1 dimension) such + # that the concept of "channels" makes sense + _validate_structured_pruning(t) + + # Check that self.dim is a valid dim to index t, else raise IndexError + _validate_pruning_dim(t, self.dim) + + # Check that the amount of channels to prune is not > than the number of + # channels in t along the dim to prune + tensor_size = t.shape[self.dim] + # Compute number of units to prune: amount if int, + # else amount * tensor_size + nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) + # This should raise an error if the number of units to prune is larger + # than the number of units in the tensor + _validate_pruning_amount(nparams_toprune, tensor_size) + + # Compute binary mask by initializing it to all 0s and then filling in + # 1s wherever topk.indices indicates, along self.dim. + # mask has the same shape as tensor t + def make_mask(t, dim, nchannels, nchannels_toprune): + # generate a random number in [0, 1] to associate to each channel + prob = torch.rand(nchannels) + # generate mask for each channel by 0ing out the channels that + # got assigned the k = nchannels_toprune lowest values in prob + threshold = torch.kthvalue(prob, k=nchannels_toprune).values + channel_mask = prob > threshold + + mask = torch.zeros_like(t) + slc = [slice(None)] * len(t.shape) + slc[dim] = channel_mask + slc = tuple(slc) + mask[slc] = 1 + return mask + + if nparams_toprune == 0: # k=0 not supported by torch.kthvalue + mask = default_mask + else: + # apply the new structured mask on top of prior (potentially + # unstructured) mask + mask = make_mask(t, self.dim, tensor_size, nparams_toprune) + mask *= default_mask.to(dtype=mask.dtype) + return mask + + @classmethod + def apply(cls, module, name, amount, dim=-1): # type: ignore[override] + r"""Add pruning on the fly and reparametrization of a tensor. 
+ + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + dim (int, optional): index of the dim along which we define + channels to prune. Default: -1. + """ + return super().apply(module, name, amount=amount, dim=dim) + + +class LnStructured(BasePruningMethod): + r"""Prune entire (currently unpruned) channels in a tensor based on their L\ ``n``-norm. + + Args: + amount (int or float): quantity of channels to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid + entries for argument ``p`` in :func:`torch.norm`. + dim (int, optional): index of the dim along which we define + channels to prune. Default: -1. + """ + + PRUNING_TYPE = "structured" + + def __init__(self, amount, n, dim=-1) -> None: + # Check range of validity of amount + _validate_pruning_amount_init(amount) + self.amount = amount + self.n = n + self.dim = dim + + def compute_mask(self, t, default_mask): + r"""Compute and returns a mask for the input tensor ``t``. + + Starting from a base ``default_mask`` (which should be a mask of ones + if the tensor has not been pruned yet), generate a mask to apply on + top of the ``default_mask`` by zeroing out the channels along the + specified dim with the lowest L\ ``n``-norm. 
+ + Args: + t (torch.Tensor): tensor representing the parameter to prune + default_mask (torch.Tensor): Base mask from previous pruning + iterations, that need to be respected after the new mask is + applied. Same dims as ``t``. + + Returns: + mask (torch.Tensor): mask to apply to ``t``, of same dims as ``t`` + + Raises: + IndexError: if ``self.dim >= len(t.shape)`` + """ + # Check that tensor has structure (i.e. more than 1 dimension) such + # that the concept of "channels" makes sense + _validate_structured_pruning(t) + # Check that self.dim is a valid dim to index t, else raise IndexError + _validate_pruning_dim(t, self.dim) + + # Check that the amount of channels to prune is not > than the number of + # channels in t along the dim to prune + tensor_size = t.shape[self.dim] + # Compute number of units to prune: amount if int, + # else amount * tensor_size + nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) + nparams_tokeep = tensor_size - nparams_toprune + # This should raise an error if the number of units to prune is larger + # than the number of units in the tensor + _validate_pruning_amount(nparams_toprune, tensor_size) + + # Structured pruning prunes entire channels so we need to know the + # L_n norm along each channel to then find the topk based on this + # metric + norm = _compute_norm(t, self.n, self.dim) + # largest=True --> top k; largest=False --> bottom k + # Keep the largest k channels along dim=self.dim + topk = torch.topk(norm, k=nparams_tokeep, largest=True) + # topk will have .indices and .values + + # Compute binary mask by initializing it to all 0s and then filling in + # 1s wherever topk.indices indicates, along self.dim. 
+ # mask has the same shape as tensor t + def make_mask(t, dim, indices): + # init mask to 0 + mask = torch.zeros_like(t) + # e.g.: slc = [None, None, None], if len(t.shape) = 3 + slc = [slice(None)] * len(t.shape) + # replace a None at position=dim with indices + # e.g.: slc = [None, None, [0, 2, 3]] if dim=2 & indices=[0,2,3] + slc[dim] = indices + slc = tuple(slc) + # use slc to slice mask and replace all its entries with 1s + # e.g.: mask[:, :, [0, 2, 3]] = 1 + mask[slc] = 1 + return mask + + if nparams_toprune == 0: # k=0 not supported by torch.kthvalue + mask = default_mask + else: + mask = make_mask(t, self.dim, topk.indices) + mask *= default_mask.to(dtype=mask.dtype) + + return mask + + @classmethod + def apply(cls, module, name, amount, n, dim, importance_scores=None): # type: ignore[override] + r"""Add pruning on the fly and reparametrization of a tensor. + + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid + entries for argument ``p`` in :func:`torch.norm`. + dim (int): index of the dim along which we define channels to + prune. + importance_scores (torch.Tensor): tensor of importance scores (of same + shape as module parameter) used to compute mask for pruning. + The values in this tensor indicate the importance of the corresponding + elements in the parameter being pruned. + If unspecified or None, the module parameter will be used in its place. 
+ """ + return super().apply( + module, + name, + amount=amount, + n=n, + dim=dim, + importance_scores=importance_scores, + ) + + +class CustomFromMask(BasePruningMethod): + PRUNING_TYPE = "global" + + def __init__(self, mask) -> None: + self.mask = mask + + def compute_mask(self, t, default_mask): + assert default_mask.shape == self.mask.shape + mask = default_mask * self.mask.to(dtype=default_mask.dtype) + return mask + + @classmethod + def apply(cls, module, name, mask): # type: ignore[override] + r"""Add pruning on the fly and reparametrization of a tensor. + + Adds the forward pre-hook that enables pruning on the fly and + the reparametrization of a tensor in terms of the original tensor + and the pruning mask. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + """ + return super().apply(module, name, mask=mask) + + +def identity(module, name): + r"""Apply pruning reparametrization without pruning any units. + + Applies pruning reparametrization to the tensor corresponding to the + parameter called ``name`` in ``module`` without actually pruning any + units. Modifies module in place (and also return the modified module) + by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. + + Note: + The mask is a tensor of ones. + + Args: + module (nn.Module): module containing the tensor to prune. + name (str): parameter name within ``module`` on which pruning + will act. + + Returns: + module (nn.Module): modified (i.e. 
pruned) version of the input module + + Examples: + >>> # xdoctest: +SKIP + >>> m = prune.identity(nn.Linear(2, 3), "bias") + >>> print(m.bias_mask) + tensor([1., 1., 1.]) + """ + Identity.apply(module, name) + return module + + +def random_unstructured(module, name, amount): + r"""Prune tensor by removing random (currently unpruned) units. + + Prunes tensor corresponding to parameter called ``name`` in ``module`` + by removing the specified ``amount`` of (currently unpruned) units + selected at random. + Modifies module in place (and also return the modified module) by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + + Returns: + module (nn.Module): modified (i.e. pruned) version of the input module + + Examples: + >>> # xdoctest: +SKIP + >>> m = prune.random_unstructured(nn.Linear(2, 3), "weight", amount=1) + >>> torch.sum(m.weight_mask == 0) + tensor(1) + + """ + RandomUnstructured.apply(module, name, amount) + return module + + +def l1_unstructured(module, name, amount, importance_scores=None): + r"""Prune tensor by removing units with the lowest L1-norm. + + Prunes tensor corresponding to parameter called ``name`` in ``module`` + by removing the specified `amount` of (currently unpruned) units with the + lowest L1-norm. 
+ Modifies module in place (and also return the modified module) + by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + importance_scores (torch.Tensor): tensor of importance scores (of same + shape as module parameter) used to compute mask for pruning. + The values in this tensor indicate the importance of the corresponding + elements in the parameter being pruned. + If unspecified or None, the module parameter will be used in its place. + + Returns: + module (nn.Module): modified (i.e. pruned) version of the input module + + Examples: + >>> # xdoctest: +SKIP + >>> m = prune.l1_unstructured(nn.Linear(2, 3), "weight", amount=0.2) + >>> m.state_dict().keys() + odict_keys(['bias', 'weight_orig', 'weight_mask']) + """ + L1Unstructured.apply( + module, name, amount=amount, importance_scores=importance_scores + ) + return module + + +def random_structured(module, name, amount, dim): + r"""Prune tensor by removing random channels along the specified dimension. + + Prunes tensor corresponding to parameter called ``name`` in ``module`` + by removing the specified ``amount`` of (currently unpruned) channels + along the specified ``dim`` selected at random. 
+ Modifies module in place (and also return the modified module) + by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + dim (int): index of the dim along which we define channels to prune. + + Returns: + module (nn.Module): modified (i.e. pruned) version of the input module + + Examples: + >>> # xdoctest: +SKIP + >>> m = prune.random_structured(nn.Linear(5, 3), "weight", amount=3, dim=1) + >>> columns_pruned = int(sum(torch.sum(m.weight, dim=0) == 0)) + >>> print(columns_pruned) + 3 + """ + RandomStructured.apply(module, name, amount, dim) + return module + + +def ln_structured(module, name, amount, n, dim, importance_scores=None): + r"""Prune tensor by removing channels with the lowest L\ ``n``-norm along the specified dimension. + + Prunes tensor corresponding to parameter called ``name`` in ``module`` + by removing the specified ``amount`` of (currently unpruned) channels + along the specified ``dim`` with the lowest L\ ``n``-norm. + Modifies module in place (and also return the modified module) + by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. 
+ + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + amount (int or float): quantity of parameters to prune. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid + entries for argument ``p`` in :func:`torch.norm`. + dim (int): index of the dim along which we define channels to prune. + importance_scores (torch.Tensor): tensor of importance scores (of same + shape as module parameter) used to compute mask for pruning. + The values in this tensor indicate the importance of the corresponding + elements in the parameter being pruned. + If unspecified or None, the module parameter will be used in its place. + + Returns: + module (nn.Module): modified (i.e. pruned) version of the input module + + Examples: + >>> from torch.nn.utils import prune + >>> m = prune.ln_structured( + ... nn.Conv2d(5, 3, 2), "weight", amount=0.3, dim=1, n=float("-inf") + ... ) + """ + LnStructured.apply( + module, name, amount, n, dim, importance_scores=importance_scores + ) + return module + + +def global_unstructured( + parameters, pruning_method, importance_scores=None, **kwargs +) -> None: + r""" + Globally prunes tensors corresponding to all parameters in ``parameters`` by applying the specified ``pruning_method``. + + Modifies modules in place by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. + + Args: + parameters (Iterable of (module, name) tuples): parameters of + the model to prune in a global fashion, i.e. 
by aggregating all + weights prior to deciding which ones to prune. module must be of + type :class:`nn.Module`, and name must be a string. + pruning_method (function): a valid pruning function from this module, + or a custom one implemented by the user that satisfies the + implementation guidelines and has ``PRUNING_TYPE='unstructured'``. + importance_scores (dict): a dictionary mapping (module, name) tuples to + the corresponding parameter's importance scores tensor. The tensor + should be the same shape as the parameter, and is used for computing + mask for pruning. + If unspecified or None, the parameter will be used in place of its + importance scores. + kwargs: other keyword arguments such as: + amount (int or float): quantity of parameters to prune across the + specified parameters. + If ``float``, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If ``int``, it represents the + absolute number of parameters to prune. + + Raises: + TypeError: if ``PRUNING_TYPE != 'unstructured'`` + + Note: + Since global structured pruning doesn't make much sense unless the + norm is normalized by the size of the parameter, we now limit the + scope of global pruning to unstructured methods. + + Examples: + >>> from torch.nn.utils import prune + >>> from collections import OrderedDict + >>> net = nn.Sequential( + ... OrderedDict( + ... [ + ... ("first", nn.Linear(10, 4)), + ... ("second", nn.Linear(4, 1)), + ... ] + ... ) + ... ) + >>> parameters_to_prune = ( + ... (net.first, "weight"), + ... (net.second, "weight"), + ... ) + >>> prune.global_unstructured( + ... parameters_to_prune, + ... pruning_method=prune.L1Unstructured, + ... amount=10, + ... 
) + >>> print(sum(torch.nn.utils.parameters_to_vector(net.buffers()) == 0)) + tensor(10) + + """ + # ensure parameters is a list or generator of tuples + if not isinstance(parameters, Iterable): + raise TypeError("global_unstructured(): parameters is not an Iterable") + + importance_scores = importance_scores if importance_scores is not None else {} + if not isinstance(importance_scores, dict): + raise TypeError("global_unstructured(): importance_scores must be of type dict") + + # flatten importance scores to consider them all at once in global pruning + relevant_importance_scores = torch.nn.utils.parameters_to_vector( + # pyrefly: ignore [bad-argument-type] + [ + importance_scores.get((module, name), getattr(module, name)) + for (module, name) in parameters + ] + ) + # similarly, flatten the masks (if they exist), or use a flattened vector + # of 1s of the same dimensions as t + default_mask = torch.nn.utils.parameters_to_vector( + [ + getattr(module, name + "_mask", torch.ones_like(getattr(module, name))) + for (module, name) in parameters + ] + ) + + # use the canonical pruning methods to compute the new mask, even if the + # parameter is now a flattened out version of `parameters` + container = PruningContainer() + container._tensor_name = "temp" # to make it match that of `method` + method = pruning_method(**kwargs) + method._tensor_name = "temp" # to make it match that of `container` + if method.PRUNING_TYPE != "unstructured": + raise TypeError( + 'Only "unstructured" PRUNING_TYPE supported for ' + f"the `pruning_method`. 
Found method {pruning_method} of type {method.PRUNING_TYPE}" + ) + + container.add_pruning_method(method) + + # use the `compute_mask` method from `PruningContainer` to combine the + # mask computed by the new method with the pre-existing mask + final_mask = container.compute_mask(relevant_importance_scores, default_mask) + + # Pointer for slicing the mask to match the shape of each parameter + pointer = 0 + for module, name in parameters: + param = getattr(module, name) + # The length of the parameter + num_param = param.numel() + # Slice the mask, reshape it + param_mask = final_mask[pointer : pointer + num_param].view_as(param) + # Assign the correct pre-computed mask to each parameter and add it + # to the forward_pre_hooks like any other pruning method + custom_from_mask(module, name, mask=param_mask) + + # Increment the pointer to continue slicing the final_mask + pointer += num_param + + +def custom_from_mask(module, name, mask): + r"""Prune tensor corresponding to parameter called ``name`` in ``module`` by applying the pre-computed mask in ``mask``. + + Modifies module in place (and also return the modified module) by: + + 1) adding a named buffer called ``name+'_mask'`` corresponding to the + binary mask applied to the parameter ``name`` by the pruning method. + 2) replacing the parameter ``name`` by its pruned version, while the + original (unpruned) parameter is stored in a new parameter named + ``name+'_orig'``. + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + mask (Tensor): binary mask to be applied to the parameter. + + Returns: + module (nn.Module): modified (i.e. pruned) version of the input module + + Examples: + >>> from torch.nn.utils import prune + >>> m = prune.custom_from_mask( + ... nn.Linear(5, 3), name="bias", mask=torch.tensor([0, 1, 0]) + ... 
) + >>> print(m.bias_mask) + tensor([0., 1., 0.]) + + """ + CustomFromMask.apply(module, name, mask) + return module + + +def remove(module, name): + r"""Remove the pruning reparameterization from a module and the pruning method from the forward hook. + + The pruned parameter named ``name`` remains permanently pruned, and the parameter + named ``name+'_orig'`` is removed from the parameter list. Similarly, + the buffer named ``name+'_mask'`` is removed from the buffers. + + Note: + Pruning itself is NOT undone or reversed! + + Args: + module (nn.Module): module containing the tensor to prune + name (str): parameter name within ``module`` on which pruning + will act. + + Examples: + >>> m = random_unstructured(nn.Linear(5, 7), name="weight", amount=0.2) + >>> m = remove(m, name="weight") + """ + for k, hook in module._forward_pre_hooks.items(): + if isinstance(hook, BasePruningMethod) and hook._tensor_name == name: + hook.remove(module) + del module._forward_pre_hooks[k] + return module + + raise ValueError( + f"Parameter '{name}' of module {module} has to be pruned before pruning can be removed" + ) + + +def is_pruned(module) -> bool: + r"""Check if a module is pruned by looking for pruning pre-hooks. + + Check whether ``module`` is pruned by looking for + ``forward_pre_hooks`` in its modules that inherit from the + :class:`BasePruningMethod`. + + Args: + module (nn.Module): object that is either pruned or unpruned + + Returns: + binary answer to whether ``module`` is pruned. 
+ + Examples: + >>> from torch.nn.utils import prune + >>> m = nn.Linear(5, 7) + >>> print(prune.is_pruned(m)) + False + >>> prune.random_unstructured(m, name="weight", amount=0.2) + >>> print(prune.is_pruned(m)) + True + """ + for _, submodule in module.named_modules(): + for hook in submodule._forward_pre_hooks.values(): + if isinstance(hook, BasePruningMethod): + return True + return False + + +def _validate_pruning_amount_init(amount) -> None: + r"""Validate helper to check the range of amount at init. + + Args: + amount (int or float): quantity of parameters to prune. + If float, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If int, it represents the + absolute number of parameters to prune. + + Raises: + ValueError: if amount is a float not in [0, 1], or if it's a negative + integer. + TypeError: if amount is neither a float nor an integer. + + Note: + This does not take into account the number of parameters in the + tensor to be pruned, which is known only at prune. + """ + if not isinstance(amount, numbers.Real): + raise TypeError(f"Invalid type for amount: {amount}. Must be int or float.") + + if (isinstance(amount, numbers.Integral) and amount < 0) or ( + not isinstance(amount, numbers.Integral) # so it's a float + and (float(amount) > 1.0 or float(amount) < 0.0) + ): + raise ValueError( + f"amount={amount} should either be a float in the range [0, 1] or a non-negative integer" + ) + + +def _validate_pruning_amount(amount, tensor_size) -> None: + r"""Validate that the pruning amount is meaningful wrt to the size of the data. + + Validation helper to check that the amount of parameters to prune + is meaningful wrt to the size of the data (`tensor_size`). + + Args: + amount (int or float): quantity of parameters to prune. + If float, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If int, it represents the + absolute number of parameters to prune. 
+ tensor_size (int): absolute number of parameters in the tensor + to prune. + """ + # TODO: consider removing this check and allowing users to specify + # a number of units to prune that is greater than the number of units + # left to prune. In this case, the tensor will just be fully pruned. + + if isinstance(amount, numbers.Integral) and amount > tensor_size: + raise ValueError( + f"amount={amount} should be smaller than the number of parameters to prune={tensor_size}" + ) + + +def _validate_structured_pruning(t) -> None: + r"""Validate that the tensor to be pruned is at least 2-Dimensional. + + Validation helper to check that the tensor to be pruned is multi- + dimensional, such that the concept of "channels" is well-defined. + + Args: + t (torch.Tensor): tensor representing the parameter to prune + + Raises: + ValueError: if the tensor `t` is not at least 2D. + """ + shape = t.shape + if len(shape) <= 1: + raise ValueError( + "Structured pruning can only be applied to " + "multidimensional tensors. Found tensor of shape " + f"{shape} with {len(shape)} dims" + ) + + +def _compute_nparams_toprune(amount, tensor_size): + r"""Convert the pruning amount from a percentage to absolute value. + + Since amount can be expressed either in absolute value or as a + percentage of the number of units/channels in a tensor, this utility + function converts the percentage to absolute value to standardize + the handling of pruning. + + Args: + amount (int or float): quantity of parameters to prune. + If float, should be between 0.0 and 1.0 and represent the + fraction of parameters to prune. If int, it represents the + absolute number of parameters to prune. + tensor_size (int): absolute number of parameters in the tensor + to prune. 
+ + Returns: + int: the number of units to prune in the tensor + """ + # incorrect type already checked in _validate_pruning_amount_init + if isinstance(amount, numbers.Integral): + return amount + else: + return round(amount * tensor_size) + + +def _validate_pruning_dim(t, dim) -> None: + r"""Validate that the pruning dimension is within the bounds of the tensor dimension. + + Args: + t (torch.Tensor): tensor representing the parameter to prune + dim (int): index of the dim along which we define channels to prune + """ + if dim >= t.dim(): + raise IndexError(f"Invalid index {dim} for tensor of size {t.shape}") + + +def _compute_norm(t, n, dim): + r"""Compute the L_n-norm of a tensor along all dimensions except for the specified dimension. + + The L_n-norm will be computed across all entries in tensor `t` along all dimension + except for the one identified by dim. + Example: if `t` is of shape, say, 3x2x4 and dim=2 (the last dim), + then norm will have Size [4], and each entry will represent the + `L_n`-norm computed using the 3x2=6 entries for each of the 4 channels. + + Args: + t (torch.Tensor): tensor representing the parameter to prune + n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid + entries for argument p in torch.norm + dim (int): dim identifying the channels to prune + + Returns: + norm (torch.Tensor): L_n norm computed across all dimensions except + for `dim`. By construction, `norm.shape = t.shape[-1]`. 
+ """ + # dims = all axes, except for the one identified by `dim` + dims = list(range(t.dim())) + # convert negative indexing + if dim < 0: + dim = dims[dim] + dims.remove(dim) + + norm = torch.norm(t, p=n, dim=dims) + return norm diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..f0530d99f94e0a0aa5fc5821ebefd85513e44c9f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/rnn.py @@ -0,0 +1,606 @@ +import warnings +from collections.abc import Callable, Iterable +from typing import Any, NamedTuple, overload, TypeVar +from typing_extensions import Self + +import torch +from torch import _VF, Tensor + + +__all__ = [ + "PackedSequence", + "invert_permutation", + "pack_padded_sequence", + "pad_packed_sequence", + "pad_sequence", + "unpad_sequence", + "pack_sequence", + "unpack_sequence", +] + +_T = TypeVar("_T") +_R = TypeVar("_R") + + +class PackedSequence_(NamedTuple): + data: torch.Tensor + batch_sizes: torch.Tensor + sorted_indices: torch.Tensor | None + unsorted_indices: torch.Tensor | None + + +def bind(optional: _T | None, fn: Callable[[_T], _R]) -> _R | None: + if optional is None: + return None + return fn(optional) + + +class PackedSequence(PackedSequence_): + r"""Holds the data and list of :attr:`batch_sizes` of a packed sequence. + + All RNN modules accept packed sequences as inputs. + + Note: + Instances of this class should never be created manually. They are meant + to be instantiated by functions like :func:`pack_padded_sequence`. + + Batch sizes represent the number elements at each sequence step in + the batch, not the varying sequence lengths passed to + :func:`pack_padded_sequence`. For instance, given data ``abc`` and ``x`` + the :class:`PackedSequence` would contain data ``axbc`` with + ``batch_sizes=[2,1,1]``. 
+ + Attributes: + data (Tensor): Tensor containing packed sequence + batch_sizes (Tensor): Tensor of integers holding + information about the batch size at each sequence step + sorted_indices (Tensor, optional): Tensor of integers holding how this + :class:`PackedSequence` is constructed from sequences. + unsorted_indices (Tensor, optional): Tensor of integers holding how this + to recover the original sequences with correct order. + + .. note:: + :attr:`data` can be on arbitrary device and of arbitrary dtype. + :attr:`sorted_indices` and :attr:`unsorted_indices` must be ``torch.int64`` + tensors on the same device as :attr:`data`. + + However, :attr:`batch_sizes` should always be a CPU ``torch.int64`` tensor. + + This invariant is maintained throughout :class:`PackedSequence` class, + and all functions that construct a :class:`PackedSequence` in PyTorch + (i.e., they only pass in tensors conforming to this constraint). + """ + + def __new__( + cls, + data: Tensor, + batch_sizes: Tensor | None = None, + sorted_indices: Tensor | None = None, + unsorted_indices: Tensor | None = None, + ) -> Self: + return super().__new__( + cls, + *_packed_sequence_init_args( + data, batch_sizes, sorted_indices, unsorted_indices + ), + ) + + # NOTE [ device and dtype of a PackedSequence ] + # + # See the note above in doc string (starting with ":attr:`data` can be on + # arbitrary device..."). + def pin_memory(self) -> Self: + # Why not convert `batch_sizes`? + # See NOTE [ device and dtype of a PackedSequence ] + return type(self)( + self.data.pin_memory(), + self.batch_sizes, + bind(self.sorted_indices, lambda t: t.pin_memory()), + bind(self.unsorted_indices, lambda t: t.pin_memory()), + ) + + @overload + def to( + self, + dtype: torch.dtype, + non_blocking: bool = ..., + copy: bool = ..., + ) -> Self: ... 
+ + @overload + def to( + self, + device: str | torch.device | int | None = ..., + dtype: torch.dtype | None = ..., + non_blocking: bool = ..., + copy: bool = ..., + ) -> Self: ... + + @overload + def to( + self, + other: Tensor, + non_blocking: bool = ..., + copy: bool = ..., + ) -> Self: ... + + def to(self, *args: Any, **kwargs: Any) -> Self: + r"""Perform dtype and/or device conversion on `self.data`. + + It has similar signature as :meth:`torch.Tensor.to`, except optional + arguments like `non_blocking` and `copy` should be passed as kwargs, + not args, or they will not apply to the index tensors. + + .. note:: + + If the ``self.data`` Tensor already has the correct :class:`torch.dtype` + and :class:`torch.device`, then ``self`` is returned. + Otherwise, returns a copy with the desired configuration. + """ + # Why not convert `batch_sizes`? + # See NOTE [ device and dtype of a PackedSequence ] + data = self.data.to(*args, **kwargs) + if data is self.data: + return self + else: + # Does not forward device or dtype arg/kwargs, device is set from data.device + kwargs = dict( + filter(lambda t: t[0] != "device" and t[0] != "dtype", kwargs.items()) + ) + sorted_indices = bind( + self.sorted_indices, lambda t: t.to(data.device, **kwargs) + ) + unsorted_indices = bind( + self.unsorted_indices, lambda t: t.to(data.device, **kwargs) + ) + return type(self)(data, self.batch_sizes, sorted_indices, unsorted_indices) + + def cuda(self, *args: Any, **kwargs: Any) -> Self: + # Tests to see if 'cuda' should be added to kwargs + ex = torch.tensor((), dtype=self.data.dtype, device=self.data.device).to( + *args, **kwargs + ) + if ex.is_cuda: + return self.to(*args, **kwargs) + kwargs["device"] = "cuda" + return self.to(*args, **kwargs) + + def cpu(self, *args: Any, **kwargs: Any) -> Self: + ex = torch.tensor((), dtype=self.data.dtype, device=self.data.device).to( + *args, **kwargs + ) + if ex.device.type == "cpu": + return self.to(*args, **kwargs) + kwargs["device"] = "cpu" + 
return self.to(*args, **kwargs) + + def double(self) -> Self: + return self.to(dtype=torch.double) + + def float(self) -> Self: + return self.to(dtype=torch.float) + + def half(self) -> Self: + return self.to(dtype=torch.half) + + def long(self) -> Self: + return self.to(dtype=torch.long) + + def int(self) -> Self: + return self.to(dtype=torch.int) + + def short(self) -> Self: + return self.to(dtype=torch.short) + + def char(self) -> Self: + return self.to(dtype=torch.int8) + + def byte(self) -> Self: + return self.to(dtype=torch.uint8) + + @property + def is_cuda(self) -> bool: + r"""Return true if `self.data` stored on a gpu.""" + return self.data.is_cuda + + def is_pinned(self) -> bool: + r"""Return true if `self.data` stored on in pinned memory.""" + return self.data.is_pinned() + + +# TorchScript doesn't support constructors on named tuples, so we use this helper +# method to construct PackedSequence +def _packed_sequence_init_args( + data: Tensor, + batch_sizes: Tensor | None = None, + sorted_indices: Tensor | None = None, + unsorted_indices: Tensor | None = None, +) -> tuple[Tensor, Tensor, Tensor | None, Tensor | None]: + # NB: if unsorted_indices is provided, it should be the inverse permutation + # to sorted_indices. Don't assert it here because the PackedSequence ctor + # should only be used internally. + + if unsorted_indices is None: + unsorted_indices = invert_permutation(sorted_indices) + + # support being called as `PackedSequence(data, batch_sizes, sorted_indices)` + if batch_sizes is not None: + # TODO: Re-enable this check (.type isn't supported in TorchScript) + if batch_sizes.device.type != "cpu": + raise ValueError( + "batch_sizes should always be on CPU. " + "Instances of PackedSequence should never be created manually. " + "They should be instantiated by functions like pack_sequence " + "and pack_padded_sequences in nn.utils.rnn. 
" + "https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_sequence" + ) + return data, batch_sizes, sorted_indices, unsorted_indices + + # support being called as `PackedSequence((data, batch_sizes), *, sorted_indices)` + else: + assert isinstance(data, (list, tuple)) and len(data) == 2 + return data[0], data[1], sorted_indices, unsorted_indices + + +def _packed_sequence_init( + data: Tensor, + batch_sizes: Tensor | None = None, + sorted_indices: Tensor | None = None, + unsorted_indices: Tensor | None = None, +) -> PackedSequence: + data, batch_sizes, sorted_indices, unsorted_indices = _packed_sequence_init_args( + data, batch_sizes, sorted_indices, unsorted_indices + ) + return PackedSequence(data, batch_sizes, sorted_indices, unsorted_indices) + + +def invert_permutation(permutation: Tensor | None) -> Tensor | None: + """Returns the inverse of ``permutation``. + + This is useful for converting between sorted and unsorted indices in + a :class:`~nn.utils.rnn.PackedSequence`. + + Args: + permutation (Tensor, optional): a 1-D tensor of indices to invert + """ + if permutation is None: + return None + output = torch.empty_like(permutation, memory_format=torch.legacy_contiguous_format) + output.scatter_( + 0, permutation, torch.arange(0, permutation.numel(), device=permutation.device) + ) + return output + + +def pack_padded_sequence( + input: Tensor, + lengths: Tensor | list[int], + batch_first: bool = False, + enforce_sorted: bool = True, +) -> PackedSequence: + r"""Packs a Tensor containing padded sequences of variable length. + + :attr:`input` can be of size ``T x B x *`` (if :attr:`batch_first` is ``False``) + or ``B x T x *`` (if :attr:`batch_first` is ``True``) where ``T`` is the length + of the longest sequence, ``B`` is the batch size, and ``*`` is any number of dimensions + (including 0). + + For unsorted sequences, use `enforce_sorted = False`. 
If :attr:`enforce_sorted` is + ``True``, the sequences should be sorted by length in a decreasing order, i.e. + ``input[:,0]`` should be the longest sequence, and ``input[:,B-1]`` the shortest + one. `enforce_sorted = True` is only necessary for ONNX export. + + It is an inverse operation to :func:`pad_packed_sequence`, and hence :func:`pad_packed_sequence` + can be used to recover the underlying tensor packed in :class:`PackedSequence`. + + Note: + This function accepts any input that has at least two dimensions. You + can apply it to pack the labels, and use the output of the RNN with + them to compute the loss directly. A Tensor can be retrieved from + a :class:`PackedSequence` object by accessing its ``.data`` attribute. + + Args: + input (Tensor): padded batch of variable length sequences. + lengths (Tensor or list(int)): list of sequence lengths of each batch + element (must be on the CPU if provided as a tensor). + batch_first (bool, optional): if ``True``, the input is expected in ``B x T x *`` + format, ``T x B x *`` otherwise. Default: ``False``. + enforce_sorted (bool, optional): if ``True``, the input is expected to + contain sequences sorted by length in a decreasing order. If + ``False``, the input will get sorted unconditionally. Default: ``True``. + + .. warning:: + The dim of ``input`` tensor will be truncated if its length larger than + correspond value in ``length``. + + Returns: + a :class:`PackedSequence` object + """ + if not isinstance(lengths, torch.Tensor): + if torch._C._get_tracing_state(): + warnings.warn( + "pack_padded_sequence has been called with a Python list of " + "sequence lengths. 
The tracer cannot track the data flow of Python " + "values, and it will treat them as constants, likely rendering " + "the trace incorrect for any other combination of lengths.", + stacklevel=2, + ) + lengths = torch.as_tensor(lengths, dtype=torch.int64, device="cpu") + else: + lengths = lengths.to(dtype=torch.int64) + + if enforce_sorted: + sorted_indices = None + else: + lengths, sorted_indices = torch.sort(lengths, descending=True) + sorted_indices = sorted_indices.to(input.device) + batch_dim = 0 if batch_first else 1 + input = input.index_select(batch_dim, sorted_indices) + + data, batch_sizes = _VF._pack_padded_sequence(input, lengths, batch_first) + return _packed_sequence_init(data, batch_sizes, sorted_indices, None) + + +def pad_packed_sequence( + sequence: PackedSequence, + batch_first: bool = False, + padding_value: float = 0.0, + total_length: int | None = None, +) -> tuple[Tensor, Tensor]: + r"""Pad a packed batch of variable length sequences. + + It is an inverse operation to :func:`pack_padded_sequence`. + + The returned Tensor's data will be of size ``T x B x *`` (if :attr:`batch_first` is ``False``) + or ``B x T x *`` (if :attr:`batch_first` is ``True``) , where ``T`` is the length of the longest + sequence and ``B`` is the batch size. + + Example: + >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + >>> seq = torch.tensor([[1, 2, 0], [3, 0, 0], [4, 5, 6]]) + >>> lens = [2, 1, 3] + >>> packed = pack_padded_sequence( + ... seq, lens, batch_first=True, enforce_sorted=False + ... ) + >>> packed + PackedSequence(data=tensor([4, 1, 3, 5, 2, 6]), batch_sizes=tensor([3, 2, 1]), + sorted_indices=tensor([2, 0, 1]), unsorted_indices=tensor([1, 2, 0])) + >>> seq_unpacked, lens_unpacked = pad_packed_sequence(packed, batch_first=True) + >>> seq_unpacked + tensor([[1, 2, 0], + [3, 0, 0], + [4, 5, 6]]) + >>> lens_unpacked + tensor([2, 1, 3]) + + .. 
note:: + :attr:`total_length` is useful to implement the + ``pack sequence -> recurrent network -> unpack sequence`` pattern in a + :class:`~torch.nn.Module` wrapped in :class:`~torch.nn.DataParallel`. + See :ref:`this FAQ section ` for + details. + + Args: + sequence (PackedSequence): batch to pad + batch_first (bool, optional): if ``True``, the output will be in ``B x T x *`` + format, ``T x B x *`` otherwise. + padding_value (float, optional): values for padded elements. + total_length (int, optional): if not ``None``, the output will be padded to + have length :attr:`total_length`. This method will throw :class:`ValueError` + if :attr:`total_length` is less than the max sequence length in + :attr:`sequence`. + + Returns: + Tuple of Tensor containing the padded sequence, and a Tensor + containing the list of lengths of each sequence in the batch. + Batch elements will be re-ordered as they were ordered originally when + the batch was passed to ``pack_padded_sequence`` or ``pack_sequence``. + """ + max_seq_length = sequence.batch_sizes.size(0) + if total_length is not None: + if total_length < max_seq_length: + raise ValueError( + "Expected total_length to be at least the length " + "of the longest sequence in input, but got " + f"total_length={total_length} and max sequence length being {max_seq_length}" + ) + max_seq_length = total_length + padded_output, lengths = _VF._pad_packed_sequence( + sequence.data, sequence.batch_sizes, batch_first, padding_value, max_seq_length + ) + unsorted_indices = sequence.unsorted_indices + if unsorted_indices is not None: + batch_dim = 0 if batch_first else 1 + return ( + padded_output.index_select(batch_dim, unsorted_indices), + lengths[unsorted_indices.cpu()], + ) + return padded_output, lengths + + +# NOTE: for JIT-compatibility, we need to be more restrictive here and use specific types instead of Iterable. 
+def pad_sequence( + sequences: Tensor | list[Tensor], + batch_first: bool = False, + padding_value: float = 0.0, + padding_side: str = "right", +) -> Tensor: + r"""Pad a list of variable length Tensors with :attr:`padding_value`. + + ``pad_sequence`` stacks a list of Tensors along a new dimension, and pads them + to equal length. :attr:`sequences` can be list of sequences with size ``L x *``, + where `L` is length of the sequence and ``*`` is any number of dimensions + (including ``0``). If :attr:`batch_first` is ``False``, the output is of size + ``T x B x *``, and ``B x T x *`` otherwise, where ``B`` is the batch size + (the number of elements in :attr:`sequences`), ``T`` is the length of the longest + sequence. + + Example: + >>> from torch.nn.utils.rnn import pad_sequence + >>> a = torch.ones(25, 300) + >>> b = torch.ones(22, 300) + >>> c = torch.ones(15, 300) + >>> pad_sequence([a, b, c]).size() + torch.Size([25, 3, 300]) + + Note: + This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` + where `T` is the length of the longest sequence. This function assumes + trailing dimensions and type of all the Tensors in sequences are same. + + Args: + sequences (list[Tensor]): list of variable length sequences. + batch_first (bool, optional): if ``True``, the output will be in ``B x T x *`` + format, ``T x B x *`` otherwise. + padding_value (float, optional): value for padded elements. Default: ``0``. + padding_side (str, optional): the side to pad the sequences on. + Default: ``'right'``. + + Returns: + Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. 
+ Tensor of size ``B x T x *`` otherwise + """ + if not (torch.jit.is_tracing() or torch.jit.is_scripting()): + # JIT doesn't support `Iterable` + if not isinstance(sequences, Iterable): + msg = ( + "pad_sequence: Expected iterable for input sequences, but got arg of type: " + f"{type(sequences)}" + ) + raise RuntimeError(msg) + + # In JIT context this leads to, + # RuntimeError: cannot statically infer the expected size of a list in this context + sequences = tuple(sequences) # type: ignore[assignment] + else: + # For JIT, we only support Union[Tensor, Tuple[Tensor]] + if isinstance(sequences, torch.Tensor): + sequences = sequences.unbind(0) # type: ignore[assignment] + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + return torch._C._nn.pad_sequence( + sequences, # type: ignore[arg-type] + batch_first, + padding_value, + padding_side, # type: ignore[arg-type] + ) + + +def unpad_sequence( + padded_sequences: Tensor, + lengths: Tensor, + batch_first: bool = False, +) -> list[Tensor]: + r"""Unpad padded Tensor into a list of variable length Tensors. + + ``unpad_sequence`` unstacks padded Tensor into a list of variable length Tensors. + + Example: + >>> from torch.nn.utils.rnn import pad_sequence, unpad_sequence + >>> a = torch.ones(25, 300) + >>> b = torch.ones(22, 300) + >>> c = torch.ones(15, 300) + >>> sequences = [a, b, c] + >>> padded_sequences = pad_sequence(sequences) + >>> lengths = torch.as_tensor([v.size(0) for v in sequences]) + >>> unpadded_sequences = unpad_sequence(padded_sequences, lengths) + >>> torch.allclose(sequences[0], unpadded_sequences[0]) + True + >>> torch.allclose(sequences[1], unpadded_sequences[1]) + True + >>> torch.allclose(sequences[2], unpadded_sequences[2]) + True + + Args: + padded_sequences (Tensor): padded sequences. + lengths (Tensor): length of original (unpadded) sequences. + batch_first (bool, optional): whether batch dimension first or not. 
Default: ``False``. + + Returns: + a list of :class:`Tensor` objects + """ + unpadded_sequences = [] + + if not batch_first: + padded_sequences.transpose_(0, 1) + + max_length = padded_sequences.shape[1] + idx = torch.arange(max_length, device=lengths.device) + + for seq, length in zip(padded_sequences, lengths, strict=True): + mask = idx < length + unpacked_seq = seq[mask] + unpadded_sequences.append(unpacked_seq) + + return unpadded_sequences + + +def pack_sequence( + sequences: list[Tensor], + enforce_sorted: bool = True, +) -> PackedSequence: + r"""Packs a list of variable length Tensors. + + Consecutive call of the next functions: ``pad_sequence``, ``pack_padded_sequence``. + + ``sequences`` should be a list of Tensors of size ``L x *``, where `L` is + the length of a sequence and `*` is any number of trailing dimensions, + including ``0``. + + For unsorted sequences, use `enforce_sorted = False`. If ``enforce_sorted`` + is ``True``, the sequences should be sorted in the order of decreasing length. + ``enforce_sorted = True`` is only necessary for ONNX export. + + Example: + >>> from torch.nn.utils.rnn import pack_sequence + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5]) + >>> c = torch.tensor([6]) + >>> pack_sequence([a, b, c]) + PackedSequence(data=tensor([1, 4, 6, 2, 5, 3]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None) + + Args: + sequences (list[Tensor]): A list of sequences of decreasing length. + enforce_sorted (bool, optional): if ``True``, checks that the input + contains sequences sorted by length in a decreasing order. If + ``False``, this condition is not checked. Default: ``True``. 
+ + Returns: + a :class:`PackedSequence` object + """ + lengths = torch.as_tensor([v.size(0) for v in sequences]) + return pack_padded_sequence( + pad_sequence(sequences), lengths, enforce_sorted=enforce_sorted + ) + + +def unpack_sequence(packed_sequences: PackedSequence) -> list[Tensor]: + r"""Unpack PackedSequence into a list of variable length Tensors. + + ``packed_sequences`` should be a PackedSequence object. + + Example: + >>> from torch.nn.utils.rnn import pack_sequence, unpack_sequence + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5]) + >>> c = torch.tensor([6]) + >>> sequences = [a, b, c] + >>> print(sequences) + [tensor([1, 2, 3]), tensor([4, 5]), tensor([6])] + >>> packed_sequences = pack_sequence(sequences) + >>> print(packed_sequences) + PackedSequence(data=tensor([1, 4, 6, 2, 5, 3]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None) + >>> unpacked_sequences = unpack_sequence(packed_sequences) + >>> print(unpacked_sequences) + [tensor([1, 2, 3]), tensor([4, 5]), tensor([6])] + + Args: + packed_sequences (PackedSequence): A PackedSequence object. 
+ + Returns: + a list of :class:`Tensor` objects + """ + padded_sequences, lengths = pad_packed_sequence(packed_sequences, batch_first=True) + unpacked_sequences = unpad_sequence(padded_sequences, lengths, batch_first=True) + return unpacked_sequences diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/spectral_norm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/spectral_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..a11613a51dac49d5a52d2c55f51734de37bd9e47 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/spectral_norm.py @@ -0,0 +1,368 @@ +# mypy: allow-untyped-defs +"""Spectral Normalization from https://arxiv.org/abs/1802.05957.""" + +from typing import Any, TypeVar + +import torch +import torch.nn.functional as F +from torch.nn.modules import Module + + +__all__ = [ + "SpectralNorm", + "SpectralNormLoadStateDictPreHook", + "SpectralNormStateDictHook", + "spectral_norm", + "remove_spectral_norm", +] + + +class SpectralNorm: + # Invariant before and after each forward call: + # u = F.normalize(W @ v) + # NB: At initialization, this invariant is not enforced + + _version: int = 1 + # At version 1: + # made `W` not a buffer, + # added `v` as a buffer, and + # made eval mode use `W = u @ W_orig @ v` rather than the stored `W`. 
+ name: str + dim: int + n_power_iterations: int + eps: float + + def __init__( + self, + name: str = "weight", + n_power_iterations: int = 1, + dim: int = 0, + eps: float = 1e-12, + ) -> None: + self.name = name + self.dim = dim + if n_power_iterations <= 0: + raise ValueError( + "Expected n_power_iterations to be positive, but " + f"got n_power_iterations={n_power_iterations}" + ) + self.n_power_iterations = n_power_iterations + self.eps = eps + + def reshape_weight_to_matrix(self, weight: torch.Tensor) -> torch.Tensor: + weight_mat = weight + if self.dim != 0: + # permute dim to front + weight_mat = weight_mat.permute( + self.dim, *[d for d in range(weight_mat.dim()) if d != self.dim] + ) + height = weight_mat.size(0) + return weight_mat.reshape(height, -1) + + def compute_weight(self, module: Module, do_power_iteration: bool) -> torch.Tensor: + # NB: If `do_power_iteration` is set, the `u` and `v` vectors are + # updated in power iteration **in-place**. This is very important + # because in `DataParallel` forward, the vectors (being buffers) are + # broadcast from the parallelized module to each module replica, + # which is a new module object created on the fly. And each replica + # runs its own spectral norm power iteration. So simply assigning + # the updated vectors to the module this function runs on will cause + # the update to be lost forever. And the next time the parallelized + # module is replicated, the same randomly initialized vectors are + # broadcast and used! + # + # Therefore, to make the change propagate back, we rely on two + # important behaviors (also enforced via tests): + # 1. `DataParallel` doesn't clone storage if the broadcast tensor + # is already on correct device; and it makes sure that the + # parallelized module is already on `device[0]`. + # 2. If the out tensor in `out=` kwarg has correct shape, it will + # just fill in the values. 
+ # Therefore, since the same power iteration is performed on all + # devices, simply updating the tensors in-place will make sure that + # the module replica on `device[0]` will update the _u vector on the + # parallelized module (by shared storage). + # + # However, after we update `u` and `v` in-place, we need to **clone** + # them before using them to normalize the weight. This is to support + # backproping through two forward passes, e.g., the common pattern in + # GAN training: loss = D(real) - D(fake). Otherwise, engine will + # complain that variables needed to do backward for the first forward + # (i.e., the `u` and `v` vectors) are changed in the second forward. + weight = getattr(module, self.name + "_orig") + u = getattr(module, self.name + "_u") + v = getattr(module, self.name + "_v") + weight_mat = self.reshape_weight_to_matrix(weight) + + if do_power_iteration: + with torch.no_grad(): + for _ in range(self.n_power_iterations): + # Spectral norm of weight equals to `u^T W v`, where `u` and `v` + # are the first left and right singular vectors. + # This power iteration produces approximations of `u` and `v`. 
+ v = F.normalize( + torch.mv(weight_mat.t(), u), dim=0, eps=self.eps, out=v + ) + u = F.normalize(torch.mv(weight_mat, v), dim=0, eps=self.eps, out=u) + if self.n_power_iterations > 0: + # See above on why we need to clone + u = u.clone(memory_format=torch.contiguous_format) + v = v.clone(memory_format=torch.contiguous_format) + + sigma = torch.dot(u, torch.mv(weight_mat, v)) + weight = weight / sigma + return weight + + def remove(self, module: Module) -> None: + with torch.no_grad(): + weight = self.compute_weight(module, do_power_iteration=False) + delattr(module, self.name) + delattr(module, self.name + "_u") + delattr(module, self.name + "_v") + delattr(module, self.name + "_orig") + module.register_parameter(self.name, torch.nn.Parameter(weight.detach())) + + def __call__(self, module: Module, inputs: Any) -> None: + setattr( + module, + self.name, + self.compute_weight(module, do_power_iteration=module.training), + ) + + def _solve_v_and_rescale(self, weight_mat, u, target_sigma): + # Tries to returns a vector `v` s.t. `u = F.normalize(W @ v)` + # (the invariant at top of this class) and `u @ W @ v = sigma`. + # This uses pinverse in case W^T W is not invertible. 
+ v = torch.linalg.multi_dot( + [weight_mat.t().mm(weight_mat).pinverse(), weight_mat.t(), u.unsqueeze(1)] + ).squeeze(1) + return v.mul_(target_sigma / torch.dot(u, torch.mv(weight_mat, v))) + + @staticmethod + def apply( + module: Module, name: str, n_power_iterations: int, dim: int, eps: float + ) -> "SpectralNorm": + for hook in module._forward_pre_hooks.values(): + if isinstance(hook, SpectralNorm) and hook.name == name: + raise RuntimeError( + f"Cannot register two spectral_norm hooks on the same parameter {name}" + ) + + fn = SpectralNorm(name, n_power_iterations, dim, eps) + weight = module._parameters[name] + if weight is None: + raise ValueError( + f"`SpectralNorm` cannot be applied as parameter `{name}` is None" + ) + if isinstance(weight, torch.nn.parameter.UninitializedParameter): + raise ValueError( + "The module passed to `SpectralNorm` can't have uninitialized parameters. " + "Make sure to run the dummy forward before applying spectral normalization" + ) + + with torch.no_grad(): + weight_mat = fn.reshape_weight_to_matrix(weight) + + h, w = weight_mat.size() + # randomly initialize `u` and `v` + u = F.normalize(weight.new_empty(h).normal_(0, 1), dim=0, eps=fn.eps) + v = F.normalize(weight.new_empty(w).normal_(0, 1), dim=0, eps=fn.eps) + + delattr(module, fn.name) + module.register_parameter(fn.name + "_orig", weight) + # We still need to assign weight back as fn.name because all sorts of + # things may assume that it exists, e.g., when initializing weights. + # However, we can't directly assign as it could be an nn.Parameter and + # gets added as a parameter. Instead, we register weight.data as a plain + # attribute. 
+ setattr(module, fn.name, weight.data) + module.register_buffer(fn.name + "_u", u) + module.register_buffer(fn.name + "_v", v) + + module.register_forward_pre_hook(fn) + module._register_state_dict_hook(SpectralNormStateDictHook(fn)) + module._register_load_state_dict_pre_hook(SpectralNormLoadStateDictPreHook(fn)) + return fn + + +# This is a top level class because Py2 pickle doesn't like inner class nor an +# instancemethod. +class SpectralNormLoadStateDictPreHook: + # See docstring of SpectralNorm._version on the changes to spectral_norm. + def __init__(self, fn) -> None: + self.fn = fn + + # For state_dict with version None, (assuming that it has gone through at + # least one training forward), we have + # + # u = F.normalize(W_orig @ v) + # W = W_orig / sigma, where sigma = u @ W_orig @ v + # + # To compute `v`, we solve `W_orig @ x = u`, and let + # v = x / (u @ W_orig @ x) * (W / W_orig). + def __call__( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) -> None: + fn = self.fn + version = local_metadata.get("spectral_norm", {}).get( + fn.name + ".version", None + ) + if version is None or version < 1: + weight_key = prefix + fn.name + if ( + version is None + and all(weight_key + s in state_dict for s in ("_orig", "_u", "_v")) + and weight_key not in state_dict + ): + # Detect if it is the updated state dict and just missing metadata. + # This could happen if the users are crafting a state dict themselves, + # so we just pretend that this is the newest. 
+ return + has_missing_keys = False + for suffix in ("_orig", "", "_u"): + key = weight_key + suffix + if key not in state_dict: + has_missing_keys = True + if strict: + missing_keys.append(key) + if has_missing_keys: + return + with torch.no_grad(): + weight_orig = state_dict[weight_key + "_orig"] + weight = state_dict.pop(weight_key) + sigma = (weight_orig / weight).mean() + weight_mat = fn.reshape_weight_to_matrix(weight_orig) + u = state_dict[weight_key + "_u"] + v = fn._solve_v_and_rescale(weight_mat, u, sigma) + state_dict[weight_key + "_v"] = v + + +# This is a top level class because Py2 pickle doesn't like inner class nor an +# instancemethod. +class SpectralNormStateDictHook: + # See docstring of SpectralNorm._version on the changes to spectral_norm. + def __init__(self, fn) -> None: + self.fn = fn + + def __call__(self, module, state_dict, prefix, local_metadata) -> None: + if "spectral_norm" not in local_metadata: + local_metadata["spectral_norm"] = {} + key = self.fn.name + ".version" + if key in local_metadata["spectral_norm"]: + raise RuntimeError(f"Unexpected key in metadata['spectral_norm']: {key}") + local_metadata["spectral_norm"][key] = self.fn._version + + +T_module = TypeVar("T_module", bound=Module) + + +def spectral_norm( + module: T_module, + name: str = "weight", + n_power_iterations: int = 1, + eps: float = 1e-12, + dim: int | None = None, +) -> T_module: + r"""Apply spectral normalization to a parameter in the given module. + + .. math:: + \mathbf{W}_{SN} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})}, + \sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2} + + Spectral normalization stabilizes the training of discriminators (critics) + in Generative Adversarial Networks (GANs) by rescaling the weight tensor + with spectral norm :math:`\sigma` of the weight matrix calculated using + power iteration method. 
If the dimension of the weight tensor is greater + than 2, it is reshaped to 2D in power iteration method to get spectral + norm. This is implemented via a hook that calculates spectral norm and + rescales weight before every :meth:`~Module.forward` call. + + See `Spectral Normalization for Generative Adversarial Networks`_ . + + .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957 + + Args: + module (nn.Module): containing module + name (str, optional): name of weight parameter + n_power_iterations (int, optional): number of power iterations to + calculate spectral norm + eps (float, optional): epsilon for numerical stability in + calculating norms + dim (int, optional): dimension corresponding to number of outputs, + the default is ``0``, except for modules that are instances of + ConvTranspose{1,2,3}d, when it is ``1`` + + Returns: + The original module with the spectral norm hook + + .. note:: + This function has been reimplemented as + :func:`torch.nn.utils.parametrizations.spectral_norm` using the new + parametrization functionality in + :func:`torch.nn.utils.parametrize.register_parametrization`. Please use + the newer version. This function will be deprecated in a future version + of PyTorch. + + Example:: + + >>> m = spectral_norm(nn.Linear(20, 40)) + >>> m + Linear(in_features=20, out_features=40, bias=True) + >>> m.weight_u.size() + torch.Size([40]) + + """ + if dim is None: + if isinstance( + module, + ( + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, + ), + ): + dim = 1 + else: + dim = 0 + SpectralNorm.apply(module, name, n_power_iterations, dim, eps) + # pyrefly: ignore [bad-return] + return module + + +def remove_spectral_norm(module: T_module, name: str = "weight") -> T_module: + r"""Remove the spectral normalization reparameterization from a module. 
+ + Args: + module (Module): containing module + name (str, optional): name of weight parameter + + Example: + >>> m = spectral_norm(nn.Linear(40, 10)) + >>> remove_spectral_norm(m) + """ + for k, hook in module._forward_pre_hooks.items(): + if isinstance(hook, SpectralNorm) and hook.name == name: + hook.remove(module) + del module._forward_pre_hooks[k] + break + else: + raise ValueError(f"spectral_norm of '{name}' not found in {module}") + + for k, hook in module._state_dict_hooks.items(): + if isinstance(hook, SpectralNormStateDictHook) and hook.fn.name == name: + del module._state_dict_hooks[k] + break + + for k, hook in module._load_state_dict_pre_hooks.items(): + if isinstance(hook, SpectralNormLoadStateDictPreHook) and hook.fn.name == name: + del module._load_state_dict_pre_hooks[k] + break + + return module diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/stateless.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/stateless.py new file mode 100644 index 0000000000000000000000000000000000000000..70f0afdeb52923a029a1843e1f2cfc702ab7473b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/stateless.py @@ -0,0 +1,279 @@ +# mypy: allow-untyped-defs +import contextlib +from typing import Any +from typing_extensions import deprecated + +import torch +from torch import Tensor +from torch.nn.utils._named_member_accessor import NamedMemberAccessor + + +__all__ = ["functional_call"] + + +def _untie_named_tensors_map( + module: "torch.nn.Module", + parameters_and_buffers: dict[str, Tensor], +) -> dict[str, Tensor]: + """ + Unties all tied tensors in the module to parameters_and_buffers. + + This function returns a new untied_parameters_and_buffers dictionary and leave the original + untied_parameters_and_buffers dictionary unchanged. It adds new (missing) keys for tied tensors + in the module to untied_parameters_and_buffers. 
The value of the new key is the user-given value + in the original parameters_and_buffers dictionary. + + If there are more than one user-given values for the same tied tensor, it will raise an error. + + For example, if the module has two tied weights self.foo and self.tied_foo and the user passes + {'foo': foo_value, ...}, this will return {'foo': foo_value, 'tied_foo': foo_value, ...}. If the + user passes {'foo': foo_value, 'tied_foo': tied_foo_value, ...}, it will raise an error. If the + user passes {'foo': foo_value, 'tied_foo': foo_value, ...}, it will not raise an error. + + Args: + module (torch.nn.Module): the module to determine which tensors are tied. + parameters_and_buffers (Dict[str, Tensor]): a map of {name: tensor} for reparamaterizing the module. + + Returns: + A new untied version of the parameters_and_buffers dictionary. + + Raises: + ValueError: if there are more than one user-given values for the same tied tensor. + """ + # A map of {name: tensor} for all tensors (including tied ones) in the module. + all_named_tensors: dict[str, Tensor] = {} + all_named_tensors.update(module.named_parameters(remove_duplicate=False)) + all_named_tensors.update(module.named_buffers(remove_duplicate=False)) + + # A map of {tensor: set(all_tied_names)} for all tensor names in the module. + tensor_to_tied_names_map: dict[Tensor, set[str]] = {} + for name, tensor in all_named_tensors.items(): + if tensor not in tensor_to_tied_names_map: + tensor_to_tied_names_map[tensor] = set() + tensor_to_tied_names_map[tensor].add(name) + + # A map of {tied_name: set(all_tied_names)} for all tensor names in the module. + # If a name is not tied, it will not be in this map. + tied_names_map: dict[str, set[str]] = {} + for tied_names in tensor_to_tied_names_map.values(): + if len(tied_names) > 1: + for tied_name in tied_names: + tied_names_map[tied_name] = tied_names + + # Make sure the user didn't pass multiple values for the same tied tensor. 
+ given_names = set(parameters_and_buffers.keys()) + # same as given_names.intersection(tied_names_map.keys()) but dynamo can't + # handle that + given_names_for_tied_tensors: set[str] = set() + for name in given_names: + if name in tied_names_map: + given_names_for_tied_tensors.add(name) + + for given_name in given_names_for_tied_tensors: + tied_names = tied_names_map[given_name] + if ( + # Detect if there are multiple keys present for the same tied tensor. + len(tied_names.intersection(given_names_for_tied_tensors)) > 1 + # Only raise an error if the user passed multiple values for the same tied tensor. + # If all given values are the same, don't raise. + and len({parameters_and_buffers[tied_name] for tied_name in tied_names}) + != 1 + ): + raise ValueError( + f"functional_call got multiple values for keys {sorted(tied_names)}, " + f"which are tied. Consider using tie_weights=False" + ) + + # Untie the given named tensor map + # Make a copy for not modifying the original dict + untied_parameters_and_buffers = parameters_and_buffers.copy() + for given_name in given_names_for_tied_tensors: + for tied_name in tied_names_map[given_name]: + untied_parameters_and_buffers[tied_name] = parameters_and_buffers[ + given_name + ] + return untied_parameters_and_buffers + + +@contextlib.contextmanager +def _reparametrize_module( + module: "torch.nn.Module", + parameters_and_buffers: dict[str, Tensor], + tie_weights: bool = False, + strict: bool = False, + stack_weights: bool = False, +): + if tie_weights: + untied_parameters_and_buffers = _untie_named_tensors_map( + module, parameters_and_buffers + ) + else: + untied_parameters_and_buffers = parameters_and_buffers + + accessor = NamedMemberAccessor(module) + if strict: + missing_keys, unexpected_keys = accessor.check_keys( + untied_parameters_and_buffers + ) + error_msgs = [] + if len(unexpected_keys) > 0: + error_msgs.append( + f"Unexpected key(s): {', '.join(map(repr, unexpected_keys))}." 
+ ) + if len(missing_keys) > 0: + error_msgs.append(f"Missing key(s): {', '.join(map(repr, missing_keys))}.") + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in reparametrizing for {}:\n\t{}".format( + module._get_name(), "\n\t".join(error_msgs) + ) + ) + + orig_parameters_and_buffers: dict[str, Tensor] = {} + try: + orig_parameters_and_buffers, _ = accessor.swap_tensors_dict( + untied_parameters_and_buffers, allow_missing=True + ) + yield + finally: + if stack_weights: + # When stacking is enabled, we will restore the weights in LIFO order. + orig_parameters_and_buffers = dict( + reversed(orig_parameters_and_buffers.items()) + ) + new_parameters_and_buffers, _ = accessor.swap_tensors_dict( + orig_parameters_and_buffers, allow_missing=True + ) + # Sometimes the module is not completely stateless and has some in-place modifications on + # the _parameters and _buffers dictionaries. + # Write the changed parameters and buffers back to the original dict. + parameters_and_buffers.update( + { + k: new_parameters_and_buffers[k] + for k in parameters_and_buffers + if k in new_parameters_and_buffers + } + ) + + +@deprecated( + "`torch.nn.utils.stateless.functional_call` is deprecated as of PyTorch 2.0 " + "and will be removed in a future version of PyTorch. " + "Please use `torch.func.functional_call` instead which is a drop-in replacement.", + category=FutureWarning, +) +def functional_call( + module: "torch.nn.Module", + parameters_and_buffers: dict[str, Tensor], + args: Any | tuple | None = None, + kwargs: dict[str, Any] | None = None, + *, + tie_weights: bool = True, + strict: bool = False, +): + r"""Perform a functional call on the module by replacing the module parameters and buffers with the provided ones. + + .. warning:: + + This API is deprecated as of PyTorch 2.0 and will be removed in a future + version of PyTorch. Please use :func:`torch.func.functional_call` instead, + which is a drop-in replacement for this API. + + .. 
note:: If the module has active parametrizations, passing a value in the + :attr:`parameters_and_buffers` argument with the name set to the regular parameter + name will completely disable the parametrization. + If you want to apply the parametrization function to the value passed + please set the key as ``{submodule_name}.parametrizations.{parameter_name}.original``. + + .. note:: If the module performs in-place operations on parameters/buffers, these will be reflected + in the `parameters_and_buffers` input. + + Example:: + + >>> a = {'foo': torch.zeros(())} + >>> # xdoctest: +SKIP + >>> mod = Foo() # does self.foo = self.foo + 1 + >>> print(mod.foo) # tensor(0.) + >>> functional_call(mod, a, torch.ones(())) + >>> print(mod.foo) # tensor(0.) + >>> print(a['foo']) # tensor(1.) + + .. note:: If the module has tied weights, whether or not functional_call respects the tying is determined by the + tie_weights flag. + + Example:: + + >>> a = {'foo': torch.zeros(())} + >>> # xdoctest: +SKIP + >>> mod = Foo() # has both self.foo and self.foo_tied which are tied. Returns x + self.foo + self.foo_tied + >>> print(mod.foo) # tensor(1.) + >>> mod(torch.zeros(())) # tensor(2.) + >>> functional_call(mod, a, torch.zeros(())) # tensor(0.) since it will change self.foo_tied too + >>> functional_call(mod, a, torch.zeros(()), tie_weights=False) # tensor(1.)--self.foo_tied is not updated + >>> new_a = {'foo': torch.zeros(()), 'foo_tied': torch.zeros(())} + >>> functional_call(mod, new_a, torch.zeros()) # tensor(0.) + + Args: + module (torch.nn.Module): the module to call + parameters_and_buffers (dict of str and Tensor): the parameters that will be used in + the module call. + args (Any or tuple): arguments to be passed to the module call. If not a tuple, considered a single argument. 
+ kwargs (dict): keyword arguments to be passed to the module call + tie_weights (bool, optional): If True, then parameters and buffers tied in the original model will be treated as + tied in the reparamaterized version. Therefore, if True and different values are passed for the tied + parameters and buffers, it will error. If False, it will not respect the originally tied parameters and + buffers unless the values passed for both weights are the same. Default: True. + strict (bool, optional): If True, then the parameters and buffers passed in must match the parameters and + buffers in the original module. Therefore, if True and there are any missing or unexpected keys, it will + error. Default: False. + + Returns: + Any: the result of calling ``module``. + """ + return _functional_call( + module, + parameters_and_buffers, + args, + kwargs, + tie_weights=tie_weights, + strict=strict, + ) + + +def _functional_call( + module: "torch.nn.Module", + parameters_and_buffers: dict[str, Tensor], + args: Any | tuple | None = None, + kwargs: dict[str, Any] | None = None, + *, + tie_weights: bool = True, + strict: bool = False, +): + # TODO allow kwargs such as unsafe and others for parametrization + if ( + torch.jit.is_tracing() + or torch.jit.is_scripting() + or isinstance( + module, + ( + torch.jit.RecursiveScriptModule, + torch.jit.ScriptModule, + torch.jit.ScriptFunction, + ), + ) + ): + raise RuntimeError("The stateless API can't be used with Jitted modules") + if isinstance(module, torch.nn.DataParallel): + raise RuntimeError( + "The stateless API can't be used with nn.DataParallel module" + ) + if kwargs is None: + kwargs = {} + if args is None: + args = () + elif not isinstance(args, tuple): + args = (args,) + with _reparametrize_module( + module, parameters_and_buffers, tie_weights=tie_weights, strict=strict + ): + return module(*args, **kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..7b336e8b8c08e59b2ee3d12ab481bacb4b6aa33d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py @@ -0,0 +1,165 @@ +# mypy: allow-untyped-defs +r"""Weight Normalization from https://arxiv.org/abs/1602.07868.""" + +from typing import Any, TypeVar +from typing_extensions import deprecated + +from torch import _weight_norm, norm_except_dim +from torch.nn.modules import Module +from torch.nn.parameter import Parameter, UninitializedParameter + + +__all__ = ["WeightNorm", "weight_norm", "remove_weight_norm"] + + +class WeightNorm: + name: str + dim: int + + def __init__(self, name: str, dim: int) -> None: + if dim is None: + dim = -1 + self.name = name + self.dim = dim + + # TODO Make return type more specific + def compute_weight(self, module: Module) -> Any: + g = getattr(module, self.name + "_g") + v = getattr(module, self.name + "_v") + return _weight_norm(v, g, self.dim) + + @staticmethod + @deprecated( + "`torch.nn.utils.weight_norm` is deprecated " + "in favor of `torch.nn.utils.parametrizations.weight_norm`.", + category=FutureWarning, + ) + def apply(module, name: str, dim: int) -> "WeightNorm": + for hook in module._forward_pre_hooks.values(): + if isinstance(hook, WeightNorm) and hook.name == name: + raise RuntimeError( + f"Cannot register two weight_norm hooks on the same parameter {name}" + ) + + if dim is None: + dim = -1 + + fn = WeightNorm(name, dim) + + weight = getattr(module, name) + if isinstance(weight, UninitializedParameter): + raise ValueError( + "The module passed to `WeightNorm` can't have uninitialized parameters. 
" + "Make sure to run the dummy forward before applying weight normalization" + ) + # remove w from parameter list + del module._parameters[name] + + # add g and v as new parameters and express w as g/||v|| * v + module.register_parameter( + name + "_g", Parameter(norm_except_dim(weight, 2, dim).data) + ) + module.register_parameter(name + "_v", Parameter(weight.data)) + setattr(module, name, fn.compute_weight(module)) + + # recompute weight before every forward() + module.register_forward_pre_hook(fn) + + return fn + + def remove(self, module: Module) -> None: + weight = self.compute_weight(module) + delattr(module, self.name) + del module._parameters[self.name + "_g"] + del module._parameters[self.name + "_v"] + setattr(module, self.name, Parameter(weight.data)) + + def __call__(self, module: Module, inputs: Any) -> None: + setattr(module, self.name, self.compute_weight(module)) + + +T_module = TypeVar("T_module", bound=Module) + + +def weight_norm(module: T_module, name: str = "weight", dim: int = 0) -> T_module: + r"""Apply weight normalization to a parameter in the given module. + + .. math:: + \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} + + Weight normalization is a reparameterization that decouples the magnitude + of a weight tensor from its direction. This replaces the parameter specified + by :attr:`name` (e.g. ``'weight'``) with two parameters: one specifying the magnitude + (e.g. ``'weight_g'``) and one specifying the direction (e.g. ``'weight_v'``). + Weight normalization is implemented via a hook that recomputes the weight + tensor from the magnitude and direction before every :meth:`~Module.forward` + call. + + By default, with ``dim=0``, the norm is computed independently per output + channel/plane. To compute a norm over the entire weight tensor, use + ``dim=None``. + + See https://arxiv.org/abs/1602.07868 + + .. warning:: + + This function is deprecated. 
Use :func:`torch.nn.utils.parametrizations.weight_norm` + which uses the modern parametrization API. The new ``weight_norm`` is compatible + with ``state_dict`` generated from old ``weight_norm``. + + Migration guide: + + * The magnitude (``weight_g``) and direction (``weight_v``) are now expressed + as ``parametrizations.weight.original0`` and ``parametrizations.weight.original1`` + respectively. If this is bothering you, please comment on + https://github.com/pytorch/pytorch/issues/102999 + + * To remove the weight normalization reparametrization, use + :func:`torch.nn.utils.parametrize.remove_parametrizations`. + + * The weight is no longer recomputed once at module forward; instead, it will + be recomputed on every access. To restore the old behavior, use + :func:`torch.nn.utils.parametrize.cached` before invoking the module + in question. + + Args: + module (Module): containing module + name (str, optional): name of weight parameter + dim (int, optional): dimension over which to compute the norm + + Returns: + The original module with the weight norm hook + + Example:: + + >>> m = weight_norm(nn.Linear(20, 40), name='weight') + >>> m + Linear(in_features=20, out_features=40, bias=True) + >>> m.weight_g.size() + torch.Size([40, 1]) + >>> m.weight_v.size() + torch.Size([40, 20]) + + """ + WeightNorm.apply(module, name, dim) + return module + + +def remove_weight_norm(module: T_module, name: str = "weight") -> T_module: + r"""Remove the weight normalization reparameterization from a module. + + Args: + module (Module): containing module + name (str, optional): name of weight parameter + + Example: + >>> m = weight_norm(nn.Linear(20, 40)) + >>> remove_weight_norm(m) + """ + for k, hook in module._forward_pre_hooks.items(): + if isinstance(hook, WeightNorm) and hook.name == name: + hook.remove(module) + del module._forward_pre_hooks[k] + return module + + raise ValueError(f"weight_norm of '{name}' not found in {module}")